Mike Mlawrence95

## goodreads_rss_to_yaml.py
import feedparser  # via conda install anaconda::feedparser
import yaml
from bs4 import BeautifulSoup

# RSS feed URL for a Goodreads shelf. The <X...> segments are redacted
# placeholders (presumably account id, feed key and shelf name -- confirm)
# and have to be replaced with real values.
_GOODREADS_RSS_STREAM_URL = "https://www.goodreads.com/review/list_rss/<XXXXXXXXXX>?key=<XXXXXXXXXXXXXX>&shelf=<XXXX>"
# The existing YAML lives here. It is used to ensure the new dump only
# contains unique values.
_EXISTING_YAML_PATH = "docs/_data/books.yml"
# Path for the new YAML dump (the code that writes it is not part of this
# excerpt).
_NEW_YAML_PATH = "books.yaml"


## read_csv_from_aws_s3_targz.python
# checked against python 3.7.3, pandas 0.24.2, s3fs 0.4.2
import tarfile
import io
import s3fs

import pandas as pd

# Location of the .tar.gz archive in S3.
# (Plain string: the previous f-prefix had no placeholders to interpolate.)
tar_path = "s3://my-bucket/debug.tar.gz"
# Path of the CSV member inside the tar file.
metadata_path = "debug/metadata.csv"

## md5_decorator.py
import pandas as pd
from hashlib import md5

def text_to_hash(text):
    """
    Return the hexadecimal MD5 digest of `text`.

    The string is encoded as UTF-8 before hashing, so equal strings always
    map to the same 32-character lowercase hex digest.
    """
    encoded = text.encode("utf8")
    digest = md5(encoded)
    return digest.hexdigest()

def add_hash(column_name="document"):
    """
    Decorator. Wraps a function that returns a dataframe, must have column_name in columns.


## mp3_to_plot.py
import matplotlib.pyplot as plt
import soundfile as sf

from pydub import AudioSegment

# Goal: convert the MP3 at `source` into a WAV file at `dest`.
source = "./recordings/test.mp3"  # input .mp3 path
dest = "./recordings/test.wav"  # output .wav path

# conversion - check!

## get_timestamp.py
import time

def get_timestamp():
    """
    Return the current GMT date as a 'month_day_year' string.

    NOTE(review): the return value is inferred from the doctest below; the
    code that formats and returns the string is not visible in this excerpt.
    The example output depends on the date the function is called.

    >>> get_timestamp()
    '3_31_2020'
    """
    # Current time as a struct_time expressed in UTC (GMT).
    t = time.gmtime()

## open_files.py
import json
import pickle

def openJSON(path):
    """
    Parse the JSON file at 'path'.

    The file is opened for reading inside a `with` block, so the handle is
    closed even if parsing raises.

    NOTE(review): nothing after `json.load` is visible in this excerpt --
    confirm that the parsed `data` is returned to the caller.
    """

    with open(path, 'r') as File:
        # Deserialize the whole file into Python objects.
        data = json.load(File)

## pyplot_set_params.py
import matplotlib.pyplot as plt

# Plot defaults: extra-large text for legends, axis labels, titles and tick
# labels, plus a 15x15 default figure size.
params = {
    'legend.fontsize': 'x-large',
    'figure.figsize': (15, 15),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
}

# Apply the overrides to matplotlib's global runtime configuration.
plt.rcParams.update(params)

## make_old_pickles_openable.py
import pickle
import dill

# Register the type names 'SliceType' and 'ObjectType' in dill's private
# reverse type map, pointing them at the builtins `slice` and `object`.
# NOTE(review): presumably these are the names that older pickles refer to,
# which lets dill resolve them on load -- confirm against the dill version
# in use, since `_reverse_typemap` is a private attribute.
dill._dill._reverse_typemap['SliceType']  = slice
dill._dill._reverse_typemap['ObjectType'] = object

## clone_private_repo.txt
git clone https://[insert username]:[insert password]@github.com/[insert organisation name]/[insert repo name].git

## get_word_counts.py
import numpy as np
import pandas as pd

def get_word_counts(document: str) -> pd.DataFrame:
    """
    Turns a document into a dataframe of word, counts

    Use preprocessing/lowercasing before this step for best results.

    If passing many documents, use document = '\n'.join(iterable_of_documents)
	import feedparser # via conda install anaconda::feedparser
	import yaml
	from bs4 import BeautifulSoup

	_GOODREADS_RSS_STREAM_URL = "https://www.goodreads.com/review/list_rss/<XXXXXXXXXX>?key=<XXXXXXXXXXXXXX>&shelf=<XXXX>"
	# Old yaml lives here. We'll use it to ensure our new dump has unique values.
	_EXISTING_YAML_PATH = "docs/_data/books.yml"
	_NEW_YAML_PATH = "books.yaml"
	# checked against python 3.7.3, pandas 0.24.2, s3fs 0.4.2
	import tarfile
	import io
	import s3fs

	import pandas as pd

	tar_path = f"s3://my-bucket/debug.tar.gz" # path in s3
	metadata_path = "debug/metadata.csv" # path inside of the tar file
	import pandas as pd
	from hashlib import md5

	def text_to_hash(text):
	return md5(text.encode("utf8")).hexdigest()

	def add_hash(column_name="document"):
	"""
	Decorator. Wraps a function that returns a dataframe, must have column_name in columns.
	import matplotlib.pyplot as plt
	import soundfile as sf

	from pydub import AudioSegment

	# we want to convert source, mp3, into dest, a .wav file
	source = "./recordings/test.mp3"
	dest = "./recordings/test.wav"

	# conversion - check!
	import time

	def get_timestamp():
	"""
	Print the date in m/d/y format, GMT

	>>> get_timestamp()
	'3_31_2020'
	"""
	t = time.gmtime()
	import json
	import pickle

	def openJSON(path):
	"""
	Safely opens json file at 'path'
	"""

	with open(path, 'r') as File:
	data = json.load(File)
	import matplotlib.pyplot as plt

	params = {'legend.fontsize': 'x-large',
	'figure.figsize': (15, 15),
	'axes.labelsize': 'x-large',
	'axes.titlesize': 'x-large',
	'xtick.labelsize': 'x-large',
	'ytick.labelsize': 'x-large'}

	plt.rcParams.update(params)
	import pickle
	import dill

	dill._dill._reverse_typemap['SliceType'] = slice
	dill._dill._reverse_typemap['ObjectType'] = object
	import numpy as np
	import pandas as pd

	def get_word_counts(document: str) -> pd.DataFrame:
	"""
	Turns a document into a dataframe of word, counts

	Use preprocessing/lowercasing before this step for best results.

	If passing many documents, use document = '\n'.join(iterable_of_documents)