This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import parse_file as dp | |
# Define paths to the test files, one per file type.
txt_path = 'test_txt.txt'
docx_path = 'test_docx.docx'
pdf_path = 'test_pdf.pdf'
html_path = 'test_html.html'
pptx_path = 'test_pptx.pptx'

# Every test path gathered in a single list, in the order defined above.
file_paths = [txt_path, docx_path, pdf_path, html_path, pptx_path]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import io | |
| from docx import Document | |
| from pdfminer3.layout import LAParams, LTTextBox | |
| from pdfminer3.pdfpage import PDFPage | |
| from pdfminer3.pdfinterp import PDFResourceManager | |
| from pdfminer3.pdfinterp import PDFPageInterpreter | |
| from pdfminer3.converter import PDFPageAggregator | |
| from pdfminer3.converter import TextConverter |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| from docx import Document | |
class DocParser:
    """Hand a document to the parser callable selected by ``get_format``."""

    def parse(self, document):
        """Parse ``document`` and return the selected parser's result.

        ``get_format`` is looked up as a module-level name and called with
        ``document``; the object it returns is then called with the same
        ``document`` and that call's result is returned unchanged.
        """
        parser = get_format(document)
        return parser(document)
| def get_format(document): | |
| format = os.path.splitext(document)[-1] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def serialise_file(document,format): | |
| if format =='txt': | |
| with open(document, 'r') as file: | |
| string = file.read().replace('\n', ' ') | |
| return string | |
| elif format == 'docx' | |
| #docx parsing code here | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| from textblob import TextBlob | |
def sentiment_polarity(string: str) -> float:
    """Return the polarity component of TextBlob's sentiment for ``string``."""
    # ``sentiment`` is a (polarity, subjectivity) named tuple; the original
    # code read index 0, which is the polarity field.
    return TextBlob(string).sentiment.polarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import pandas as pd | |
| from textacy import extract, make_spacy_doc | |
# Load the entire article text
with open("news_article.txt", "r") as file:
    # Newlines are removed outright (not replaced by spaces).
    data = file.read().replace("\n", "")

# Swap non-breaking spaces (U+00A0) for ordinary spaces.
article = data.replace(u"\xa0", u" ")
| # Create doc object |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from itertools import count | |
| import matplotlib.pyplot as plt | |
| import networkx as nx | |
| import numpy as np | |
| import pandas as pd | |
| import textacy | |
# Read the whole article into one string with the newlines removed.
with open("news_article.txt", "r") as file:
    data = file.read().replace("\n", "")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import List, Tuple | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| from matplotlib.axes import Axes | |
| from textacy import extract, make_spacy_doc | |
| def decompose_keyterms(keyterm_list: List[str]) -> Tuple: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape profile and get recent posts
# NOTE(review): Profile is not imported in the visible lines; presumably
# `from instascrape import Profile` -- confirm.
natgeo = Profile('natgeo')
natgeo.scrape()
recents = natgeo.get_recent_posts()

# Filter list to separate images from videos
recent_photos = [post for post in recents if not post.is_video]
| #Save photos in a loop | |
| for i, post in enumerate(recent_photos): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from instascrape import Hashtag | |
# Substitute 'ad' with the word you
# want to search for (as a string)
hashtag = Hashtag('ad')

# Scrape the hashtag
hashtag.scrape()

# Get list of the recent posts
recents = hashtag.get_recent_posts()
NewerOlder