Aditya1001001/add_names_and_tickers.py

## add_names_and_tickers.py
# Register every company name as a tokenized phrase under the "COMPANY" rule.
patterns = list(map(nlp.make_doc, names))
matcher.add("COMPANY", patterns)

# Register every ticker symbol the same way under the "SYMBOL" rule.
patterns = list(map(nlp.make_doc, data['Symbol']))
matcher.add("SYMBOL", patterns)

## clean_names.py
from cleanco import basename
# Run cleanco's basename over each company name twice in a row
# (presumably so that a second trailing legal suffix is stripped as well).
combined_list['Cleaned Name'] = (
    combined_list['Company Name'].apply(basename).apply(basename)
)

# Pool the original and the cleaned spellings into one de-duplicated Series.
names = pd.concat(
    [combined_list['Company Name'], combined_list['Cleaned Name']],
    ignore_index=True,
).drop_duplicates()

## combine_lists.py
# Stack the two listings under a fresh 0..n-1 index, then drop rows that are
# exact duplicates of an earlier row.
combined_list = pd.concat([data, SP500], ignore_index=True)
combined_list = combined_list.drop_duplicates()

## correct_name.py
# Manual overrides: an entry in `names` that equals a key is replaced by the
# mapped value. Most keys look like the part of a hyphenated company name
# before the hyphen (presumably what the cleaning step left behind).
# NOTE(review): the "ROBO" value starts with a space and "Mid" maps to
# "Microchip Technology" -- both are reproduced as-is; confirm they are intended.
name_corrections = {
    "A": "A-Mark",
    "Federal": "Federal-Mogul",
    "Global": "Global-Tech Advanced Innovations",
    "G": "G-III Apparel",
    "Heritage": "Heritage Crystal Clean",
    "II": "II-VI",
    "Mid": "Microchip Technology",
    "Pro": "Pro-Dex",
    "Perma": "Perma-Fix Environmental Services",
    "Park": "Park-Ohio Holdings",
    "Bio": "Bio-Techne",
    "ROBO": " ROBO Global Robotics and Automation Index ETF",
    "United": "United-Guardian",
    "Uni": "Uni-Pixel",
    "Popular": "Banco Popular",
    "News": "News Corp",
}

# Names without an override pass through unchanged.
names = [name_corrections.get(name, name) for name in names]

## create_example_phrasematcher.py
import spacy
from spacy.matcher import PhraseMatcher

# Load the "en_core_web_sm" pipeline; its vocabulary is shared with the matcher.
nlp = spacy.load("en_core_web_sm")
# No `attr` argument is given, so the matcher compares on its default token attribute.
matcher = PhraseMatcher(nlp.vocab)

## create_matcher.py
import spacy
from spacy.matcher import PhraseMatcher

# Load the "en_core_web_sm" pipeline; its vocabulary is shared with the matcher.
nlp = spacy.load("en_core_web_sm")
# No `attr` argument is given, so the matcher compares on its default token attribute.
matcher = PhraseMatcher(nlp.vocab)

## deduplication_and_viz.py
from spacy import displacy
# displaCy options: one highlight colour per entity label.
colors = {"COMPANY": "#F67DE3", "SYMBOL": "#7DF6D9"}
options = {"colors": colors}

# Payload for displaCy's manual mode: the raw text plus a list of entity dicts
# whose "start"/"end" are character offsets into that text.
plot_data = {
    "text": doc.text,
    "ents": [],
    "title": None,
}

# Distinct matched texts per rule name. When the same text is matched more
# than once, the first match encountered is the one that is kept.
matches_with_dup = {"COMPANY": {}, "SYMBOL": {}}
for match_id, span_start, span_end in matches:
    rule_id = nlp.vocab.strings[match_id]
    span = doc[span_start:span_end]
    # Take the offsets from the span itself. Searching doc.text for the first
    # token's text returns the first place that text occurs anywhere in the
    # document, which is the wrong position whenever the token is repeated or
    # also appears inside an earlier word.
    # setdefault on the rule name also tolerates rules other than the two
    # pre-seeded above instead of raising KeyError.
    matches_with_dup.setdefault(rule_id, {}).setdefault(
        span.text,
        {"start": span.start_char, "end": span.end_char, "label": rule_id},
    )

# Drop a match when its text is contained in a longer matched text of the same
# rule. Every text contains itself, so a containment count of exactly 1 means
# no longer match covers it.
for label_matches in matches_with_dup.values():
    for text in label_matches:
        containing = sum(text in other for other in label_matches)
        if containing == 1:
            plot_data["ents"].append(label_matches[text])

# Order the entities by start offset before rendering.
plot_data["ents"].sort(key=lambda ent: ent["start"])
displacy.render(plot_data, style="ent", options=options, manual=True, jupyter=True)

## example_patterns.py
# Phrases the example matcher should find.
phrases = ["Sergio Mattarella", "Mario Draghi", "president", "prime minister"]

# Run each phrase through the pipeline to get a Doc, then register all of
# them under a single rule name.
patterns = list(map(nlp, phrases))
matcher.add("PatternList", patterns)

## get_list.sh
# Notebook shell escape: download nasdaq-listed-symbols.csv from datahub.io
# into the current working directory.
!wget https://datahub.io/core/nasdaq-listings/r/nasdaq-listed-symbols.csv

## get_wiki_page.py
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents
# Wikipedia page that lists the S&P 500 constituents.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# requests applies no timeout by default, so a stalled connection would hang
# the notebook indefinitely; bound the wait explicitly.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx reply instead of handing an error page to the parser.
response.raise_for_status()

## html_to_pandas.py
from io import StringIO

# str(table) is the markup of the selected element. It is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in recent
# pandas releases; a file-like object is accepted by old and new versions.
# read_html returns a list of DataFrames; the first one is used, and it is
# already a DataFrame, so no further conversion is needed.
df = pd.read_html(StringIO(str(table)))[0]

## install.sh
# Install the packages imported by the name-cleaning (cleanco) and
# Wikipedia-scraping (requests, bs4) snippets.
# NOTE(review): spacy and pandas are imported elsewhere but not installed here --
# presumably preinstalled in the target environment; confirm.
pip install cleanco requests beautifulsoup4

## load_nasdaq_list.py
import pandas as pd
# Read the downloaded NASDAQ listing from the working directory. Other snippets
# read its 'Symbol' column.
# NOTE(review): read_csv's default NA handling turns cells such as "NA" into NaN;
# confirm no ticker or name in the file is affected.
data = pd.read_csv("nasdaq-listed-symbols.csv")

## match_lower_example.py
# attr="LOWER" makes the matcher compare the lowercase form of each token,
# so the phrases below match regardless of capitalisation.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# The list was previously bound to `name` while the comprehension below
# iterated over `names`, so these two phrases were never the ones registered.
names = ["Sergio Mattarella", "Mario Draghi"]

# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(name) for name in names]
matcher.add("Names", patterns)

# Print the text of every span the matcher finds in `doc`.
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

## remove_empty_entry.py
# Keep every entry except the empty string and a lone single space.
names = [
    entry
    for entry in cleaned_names
    if not (entry == " " or len(entry) == 0)
]

## remove_parenthesis.py
import re

# One innermost "(...)" group: an opening parenthesis, any run of characters
# that are not parentheses, and a closing parenthesis.
_PAREN_GROUP = re.compile(r"\([^()]*\)")


def remove_parenthesis(name):
    """Return *name* with every balanced parenthesised group removed.

    Only the parentheses and the text between them are deleted; surrounding
    whitespace is left untouched, so "Foo (Bar)" becomes "Foo " exactly as
    before. Nested groups are removed from the inside out. An unmatched "("
    or ")" is left in place instead of raising ``ValueError``.

    Args:
        name: The string to clean.

    Returns:
        The cleaned string, or *name* unchanged when it holds no balanced group.
    """
    previous = None
    # Repeat until a pass changes nothing: removing an inner group can expose
    # the group that enclosed it.
    while previous != name:
        previous = name
        name = _PAREN_GROUP.sub("", name)
    return name

# Pass every value of the 'Company Name' column through remove_parenthesis and
# store the result back in the same column.
combined_list['Company Name'] = combined_list['Company Name'].apply(remove_parenthesis)

## rename_column.py
# Keep only the ticker and security-name columns, and relabel the latter as
# "Company Name".
SP500 = df[['Symbol', 'Security']].rename(columns={"Security": "Company Name"})

## select_table.py
# Parse the downloaded page with the standard-library HTML parser backend.
soup = BeautifulSoup(response.text, 'html.parser')
# find() returns the first <table> carrying the "wikitable" class (None if absent).
table = soup.find('table', attrs={'class': "wikitable"})

## test_example_matcher.py
# Sample paragraph for the example matcher, written as adjacent string
# literals that concatenate into a single line of text.
doc = nlp(
    "A joint session of Italian parliament and some regional delegates, "
    "known as “great electors,” began a secret ballot on Monday to elect the next "
    "Italian president to replace the current officeholder, Sergio Mattarella. "
    "It is a focus of special attention because a top contender for the job is "
    "the prime minister, Mario Draghi, a titan of Europe who in just a year in "
    "power has stabilized Italy’s politics and initiated long-overdue overhauls."
)

# Run the matcher and print the text of each matched token span.
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

## test_matcher.py
# Sample market-news text for the company/ticker matcher, written as adjacent
# string literals that concatenate into one string (it contains one "\n").
Text = (
    "Microsoft (MSFT) dipped 2.4% after announcing the software giant will "
    "buy video game company Activision Blizzard, Inc (ATVI) in an all-cash transaction "
    "valued at $68.7 billion. \nThe shortened trading week will feature quarterly "
    "reports from 35 companies in the S&P 500, including Bank of America (BAC), "
    "UnitedHealth Group(UNH), and Netflix (NFLX). General Motors (GM) said it "
    "will invest roughly $6.6 billion in its home state of Michigan through "
    "2024. GM has projected it will overtake Tesla (TSLA) as the "
    "top U.S.-based seller of electric vehicles by mid-decade. Retailer Gap (GPS) "
    "shares fell 6.7% after Morgan Stanley downgraded the retailer."
)

doc = nlp(Text)
matches = matcher(doc)
for match_id, start, end in matches:
    # Translate the numeric match id back to its rule name, e.g. 'COMPANY'.
    rule_id = nlp.vocab.strings[match_id]
    # Slice out the matched tokens.
    span = doc[start:end]
    print(rule_id, span.text)
	# NOTE(review): everything from here to the end repeats statements that also
	# appear earlier in this file, with every line indented by a single tab. The
	# bodies of the `for`/`if` statements below sit at the same depth as their
	# headers, so this region is not valid Python as written. It looks like a
	# paste artifact -- confirm whether it can be removed.
	patterns = [nlp.make_doc(name) for name in names]
	matcher.add("COMPANY", patterns)

	patterns = [nlp.make_doc(symbol) for symbol in data['Symbol']]
	matcher.add("SYMBOL", patterns)
	from cleanco import basename
	combined_list['Cleaned Name'] = combined_list['Company Name'].apply(basename)
	combined_list['Cleaned Name'] = combined_list['Cleaned Name'].apply(basename)

	names = pd.concat([combined_list['Company Name'], combined_list['Cleaned Name']], ignore_index = True).drop_duplicates()
	name_corrections = {"A": "A-Mark", "Federal": "Federal-Mogul",
	"Global": "Global-Tech Advanced Innovations",
	"G": "G-III Apparel", "Heritage": "Heritage Crystal Clean",
	"II": "II-VI", "Mid": "Microchip Technology",
	"Pro":"Pro-Dex", "Perma":"Perma-Fix Environmental Services",
	"Park": "Park-Ohio Holdings", "Bio": "Bio-Techne",
	"ROBO": " ROBO Global Robotics and Automation Index ETF",
	"United": "United-Guardian", "Uni":"Uni-Pixel",
	"Popular" : "Banco Popular", "News": "News Corp",
	}

	names = [name_corrections[name] if name in name_corrections.keys() else name for name in names ]
	import spacy
	from spacy.matcher import PhraseMatcher

	nlp = spacy.load("en_core_web_sm")
	matcher = PhraseMatcher(nlp.vocab)
	from spacy import displacy
	# displacy options
	colors = {"COMPANY": "#F67DE3", "SYMBOL": "#7DF6D9"}
	options = {"colors": colors}

	plot_data = {
	"text": doc.text,
	"ents": [],
	"title": None
	}

	matches_with_dup = {"COMPANY":{}, "SYMBOL": {}}
	# NOTE(review): the loop body below has lost its indentation.
	for match_id, span_start, span_end in matches:

	rule_id = nlp.vocab.strings[match_id]
	text = doc[span_start: span_end].text
	# NOTE(review): index() returns the first place the first token's text occurs
	# in doc.text, which need not be where this match is.
	start_idx = doc.text.index(doc[span_start].text)
	end_idx = start_idx + len(text)
	matches_with_dup[rule_id][text] = {"start": start_idx, "end": end_idx, "label": rule_id}

	# substring names will appear multiple times but the expanded
	# names will appear only once
	# NOTE(review): the nesting of the loops and conditions below has been lost.
	for ent_type in matches_with_dup.keys():
	matches = matches_with_dup[ent_type]
	keys = matches.keys()
	counts = {text:0 for text in keys}
	for text in keys:
	for key in keys:
	if text in key:
	counts[text] += 1
	for text, count in counts.items():
	if count == 1:
	plot_data['ents'].append(matches[text])

	#sort the matches by start index
	plot_data['ents'] = sorted(plot_data['ents'], key=lambda ent: ent["start"])
	displacy.render(plot_data , style="ent", options=options, manual=True, jupyter =True)
	# List of Patterns To Match For
	phrases = ["Sergio Mattarella", "Mario Draghi", "president", "prime minister"]

	# Create Doc Objects For The Phrases
	patterns = [nlp(text) for text in phrases ]
	matcher.add("PatternList", patterns)
	import requests # library to handle requests
	from bs4 import BeautifulSoup # library to parse HTML documents
	url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
	response=requests.get(url)
	# NOTE(review): `table` is not assigned anywhere in this flattened region.
	df=pd.read_html(str(table))
	# convert list to dataframe
	df=pd.DataFrame(df[0])
	import pandas as pd
	data = pd.read_csv("nasdaq-listed-symbols.csv")
	matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

	# NOTE(review): the list is bound to `name`, but the comprehension below
	# iterates `names`.
	name = ["Sergio Mattarella", "Mario Draghi"]

	# Only run nlp.make_doc to speed things up
	patterns = [nlp.make_doc(name) for name in names]
	matcher.add("Names", patterns)

	matches = matcher(doc)
	for match_id, start, end in matches:
	span = doc[start:end]
	print(span.text)