Last active
December 16, 2021 16:32
-
-
Save Aditya1001001/f8a6fdd1fc6b8a10e763303090931a7e to your computer and use it in GitHub Desktop.
Mining Financial Stock News Using SpaCy Matcher
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def dividend_info(article):
    """Print the dividend facts that can be mined from one news article.

    Two kinds of headline are handled:

    * headlines containing the word "date": the date is taken from the
      headline, the amount and pay date from the article summary;
    * every other headline: a "US$<amount>" figure is looked for in the
      headline itself.

    Nothing is printed when the relevant date / amount cannot be found.

    Args:
        article: mapping with a 'title' entry and, for "date" headlines,
            a 'summary' entry (both text).

    Returns:
        None; results are written to stdout.
    """
    title = article['title']
    headline = nlp(title)

    # NOTE(review): the indentation of the original snippet was lost; the
    # `else` branch is assumed to pair with this outer test -- confirm.
    if any(token.text.lower() == 'date' for token in headline):
        date = get_date(headline)
        if date:
            ticker = get_ticker(headline)
            # Parse the summary once and reuse it for both look-ups.
            summary = nlp(article['summary'])
            amount = get_amount_summary(summary)
            pay_date = get_pay_date(summary)
            print("HEADLINE: " + title)
            print(f"\nTICKER: {ticker}\nDATE: {date}"
                  f"\nAMOUNT: {amount} per share to be paid on {pay_date}\n")
    else:
        dividend = get_amount_headline(headline)
        if dividend:
            ticker = get_ticker(headline)
            print("NEWS HEADLINE: " + title)
            print(f"\nTICKER: {ticker}\nAMOUNT: {dividend}\n")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report on each loaded article in turn, sleeping 0.2 s after every one.
for news_item in news_articles:
    dividend_info(news_item)
    time.sleep(0.2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_date(doc):
    """Return the first "<proper noun> <number>, <number>" span in ``doc``.

    The pattern is meant for dates such as "November 12, 2021".

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp``.

    Returns:
        The ``Span`` of the first match, or ``False`` when nothing matches.
    """
    date_matcher = Matcher(nlp.vocab)
    pattern = [{"POS": "PROPN"}, {"LIKE_NUM": True},
               {"text": ","}, {"LIKE_NUM": True}]
    # Patterns are passed as a list: this call form works on spaCy 2.2.2+
    # and is the only one accepted by spaCy 3.
    date_matcher.add("EX_DATE", [pattern])

    # Run the matcher once and reuse the result.
    matches = date_matcher(doc)
    if not matches:
        return False
    _, start, end = matches[0]
    return doc[start:end]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_amount_headline(doc):
    """Return the first "US$<number>" span found in ``doc``.

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp`` (typically a headline).

    Returns:
        The ``Span`` of the first match, or ``False`` when nothing matches.
    """
    dividend_matcher = Matcher(nlp.vocab)
    pattern = [{"ORTH": "US$"}, {"LIKE_NUM": True}]
    # Patterns are passed as a list: this call form works on spaCy 2.2.2+
    # and is the only one accepted by spaCy 3.
    dividend_matcher.add("USD", [pattern])

    # Run the matcher once and reuse the result.
    matches = dividend_matcher(doc)
    if not matches:
        return False
    _, start, end = matches[0]
    return doc[start:end]
# Example: extract the dividend amount from a sample headline.
sample_headline = "There's A Lot To Like About ConnectOne Bancorp's (NASDAQ:CNOB) Upcoming US$0.13 Dividend"
doc = nlp(sample_headline)
print(get_amount_headline(doc))
# OUTPUT
# US$0.13
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_org(doc):
    """Return the organisation name that precedes an opening parenthesis.

    The pattern accepts one or more proper nouns, an optional coordinating
    conjunction, further proper nouns, an optional possessive marker and
    finally "(".  When several spans match, the longest one is used (the
    first of them on ties).

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp`` (typically a headline).

    Returns:
        The matched ``Span`` without its trailing "(" token, or the string
        "<doc text> -> NO MATCH FOUND" when nothing matches.
    """
    org_matcher = Matcher(nlp.vocab)
    pattern = [{'POS': 'PROPN', 'OP': '+'},
               {'POS': 'CCONJ', 'OP': '?'},
               {'POS': 'PROPN', 'OP': '*'},
               {'ORTH': '\'', 'OP': '?'},
               {'ORTH': '\'s', 'OP': '?'},
               {'ORTH': '(', 'OP': '+'}]
    # Patterns are passed as a list: this call form works on spaCy 2.2.2+
    # and is the only one accepted by spaCy 3.
    org_matcher.add("ORG", [pattern])

    matches = org_matcher(doc)
    if not matches:
        return f"{doc.text} -> NO MATCH FOUND"

    # Each match is (match_id, start, end); keep the widest span.
    # max() returns the first maximal element, so ties resolve to the
    # earliest match.
    _, start, end = max(matches, key=lambda m: m[2] - m[1])
    # Drop the final token, which is the "(" that anchors the pattern.
    return doc[start:end - 1]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_pay_date(doc):
    """Return the date that follows "paid on" in ``doc``.

    The pattern is: "paid", "on", a proper noun, a number, ",", a number
    and a closing "." -- e.g. "paid on November 30, 2021.".

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp`` (typically a summary).

    Returns:
        The ``Span`` holding just the date (without "paid on" and without
        the final "."), or ``False`` when nothing matches.
    """
    pay_date_matcher = Matcher(nlp.vocab)
    pattern = [{"ORTH": "paid"}, {"ORTH": "on"},
               {"POS": "PROPN"}, {"LIKE_NUM": True},
               {"ORTH": ","}, {"LIKE_NUM": True},
               {"ORTH": "."}]
    # Patterns are passed as a list: this call form works on spaCy 2.2.2+
    # and is the only one accepted by spaCy 3.
    pay_date_matcher.add("AMOUNT", [pattern])

    matches = pay_date_matcher(doc)
    if not matches:
        # Mirror get_date / get_amount_headline instead of raising
        # IndexError on an empty result.
        return False
    _, start, end = matches[0]
    # Skip the two leading tokens ("paid", "on") and the trailing ".".
    return doc[start + 2:end - 1]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_amount_summary(doc):
    """Return the "$<number>" amount that precedes "per share" in ``doc``.

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp`` (typically a summary).

    Returns:
        The ``Span`` holding "$" and the number, or ``False`` when nothing
        matches.
    """
    per_share_matcher = Matcher(nlp.vocab)
    pattern = [{"ORTH": "$"}, {"LIKE_NUM": True},
               {"LOWER": "per"}, {"LOWER": "share"}]
    # Patterns are passed as a list: this call form works on spaCy 2.2.2+
    # and is the only one accepted by spaCy 3.
    per_share_matcher.add("AMOUNT", [pattern])

    matches = per_share_matcher(doc)
    if not matches:
        # Mirror get_date / get_amount_headline instead of raising
        # IndexError on an empty result.
        return False
    _, start, end = matches[0]
    # Drop the two trailing tokens ("per", "share").
    return doc[start:end - 2]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_ticker(doc):
    """Return the first parenthesised ticker expression in ``doc``.

    The pattern is "(", an alphabetic token, any number of ":" tokens, any
    number of alphabetic tokens, then ")".

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp`` (typically a headline).

    Returns:
        The ``Span`` of the first match -- the parentheses are part of the
        span -- or the string "<doc text> -> NO MATCH FOUND" when nothing
        matches.
    """
    ticker_matcher = Matcher(nlp.vocab)
    pattern = [{'ORTH': '('}, {'IS_ALPHA': True},
               {'ORTH': ':', 'OP': '*'},
               {'IS_ALPHA': True, 'OP': '*'},
               {'ORTH': ')'}]
    # Patterns are passed as a list: this call form works on spaCy 2.2.2+
    # and is the only one accepted by spaCy 3.
    ticker_matcher.add("ORG", [pattern])

    matches = ticker_matcher(doc)
    if not matches:
        return f"{doc.text} -> NO MATCH FOUND"
    _, start, end = matches[0]
    return doc[start:end]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Register the pattern with the matcher.  Patterns are passed as a list:
# this call form works on spaCy 2.2.2+ and is the only one accepted by
# spaCy 3.
matcher.add("HELLO_WORLD", [pattern])
# Create a Doc from the string to be queried.
doc = nlp("hello world!\nHello World.")
matches = matcher(doc)
# Each match is a (match_id, start, end) triple of token indices.
for match_id, start, end in matches:
    span = doc[start:end]  # The matched span
    print(match_id, start, end, span.text)
# Output
# 2008415248711360438 0 3 hello world!
# 2008415248711360438 4 7 Hello World.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Token pattern: "hello" and "world" compared on their lower-cased form,
# followed by one or more punctuation tokens.
pattern = [
    {"LOWER": "hello"},
    {"LOWER": "world"},
    {"IS_PUNCT": True, "OP": "+"},
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import time | |
| import spacy | |
| from spacy.matcher import Matcher |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import spacy | |
| from spacy.matcher import Matcher | |
# Load the 'en_core_web_sm' pipeline and build a Matcher on its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Match every token whose lemma is "run", whatever its inflection.
run_matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA": "run"}]
# Patterns are passed as a list: this call form works on spaCy 2.2.2+ and
# is the only one accepted by spaCy 3.
run_matcher.add("RUN", [pattern])
doc = nlp("Only when it dawned on him that he had nowhere left to run to, he finally stopped running.")
matches = run_matcher(doc)
# The match id is not needed here, only the token offsets.
for _, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 12 13 run
# 18 19 running
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the articles from disk.  The encoding is given explicitly so the
# result does not depend on the platform's default encoding.
with open('data.json', 'r', encoding='utf-8') as f:
    news_articles = json.load(f)
# Show the first article as a quick sanity check of the data.
print(news_articles[0])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Match a verb whose lemma is acquire/buy/purchase followed by one or more
# nouns or adjectives.
buy_matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA": {"IN": ["acquire", "buy", 'purchase']}},
           {"POS": {"IN": ["NOUN", "ADJ"]}, 'OP': '+'}]
# Patterns are passed as a list: this call form works on spaCy 2.2.2+ and
# is the only one accepted by spaCy 3.
buy_matcher.add("BUY", [pattern])
doc = nlp("While his friends were buying things they didn't need, Charlie was busy acquiring productive assets.")
matches = buy_matcher(doc)
# The match id is not needed here, only the token offsets.
for _, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 4 6 buying things
# 14 16 acquiring productive
# 14 17 acquiring productive assets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print every token of a sample sentence next to its coarse POS tag.
sample_sentence = "BlackRock Innovation and Growth Trust (BIGZ) will begin trading ex-dividend on November 12, 2021."
doc = nlp(sample_sentence)
for token in doc:
    text, pos = token.text, token.pos_
    print(f"{text:15}, {pos:<10}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print the title of every article, one per line.
titles = [art['title'] for art in news_articles]
print("\n".join(titles))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Match tokens whose lower-cased text matches the regular expression
# "colou?r", i.e. both the "color" and "colour" spellings.
regex_matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": {"REGEX": "colou?r"}}]
# Patterns are passed as a list: this call form works on spaCy 2.2.2+ and
# is the only one accepted by spaCy 3.
regex_matcher.add("BUY", [pattern])
doc = nlp("Color is the spelling used in the United States. Colour is used in other English-speaking countries.")
matches = regex_matcher(doc)
# The match id is not needed here, only the token offsets.
for _, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 0 1 Color
# 10 11 Colour
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: pull the organisation and the ticker out of a sample headline.
doc = nlp("BlackRock Energy and Resources Trust (BGR) Ex-Dividend Date Scheduled for November 12, 2021")
print(get_org(doc))
print(get_ticker(doc))
# Output
# BlackRock Energy and Resources Trust
# (BGR)
# (get_ticker returns the matched span, parentheses included.)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: pull the per-share amount and the pay date out of a summary.
sample_summary = "BlackRock Innovation and Growth Trust (BIGZ) will begin trading ex-dividend on November 12, 2021. A cash dividend payment of $0.1 per share is scheduled to be paid on November 30, 2021. Shareholders who purchased BIGZ prior to the ex-dividend date are eligible for the cash dividend payment. This marks the 6th quarter that BIGZ has paid the same dividend. At the current stock price of $17.99, the dividend yield is 6.67%."
doc = nlp(sample_summary)
print(get_amount_summary(doc))
print(get_pay_date(doc))
# Output
# $0.1
# November 30, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment