Created
July 22, 2018 12:08
-
-
Save dhruvpathak/a7b96f469fd404a2351de69a5ff41144 to your computer and use it in GitHub Desktop.
Lemmatization and POS tag correlation in spaCy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lemmatization and POS tag correlation in spaCy.
#
# Groups every token of a long text by "<text>_<POS>" and reports whether all
# occurrences of that combination were given the same lemma.
#
# tested on : python 3.7.0, spacy 2.0.12
#
# Author's observation: for a token, given its role in a sentence, with a
# common POS tag, its lemma form is the same for this data.

# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request
from collections import defaultdict
from pprint import pprint

# A long essay text used as sample input.
TEXT_URL = "https://pastebin.com/raw/M7RwNi5q"


def fetch_text(url):
    """Download *url* and return its body as text.

    The body is decoded as UTF-8; bytes that cannot be decoded are dropped.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8', 'ignore')


def build_lemma_map(tokens):
    """Group lemmas by token text and POS tag.

    *tokens* is any iterable of objects exposing ``text``, ``pos_`` and
    ``lemma_`` attributes. Returns a ``defaultdict(list)`` mapping
    ``"<text>_<POS>"`` to the lemmas seen for that key, one entry per
    occurrence, in encounter order.
    """
    lemma_map = defaultdict(list)
    for token in tokens:
        hash_key = '{0}_{1}'.format(token.text, token.pos_)
        lemma_map[hash_key].append(token.lemma_)
    return lemma_map


def sort_by_occurrences(lemma_map):
    """Return ``(key, lemmas)`` pairs, most frequent key first.

    The sort is stable, so keys with equal counts keep their insertion order.
    """
    return sorted(
        lemma_map.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )


def describe_lemma_consistency(lemma_map):
    """Yield one report line per key.

    The line says whether every occurrence of that text + POS combination
    was given the same lemma (SAME) or not (DIFF).
    """
    for key, lemmas in lemma_map.items():
        if len(set(lemmas)) == 1:
            yield 'lemmas SAME for key:{0},lemma:{1}'.format(key, lemmas[0])
        else:
            yield 'lemmas DIFF for key:{0},lemmas:{1}'.format(key, lemmas)


def main():
    """Parse the sample essay with spaCy and print the lemma report."""
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines without spaCy installed.
    import spacy

    # NOTE(review): 'en' is the model name used with the tested spaCy 2.0.12;
    # other spaCy versions may need a different model name -- confirm before
    # upgrading.
    nlp = spacy.load('en')
    parsed = nlp(fetch_text(TEXT_URL))

    lemma_map = build_lemma_map(parsed)

    # Check printed output of words and their lemmas.
    pprint(sort_by_occurrences(lemma_map))

    # For each combination of a token's text and POS tag, are there any
    # different lemmas?
    for line in describe_lemma_consistency(lemma_map):
        print(line)


if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Result snippets: