# dhruvpathak/spacy_lemma.py

## spacy_lemma.py
# tested on : python 3.7.0, spacy 2.0.12

# `import urllib` alone does not load the `urllib.request` submodule; import it
# explicitly so `urllib.request.urlopen` does not depend on some other package
# having imported it first.
import urllib.request
from collections import defaultdict
from pprint import pprint

import spacy

nlp = spacy.load('en')

# Fetch a long essay text. The context manager closes the HTTP response once
# the body has been read; undecodable bytes are dropped ('ignore').
text_url = "https://pastebin.com/raw/M7RwNi5q"
with urllib.request.urlopen(text_url) as response:
    input_text = response.read().decode('utf-8', 'ignore')

parsed = nlp(input_text)

# Maps "<token text>_<POS tag>" to the list of lemmas seen for that pair,
# one entry per occurrence of the token in the parsed text.
lemma_map = defaultdict(list)

# Store every lemma against the text+POS key of its token.
for token in parsed:
    hash_key = '{0}_{1}'.format(token.text, token.pos_)
    lemma_map[hash_key].append(token.lemma_)

# Print the words and their lemmas, most frequent text+POS pairs first.
sorted_items = sorted(lemma_map.items(), key=lambda item: -len(item[1]))
pprint(sorted_items)

# For each combination of a token's text and its POS tag, report whether all
# recorded lemmas are identical or whether different lemmas were produced.
for key, value in lemma_map.items():
    if len(set(value)) == 1:
        print('lemmas SAME for key:{0},lemma:{1}'.format(key, value[0]))
    else:
        print('lemmas DIFF for key:{0},lemmas:{1}'.format(key, value))

# observation: for a token, given its role in a sentence, with a
# common POS tag, its lemma form is the same for this data.
	# tested on : python 3.7.0, spacy 2.0.12

# `import urllib` alone does not load the `urllib.request` submodule; import it
# explicitly so `urllib.request.urlopen` is guaranteed to resolve.
import urllib.request
from collections import defaultdict
from pprint import pprint

import spacy

nlp = spacy.load('en')

# Fetch a long essay text; the `with` block closes the HTTP response after the
# body is read, and undecodable bytes are dropped ('ignore').
text_url = "https://pastebin.com/raw/M7RwNi5q"
with urllib.request.urlopen(text_url) as response:
    input_text = response.read().decode('utf-8', 'ignore')

parsed = nlp(input_text)

# A dict keyed by "<token text>_<POS tag>" whose value is the list of lemmas
# recorded for that key (one entry per token occurrence).
lemma_map = defaultdict(list)

# Store all the lemmas against the text+POS key of the tokens. The loop body
# must be nested under the `for` header for the script to parse.
for token in parsed:
    hash_key = '{0}_{1}'.format(token.text, token.pos_)
    lemma_map[hash_key].append(token.lemma_)

# Check the printed output of words and their lemmas, longest lemma lists first.
sorted_items = sorted(lemma_map.items(), key=lambda item: -len(item[1]))
pprint(sorted_items)

# Check whether, for a combination of a token's text and its POS tag, there
# are any different lemmas.
for key, value in lemma_map.items():
    if len(set(value)) == 1:
        print('lemmas SAME for key:{0},lemma:{1}'.format(key, value[0]))
    else:
        print('lemmas DIFF for key:{0},lemmas:{1}'.format(key, value))

# observation: for a token, given its role in a sentence, with a
# common POS tag, its lemma form is the same for this data.
# No results found