1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-19 15:23:44 +02:00

cache lemmatisation of noun chunks

This commit is contained in:
Lukas Winkler 2020-08-13 15:48:46 +02:00
parent 6987d5cb2e
commit b63fc66129
Signed by: lukas
GPG key ID: 54DE4D798D244853

View file

@@ -11,6 +11,7 @@ from models import Episode, Line, db, Phrase
from stopwords import STOP_WORDS
os.nice(15)
lemma_cache = {}
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
nlp.Defaults.stop_words = STOP_WORDS
@@ -37,13 +38,17 @@ for episode in Episode.select().where((Episode.phrases_imported == False) & (Epi
print("nlp finished")
nouns = {}
chunk: Span
-noun_chunks=list(doc.noun_chunks)
+noun_chunks = list(doc.noun_chunks)
with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
for chunk in noun_chunks:
bar()
tok: Token
noun_chunk = "".join([tok.text_with_ws for tok in chunk if not tok.is_stop]).strip()
-lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
+if noun_chunk in lemma_cache:
+lemmas = lemma_cache[noun_chunk]
+else:
+lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
+lemma_cache[noun_chunk] = lemmas
if lemmas not in nouns:
nouns[lemmas] = {
"count": 1,