From b63fc661290f234738e9a966b6be3f746949192c Mon Sep 17 00:00:00 2001
From: Lukas Winkler
Date: Thu, 13 Aug 2020 15:48:46 +0200
Subject: [PATCH] cache lemmatisation of noun chunks

---
 phrases.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/phrases.py b/phrases.py
index e199e00..a6231ab 100644
--- a/phrases.py
+++ b/phrases.py
@@ -11,6 +11,7 @@ from models import Episode, Line, db, Phrase
 from stopwords import STOP_WORDS
 
 os.nice(15)
+lemma_cache = {}
 nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
 nlp.Defaults.stop_words = STOP_WORDS
 
@@ -37,13 +38,17 @@ for episode in Episode.select().where((Episode.phrases_imported == False) & (Epi
     print("nlp finished")
     nouns = {}
     chunk: Span
-    noun_chunks=list(doc.noun_chunks)
+    noun_chunks = list(doc.noun_chunks)
     with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
         for chunk in noun_chunks:
             bar()
             tok: Token
             noun_chunk = "".join([tok.text_with_ws for tok in chunk if not tok.is_stop]).strip()
-            lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
+            if noun_chunk in lemma_cache:
+                lemmas = lemma_cache[noun_chunk]
+            else:
+                lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
+                lemma_cache[noun_chunk] = lemmas
             if lemmas not in nouns:
                 nouns[lemmas] = {
                     "count": 1,