From b63fc661290f234738e9a966b6be3f746949192c Mon Sep 17 00:00:00 2001
From: Lukas Winkler
Date: Thu, 13 Aug 2020 15:48:46 +0200
Subject: [PATCH] cache lemmatisation of noun chunks

---
 phrases.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/phrases.py b/phrases.py
index e199e00..a6231ab 100644
--- a/phrases.py
+++ b/phrases.py
@@ -11,6 +11,7 @@ from models import Episode, Line, db, Phrase
 from stopwords import STOP_WORDS
 
 os.nice(15)
+lemma_cache = {}
 nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
 nlp.Defaults.stop_words = STOP_WORDS
 
@@ -37,13 +38,17 @@ for episode in Episode.select().where((Episode.phrases_imported == False) & (Epi
     print("nlp finished")
     nouns = {}
     chunk: Span
-    noun_chunks=list(doc.noun_chunks)
+    noun_chunks = list(doc.noun_chunks)
     with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
         for chunk in noun_chunks:
             bar()
             tok: Token
             noun_chunk = "".join([tok.text_with_ws for tok in chunk if not tok.is_stop]).strip()
-            lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
+            if noun_chunk in lemma_cache:
+                lemmas = lemma_cache[noun_chunk]
+            else:
+                lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
+                lemma_cache[noun_chunk] = lemmas
             if lemmas not in nouns:
                 nouns[lemmas] = {
                     "count": 1,