Mirror of https://github.com/Findus23/cr-search.git (synced 2024-09-19 15:23:44 +02:00)
Commit b63fc66129: cache lemmatisation of noun chunks
Parent: 6987d5cb2e
1 changed file with 7 additions and 2 deletions
@@ -11,6 +11,7 @@ from models import Episode, Line, db, Phrase
 from stopwords import STOP_WORDS
 
 os.nice(15)
+lemma_cache = {}
 
 nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
 nlp.Defaults.stop_words = STOP_WORDS
@@ -37,13 +38,17 @@ for episode in Episode.select().where((Episode.phrases_imported == False) & (Epi
     print("nlp finished")
     nouns = {}
     chunk: Span
-    noun_chunks=list(doc.noun_chunks)
+    noun_chunks = list(doc.noun_chunks)
     with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
         for chunk in noun_chunks:
             bar()
             tok: Token
             noun_chunk = "".join([tok.text_with_ws for tok in chunk if not tok.is_stop]).strip()
-            lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
+            if noun_chunk in lemma_cache:
+                lemmas = lemma_cache[noun_chunk]
+            else:
+                lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
+                lemma_cache[noun_chunk] = lemmas
             if lemmas not in nouns:
                 nouns[lemmas] = {
                     "count": 1,
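The patch replaces the unconditional nlp() call with a lookup in the module-level lemma_cache dict, so each distinct noun-chunk string is run through the pipeline only once. Below is a minimal standalone sketch of the same memoization pattern, assuming a loaded spaCy pipeline; the helper names (lemmatize_chunk, lemmatize_chunk_cached) and the functools.lru_cache variant are illustrative, not part of the repository:

# Sketch of the caching pattern introduced by this commit (names are illustrative).
from functools import lru_cache

import spacy

nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat"])

# Hand-rolled memo table, as in the patch: maps a noun-chunk string to its lemmas.
lemma_cache: dict[str, str] = {}


def lemmatize_chunk(noun_chunk: str) -> str:
    # Re-run the pipeline only for strings we have not seen before.
    if noun_chunk in lemma_cache:
        return lemma_cache[noun_chunk]
    lemmas = "|".join(token.lemma_ for token in nlp(noun_chunk))
    lemma_cache[noun_chunk] = lemmas
    return lemmas


# The same behaviour with functools.lru_cache instead of an explicit dict:
@lru_cache(maxsize=None)
def lemmatize_chunk_cached(noun_chunk: str) -> str:
    return "|".join(token.lemma_ for token in nlp(noun_chunk))

Transcript noun chunks presumably repeat heavily across episodes, so keying the cache on the surface string trades a little memory for skipping a full nlp() pass on every repeat; an unbounded cache is reasonable here because the loop is a one-shot import job.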