mirror of
https://github.com/Findus23/cr-search.git
synced 2024-09-19 15:23:44 +02:00
proper typing in phrases
This commit is contained in:
parent
07ad517d0b
commit
807a240442
1 changed file with 17 additions and 12 deletions
29
phrases.py
29
phrases.py
|
@ -1,4 +1,6 @@
|
||||||
import os
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
import spacy as spacy
|
import spacy as spacy
|
||||||
from alive_progress import alive_bar
|
from alive_progress import alive_bar
|
||||||
|
@ -11,7 +13,15 @@ from models import Episode, Line, db, Phrase
|
||||||
from stopwords import STOP_WORDS
|
from stopwords import STOP_WORDS
|
||||||
|
|
||||||
os.nice(15)
|
os.nice(15)
|
||||||
lemma_cache = {}
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Noun:
|
||||||
|
name: str
|
||||||
|
count: int = 1
|
||||||
|
|
||||||
|
|
||||||
|
lemma_cache: Dict[str, str] = {}
|
||||||
|
|
||||||
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
|
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
|
||||||
nlp.Defaults.stop_words = STOP_WORDS
|
nlp.Defaults.stop_words = STOP_WORDS
|
||||||
|
@ -36,7 +46,7 @@ for episode in Episode.select().where((Episode.phrases_imported == False) & (Epi
|
||||||
print("run nlp")
|
print("run nlp")
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
print("nlp finished")
|
print("nlp finished")
|
||||||
nouns = {}
|
nouns: Dict[str, Noun] = {}
|
||||||
chunk: Span
|
chunk: Span
|
||||||
noun_chunks = list(doc.noun_chunks)
|
noun_chunks = list(doc.noun_chunks)
|
||||||
with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
|
with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
|
||||||
|
@ -50,22 +60,17 @@ for episode in Episode.select().where((Episode.phrases_imported == False) & (Epi
|
||||||
lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)]).lower()
|
lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)]).lower()
|
||||||
lemma_cache[noun_chunk] = lemmas
|
lemma_cache[noun_chunk] = lemmas
|
||||||
if lemmas not in nouns:
|
if lemmas not in nouns:
|
||||||
nouns[lemmas] = {
|
nouns[lemmas] = Noun(noun_chunk)
|
||||||
"count": 1,
|
|
||||||
"name": noun_chunk
|
|
||||||
}
|
|
||||||
else:
|
else:
|
||||||
nouns[lemmas]["count"] += 1
|
nouns[lemmas].count += 1
|
||||||
with db.atomic():
|
with db.atomic():
|
||||||
phrases = []
|
phrases = []
|
||||||
for lemmas, data in nouns.items():
|
for lemmas, data in nouns.items():
|
||||||
phrase = data["name"]
|
if "\n" in data.name:
|
||||||
count = data["count"]
|
|
||||||
if "\n" in phrase:
|
|
||||||
continue
|
continue
|
||||||
if len(phrase) < 4:
|
if len(data.name) < 4:
|
||||||
continue
|
continue
|
||||||
phrases.append(Phrase(text=phrase, count=count, episode=episode))
|
phrases.append(Phrase(text=data.name, count=data.count, episode=episode))
|
||||||
|
|
||||||
num_per_chunk = 100
|
num_per_chunk = 100
|
||||||
chunks = chunked(phrases, num_per_chunk)
|
chunks = chunked(phrases, num_per_chunk)
|
||||||
|
|
Loading…
Reference in a new issue