mirror of
https://github.com/Findus23/cr-search.git
synced 2024-09-19 15:23:44 +02:00
50 lines
1.7 KiB
Python
50 lines
1.7 KiB
Python
from collections import Counter
|
|
|
|
import spacy as spacy
|
|
from alive_progress import alive_bar
|
|
from spacy.lang.en import English
|
|
from spacy.tokens.span import Span
|
|
from spacy.tokens.token import Token
|
|
|
|
from models import Episode, Line, db, Phrase
|
|
from stopwords import STOP_WORDS
|
|
|
|
# Load the small English pipeline. NER and text classification are disabled
# because this script only reads noun chunks and stop-word flags.
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
# Replace spaCy's default stop words with the project's own list.
nlp.Defaults.stop_words = STOP_WORDS

# Substrings stripped from the transcript before it is handed to spaCy.
delete = ["\"", "--", "(", ")", "[", "]"]

# Process every episode whose text is imported but whose phrases are not.
# The explicit `== False` / `== True` comparisons are intentional: peewee
# builds the SQL WHERE clause from these overloaded operators.
for episode in Episode.select().where(
        (Episode.phrases_imported == False)  # noqa: E712
        & (Episode.text_imported == True)  # noqa: E712
):
    print(f"Campaign {episode.season} Episode {episode.episode_number}")

    # Build one text blob for the episode: consecutive lines by the same
    # person are joined with spaces, and each change of speaker starts a
    # new line.
    # NOTE(review): no explicit ordering is applied to the query; this
    # presumably relies on the database returning lines in transcript
    # order -- confirm.
    person = None
    parts = []
    line_select = Line.select().where(Line.episode == episode)
    with alive_bar(line_select.count(), title='Parsing lines') as bar:
        for line in line_select:
            bar()
            if line.person == person:
                parts.append(" " + line.text)
            else:
                person = line.person
                # Keep the first line of the new speaker; previously only
                # the newline was appended and this text was dropped.
                parts.append("\n" + line.text)
    text = "".join(parts)

    for string in delete:
        text = text.replace(string, "")

    print("run nlp")
    doc = nlp(text)

    # Count how often each noun chunk (with stop words removed) occurs.
    # Counting directly matters: de-duplicating through a set first would
    # make every count equal to 1.
    cnt = Counter()
    span: Span
    for span in doc.noun_chunks:
        tok: Token
        noun_chunk = "".join(
            tok.text_with_ws for tok in span if not tok.is_stop
        ).strip()
        cnt[noun_chunk] += 1

    # Insert the phrases and flag the episode as done in one transaction,
    # so a failure part-way through does not leave a half-imported episode.
    with db.atomic():
        with alive_bar(len(cnt), title='inserting phrases') as bar:
            for phrase, count in cnt.items():
                bar()
                # Skip chunks that span a speaker change and chunks that
                # are too short (including ones emptied by stop-word
                # removal).
                if "\n" in phrase or len(phrase) < 4:
                    continue
                Phrase.create(text=phrase, count=count, episode=episode)
        episode.phrases_imported = True
        episode.save()
|