1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-19 15:23:44 +02:00
cr-search/phrases.py

49 lines
1.7 KiB
Python
Raw Normal View History

2020-03-07 10:45:39 +01:00
from collections import Counter
import spacy as spacy
from progress.bar import IncrementalBar
from spacy.lang.en import English
from spacy.tokens.span import Span
from spacy.tokens.token import Token
from models import Episode, Line, db, Phrase
from stopwords import STOP_WORDS
# Load the small English spaCy pipeline. Only noun chunks and stop-word flags
# are used below, so the "ner" and "textcat" components are disabled.
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
# Replace spaCy's default English stop words with the project's own list.
# NOTE(review): this assignment happens after the pipeline has been loaded;
# confirm that Token.is_stop really reflects STOP_WORDS for the loaded vocab.
nlp.Defaults.stop_words = STOP_WORDS

# Substrings that are stripped from the transcript before it is parsed.
delete = ["\"", "--", "(", ")", "[", "]"]

# The `== False` comparison is handed to `.where()` as a query expression, so
# it must stay a comparison and not be rewritten as `not ...` / `is False`.
for episode in Episode.select().where(Episode.phrases_imported == False):  # noqa: E712
    print(f"Campaign {episode.season} Episode {episode.episode_number}")

    # Build one text per episode: consecutive lines spoken by the same person
    # are joined with a space, every change of speaker starts a new text line.
    # NOTE(review): the query has no explicit ordering; this relies on the
    # database returning the lines in transcript order -- verify.
    person = None
    parts = []
    line_select = Line.select().where(Line.episode == episode)
    with IncrementalBar('Parsing lines', max=line_select.count(), suffix="%(percent).1f%% - %(eta)ds") as bar:
        for line in line_select:
            bar.next()
            if line.person == person:
                parts.append(" " + line.text)
            else:
                person = line.person
                # The first line of a new speaker has to be kept as well;
                # previously only the "\n" separator was appended here and the
                # line's text was silently dropped.
                parts.append("\n" + line.text)
    text = "".join(parts)

    # Remove quotation marks, dashes and brackets once, after the whole
    # transcript has been assembled.
    for string in delete:
        text = text.replace(string, "")

    print("run nlp")
    doc = nlp(text)

    # Collect every noun chunk with its stop words removed.
    nouns = set()
    span: Span
    for span in doc.noun_chunks:
        tok: Token
        noun_chunk = "".join(tok.text_with_ws for tok in span if not tok.is_stop).strip()
        nouns.add(noun_chunk)
    # NOTE(review): `nouns` is a set, so every count produced here is 1.
    # If real per-episode frequencies are wanted, collect the chunks in a list
    # (or feed them to the Counter directly) instead.
    cnt = Counter(nouns)

    with db.atomic():
        with IncrementalBar('inserting phrases', max=len(cnt)) as bar:
            for phrase, count in cnt.items():
                bar.next()
                # Skip chunks that span a speaker change (they contain the
                # "\n" separator) and chunks shorter than four characters.
                if "\n" in phrase or len(phrase) < 4:
                    continue
                Phrase.create(text=phrase, count=count, episode=episode)
        # Mark the episode as processed inside the same atomic block as the
        # inserts, so the `phrases_imported == False` filter above skips it on
        # the next run instead of inserting all phrases a second time.
        episode.phrases_imported = True
        episode.save()