mirror of https://github.com/Findus23/cr-search.git
cr-search/phrases.py

import os

import spacy
from alive_progress import alive_bar
from peewee import chunked
from spacy.lang.en import English
from spacy.tokens.span import Span
from spacy.tokens.token import Token

from models import Episode, Line, db, Phrase
from stopwords import STOP_WORDS

# lower the process priority so long indexing runs don't hog the machine
os.nice(15)

# cache lemmatisation results across episodes so repeated noun chunks
# don't have to be run through the pipeline again
lemma_cache = {}

# load the small English pipeline; NER and text classification are not needed here
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
# use the project's own stop word list instead of spaCy's default
nlp.Defaults.stop_words = STOP_WORDS
# process every episode whose transcript is already imported
# but whose phrases have not been extracted yet
for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)).order_by(
        Episode.id):
    print(episode.video_number, episode.title)
    person = None
    text = ""
    line_select = Line.select().where(Line.episode == episode)
    with alive_bar(line_select.count(), title='Parsing lines') as bar:
        # join consecutive lines by the same speaker with spaces and
        # start a new paragraph whenever the speaker changes
        for line in Line.select().where(Line.episode == episode).order_by(Line.order):
            bar()
            if line.person == person:
                text += " " + line.text
            else:
                person = line.person
                text += "\n" + line.text

    # remove quotation marks, dashes and brackets from the text
    delete = ["\"", "--", "(", ")", "[", "]"]
    for string in delete:
        text = text.replace(string, "")

    print("run nlp")
    doc = nlp(text)
    print("nlp finished")
    # count noun chunks, keyed by their lemmatised form so that
    # inflected variants of the same phrase are counted together
    nouns = {}
    chunk: Span
    noun_chunks = list(doc.noun_chunks)
    with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
        for chunk in noun_chunks:
            bar()
            tok: Token
            # drop stop words from the chunk and keep the remaining surface text
            noun_chunk = "".join([tok.text_with_ws for tok in chunk if not tok.is_stop]).strip()
            if noun_chunk in lemma_cache:
                lemmas = lemma_cache[noun_chunk]
            else:
                lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)])
                lemma_cache[noun_chunk] = lemmas
            if lemmas not in nouns:
                nouns[lemmas] = {
                    "count": 1,
                    "name": noun_chunk
                }
            else:
                nouns[lemmas]["count"] += 1
    # store the collected phrases for this episode in a single transaction
    with db.atomic():
        phrases = []
        for lemmas, data in nouns.items():
            phrase = data["name"]
            count = data["count"]
            # skip phrases that contain a line break or are shorter than four characters
            if "\n" in phrase:
                continue
            if len(phrase) < 4:
                continue
            phrases.append(Phrase(text=phrase, count=count, episode=episode))
        # insert the rows in batches of 100
        num_per_chunk = 100
        chunks = chunked(phrases, num_per_chunk)
        with alive_bar(len(phrases) // num_per_chunk + 1) as bar:
            for chunk in chunks:
                bar()
                Phrase.bulk_create(chunk)
    # mark the episode as processed
    episode.phrases_imported = True
    episode.save()