# cr-search/phrases.py

import os
from dataclasses import dataclass

import en_core_web_md
from alive_progress import alive_bar
from peewee import chunked
from spacy.lang.en import Language
from spacy.tokens.span import Span

from app import db
from models import Episode, Line, Phrase
from stopwords import STOP_WORDS
from utils import clear_cache

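# raise this process's niceness by 15 so the CPU-heavy parse runs at low priority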
os.nice(15)


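# a noun phrase as it first occurred in the text, plus how often it was seen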
@dataclass
class Noun:
    name: str
    count: int = 1


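# noun chunks repeat constantly, so cache their lemmatized forms across episodes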
lemma_cache: dict[str, str] = {}
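# NER and text classification are not needed for noun chunking, so disable them for speed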
nlp: Language = en_core_web_md.load(disable=["ner", "textcat"])
nlp.Defaults.stop_words = STOP_WORDS
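# process every episode whose transcript is imported but whose phrases are not yet extracted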
for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)).order_by(
        Episode.id):
    print(episode.video_number, episode.pretty_title)
    person = None
    text = ""
    line_select = Line.select().where(Line.episode == episode)
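    # concatenate the transcript, starting a new line whenever the speaker changes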
    with alive_bar(line_select.count(), title='Parsing lines') as bar:
        for line in line_select.order_by(Line.order):
            bar()
            if line.person == person:
                text += " " + line.text
            else:
                person = line.person
                text += "\n" + line.text
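    # remove quotes, brackets and dashes before parsing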
    delete = ["\"", "--", "(", ")", "[", "]"]
    for string in delete:
        text = text.replace(string, "")
    print("run nlp")
    doc = nlp(text)
print("nlp finished")
    nouns: dict[str, Noun] = {}
    chunk: Span
    noun_chunks = list(doc.noun_chunks)
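    # count noun chunks, merging chunks that share the same lemmatized form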
    with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
        for chunk in noun_chunks:
            bar()
            noun_chunk = str(chunk).strip()
            if noun_chunk in lemma_cache:
                lemmas = lemma_cache[noun_chunk]
            else:
lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)]).lower()
                lemma_cache[noun_chunk] = lemmas
            if lemmas not in nouns:
                nouns[lemmas] = Noun(noun_chunk)
            else:
                nouns[lemmas].count += 1
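    # write all phrases of this episode in a single transaction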
    with db.atomic():
        phrases = []
        for lemmas, data in nouns.items():
            # skip phrases that span a speaker change and very short ones
            if "\n" in data.name:
                continue
            if len(data.name) < 4:
                continue
            phrases.append(Phrase(text=data.name, count=data.count, episode=episode))
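        # insert the collected phrases in batches of 100 rows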
        num_per_chunk = 100
        chunks = chunked(phrases, num_per_chunk)
        with alive_bar(len(phrases) // num_per_chunk + 1, title="saving") as bar:
            for chunk in chunks:
                bar()
                Phrase.bulk_create(chunk)
        episode.phrases_imported = True
        episode.save()
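# clear the app's cache so searches pick up the newly imported phrases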
clear_cache()