2020-08-08 15:26:13 +02:00
|
|
|
import os
|
2020-08-30 21:52:47 +02:00
|
|
|
from dataclasses import dataclass
|
|
|
|
from typing import Dict
|
2020-03-07 10:45:39 +01:00
|
|
|
|
2021-01-18 18:39:40 +01:00
|
|
|
import en_core_web_md
|
2020-03-17 12:51:08 +01:00
|
|
|
from alive_progress import alive_bar
|
2020-08-11 21:19:36 +02:00
|
|
|
from peewee import chunked
|
2021-01-18 18:39:40 +01:00
|
|
|
from spacy.lang.en import Language
|
2020-03-07 10:45:39 +01:00
|
|
|
from spacy.tokens.span import Span
|
|
|
|
from spacy.tokens.token import Token
|
|
|
|
|
|
|
|
from models import Episode, Line, db, Phrase
|
|
|
|
from stopwords import STOP_WORDS
|
|
|
|
|
2020-08-08 15:26:13 +02:00
|
|
|
# Lower this process's CPU scheduling priority so the long-running NLP
# parsing below doesn't starve other processes on the machine.
os.nice(15)
|
2020-08-30 21:52:47 +02:00
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Noun:
    """A counted noun phrase: its display text and occurrence count."""

    # Surface form of the first occurrence of this noun chunk.
    name: str
    # Number of occurrences seen so far; starts at 1 for the creating hit.
    count: int = 1
|
|
|
|
|
|
|
|
|
|
|
|
# Maps a noun-chunk's surface text to its "|"-joined lowercase lemma key.
# Kept at module level so the cache is shared across all episodes — the same
# chunks recur between transcripts.
lemma_cache: Dict[str, str] = {}

# Medium English model; NER and text classification are disabled because only
# the tagger/parser output (noun chunks and lemmas) is used below.
nlp: Language = en_core_web_md.load(disable=["ner", "textcat"])
# Replace spaCy's default stop-word set with the project's own list.
nlp.Defaults.stop_words = STOP_WORDS
|
2020-08-13 15:08:39 +02:00
|
|
|
# Process every episode whose transcript is imported but whose noun phrases
# are not yet extracted.  NOTE: peewee overloads `==` to build SQL, so the
# `== False` / `== True` comparisons are required query syntax and must not
# be rewritten as `is` / truthiness checks.
for episode in Episode.select().where(
        (Episode.phrases_imported == False) & (Episode.text_imported == True)).order_by(
        Episode.id):
    print(episode.video_number, episode.title)

    # --- 1. Stitch the episode's lines into one text blob. ---
    # Consecutive lines by the same speaker are joined with a space; a speaker
    # change starts a new paragraph ("\n") so noun chunks cannot span speakers.
    # A single ordered query serves both the progress-bar count and the loop
    # (the original issued two separate, inconsistently ordered queries).
    line_select = Line.select().where(Line.episode == episode).order_by(Line.order)
    person = None
    parts = []
    with alive_bar(line_select.count(), title='Parsing lines') as bar:
        for line in line_select:
            bar()
            if line.person == person:
                parts.append(" " + line.text)
            else:
                person = line.person
                parts.append("\n" + line.text)
    # join() instead of repeated `text +=` — linear, not quadratic.
    text = "".join(parts)

    # Strip quoting/markup characters that would confuse the parser.
    for fragment in ["\"", "--", "(", ")", "[", "]"]:
        text = text.replace(fragment, "")

    print("run nlp")
    doc = nlp(text)
    print("nlp finished")

    # --- 2. Lemmatize each noun chunk and count occurrences. ---
    # Chunks sharing the same lemma key are counted together; the first
    # surface form seen becomes the display name.
    nouns: Dict[str, Noun] = {}
    chunk: Span
    noun_chunks = list(doc.noun_chunks)
    with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
        for chunk in noun_chunks:
            bar()
            noun_chunk = str(chunk).strip()
            if noun_chunk in lemma_cache:
                lemmas = lemma_cache[noun_chunk]
            else:
                # Re-parse the chunk in isolation to get its lemmas; cache the
                # result because the same chunks recur constantly.
                lemmas = "|".join(token.lemma_ for token in nlp(noun_chunk)).lower()
                lemma_cache[noun_chunk] = lemmas
            if lemmas in nouns:
                nouns[lemmas].count += 1
            else:
                nouns[lemmas] = Noun(noun_chunk)

    # --- 3. Persist the counted phrases in one transaction. ---
    with db.atomic():
        phrases = []
        for lemmas, data in nouns.items():
            # Skip chunks that crossed a speaker boundary ("\n" survived the
            # edge strip) and phrases too short to be meaningful.
            if "\n" in data.name or len(data.name) < 4:
                continue
            phrases.append(Phrase(text=data.name, count=data.count, episode=episode))

        # Insert in batches.  Ceiling division gives the exact batch count;
        # the previous `len(phrases) // n + 1` overshot by one whenever the
        # phrase count was an exact multiple of the batch size.
        num_per_chunk = 100
        num_batches = -(-len(phrases) // num_per_chunk)
        with alive_bar(num_batches) as bar:
            # `batch`, not `chunk`, to avoid shadowing the Span loop variable.
            for batch in chunked(phrases, num_per_chunk):
                bar()
                Phrase.bulk_create(batch)

        episode.phrases_imported = True
        episode.save()
|