cr-search/phrases.py
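
"""Extract noun phrases from each imported episode transcript with spaCy and
store them as Phrase rows in the database."""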

import os
from collections import Counter
import spacy
from alive_progress import alive_bar
from peewee import chunked
from spacy.lang.en import English
from models import Episode, Line, db, Phrase
from stopwords import STOP_WORDS
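
# Lower this process's scheduling priority so the long NLP run stays in the background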
os.nice(15)
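
# Load the small English model; noun chunking only needs the tagger and parser,
# so NER and text classification are disabled for speed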
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
nlp.Defaults.stop_words = STOP_WORDS
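
# Process every episode whose transcript is imported but not yet phrase-indexed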
for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)):
    # Temporarily restricted to a single episode (looks like a debugging filter)
    if episode.id != 167:
        continue
    print(episode.video_number, episode.title)
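
    # Merge consecutive lines by the same speaker into one continuous text;
    # a speaker change starts a new block on its own line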
    person = None
    text = ""
    line_select = Line.select().where(Line.episode == episode)
    with alive_bar(line_select.count(), title='Parsing lines') as bar:
        for line in line_select:
            bar()
            if line.person == person:
                text += " " + line.text
            else:
                # keep this line's text too; the original dropped it on a speaker change
                person = line.person
                text += "\n" + line.text

    # Strip characters that tend to confuse the parser
    delete = ["\"", "--", "(", ")", "[", "]"]
    for string in delete:
        text = text.replace(string, "")
    print("run nlp")
    print(text)
    doc = nlp(text)
    print("nlp finished")

    # Collect all noun chunks and count how often each occurs.
    # (Collecting into a list matters: counting a set would give every phrase a count of 1.)
    nouns = []
    for chunk in doc.noun_chunks:
        nouns.append(chunk.text)
    cnt = Counter(nouns)
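
    # Write all phrases for this episode in one transaction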
    with db.atomic():
        phrases = []
        for phrase, count in cnt.items():
            # Skip chunks that span a speaker change, and very short ones
            if "\n" in phrase:
                continue
            if len(phrase) < 4:
                continue
            phrases.append(Phrase(text=phrase, count=count, episode=episode))

        # Insert in batches of 100 rows so each bulk INSERT stays small;
        # the bar total is the number of batches (ceiling division)
        num_per_chunk = 100
        chunks = chunked(phrases, num_per_chunk)
        with alive_bar((len(phrases) + num_per_chunk - 1) // num_per_chunk) as bar:
            for chunk in chunks:
                bar()
                Phrase.bulk_create(chunk)

        # Mark the episode as done so it is skipped on the next run
        episode.phrases_imported = True
        episode.save()