# cr-search/phrases.py

import os
from dataclasses import dataclass

import en_core_web_md
from alive_progress import alive_bar
from peewee import chunked
from spacy.lang.en import Language
from spacy.tokens.span import Span

from app import db
from models import Episode, Line, Phrase
from stopwords import STOP_WORDS
from utils import clear_cache

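# raise this process's niceness by 15 so the CPU-heavy parse runs at low priority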
os.nice(15)


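# a noun phrase as it first occurred in the text, plus how often it was seen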
@dataclass
class Noun:
    name: str
    count: int = 1


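# noun chunks repeat constantly, so cache their lemmatized forms across episodes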
lemma_cache: dict[str, str] = {}
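# NER and text classification are not needed for noun chunking, so disable them for speed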
nlp: Language = en_core_web_md.load(disable=["ner", "textcat"])
nlp.Defaults.stop_words = STOP_WORDS
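# process every episode whose transcript is imported but whose phrases are not yet extracted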
for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)).order_by(
        Episode.id):
    print(episode.video_number, episode.pretty_title)
    person = None
    text = ""
    line_select = Line.select().where(Line.episode == episode)
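    # concatenate the transcript, starting a new line whenever the speaker changes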
    with alive_bar(line_select.count(), title='Parsing lines') as bar:
        for line in line_select.order_by(Line.order):
            bar()
            if line.person == person:
                text += " " + line.text
            else:
                person = line.person
                text += "\n" + line.text
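    # remove quotes, brackets and dashes before parsing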
    delete = ["\"", "--", "(", ")", "[", "]"]
    for string in delete:
        text = text.replace(string, "")
    print("run nlp")
    doc = nlp(text)
print("nlp finished")
    nouns: dict[str, Noun] = {}
    chunk: Span
    noun_chunks = list(doc.noun_chunks)
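    # count noun chunks, merging chunks that share the same lemmatized form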
    with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
        for chunk in noun_chunks:
            bar()
            noun_chunk = str(chunk).strip()
            if noun_chunk in lemma_cache:
                lemmas = lemma_cache[noun_chunk]
            else:
lemmas = "|".join([token.lemma_ for token in nlp(noun_chunk)]).lower()
                lemma_cache[noun_chunk] = lemmas
            if lemmas not in nouns:
                nouns[lemmas] = Noun(noun_chunk)
            else:
                nouns[lemmas].count += 1
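    # write all phrases of this episode in a single transaction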
    with db.atomic():
        phrases = []
        for lemmas, data in nouns.items():
            # skip phrases that span a speaker change and very short ones
            if "\n" in data.name:
                continue
            if len(data.name) < 4:
                continue
            phrases.append(Phrase(text=data.name, count=data.count, episode=episode))
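        # insert the collected phrases in batches of 100 rows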
        num_per_chunk = 100
        chunks = chunked(phrases, num_per_chunk)
        with alive_bar(len(phrases) // num_per_chunk + 1, title="saving") as bar:
            for chunk in chunks:
                bar()
                Phrase.bulk_create(chunk)
        episode.phrases_imported = True
        episode.save()
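# clear the app's cache so searches pick up the newly imported phrases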
clear_cache()