2020-08-08 15:26:13 +02:00
|
|
|
import os
|
2020-08-30 21:52:47 +02:00
|
|
|
from dataclasses import dataclass
|
|
|
|
from typing import Dict
|
2020-03-07 10:45:39 +01:00
|
|
|
|
2021-01-18 18:39:40 +01:00
|
|
|
import en_core_web_md
|
2020-03-17 12:51:08 +01:00
|
|
|
from alive_progress import alive_bar
|
2020-08-11 21:19:36 +02:00
|
|
|
from peewee import chunked
|
2021-01-18 18:39:40 +01:00
|
|
|
from spacy.lang.en import Language
|
2020-03-07 10:45:39 +01:00
|
|
|
from spacy.tokens.span import Span
|
|
|
|
from spacy.tokens.token import Token
|
|
|
|
|
|
|
|
from models import Episode, Line, db, Phrase
|
|
|
|
from stopwords import STOP_WORDS
|
|
|
|
|
2020-08-08 15:26:13 +02:00
|
|
|
# Lower this process's CPU scheduling priority so the long-running NLP
# parsing below doesn't starve other processes on the machine.
os.nice(15)
|
2020-08-30 21:52:47 +02:00
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Noun:
    """A counted noun phrase: its display text and occurrence count."""

    # Surface form of the first occurrence of this noun chunk.
    name: str
    # Number of occurrences seen so far; starts at 1 for the creating hit.
    count: int = 1
|
|
|
|
|
|
|
|
|
|
|
|
# Maps a noun-chunk's surface text to its "|"-joined lowercase lemma key.
# Kept at module level so the cache is shared across all episodes — the same
# chunks recur between transcripts.
lemma_cache: Dict[str, str] = {}

# Medium English model; NER and text classification are disabled because only
# the tagger/parser output (noun chunks and lemmas) is used below.
nlp: Language = en_core_web_md.load(disable=["ner", "textcat"])
# Replace spaCy's default stop-word set with the project's own list.
nlp.Defaults.stop_words = STOP_WORDS
|
2020-08-13 15:08:39 +02:00
|
|
|
# Process every episode whose transcript is imported but whose noun phrases
# are not yet extracted.  NOTE: peewee overloads `==` to build SQL, so the
# `== False` / `== True` comparisons are required query syntax and must not
# be rewritten as `is` / truthiness checks.
for episode in Episode.select().where(
        (Episode.phrases_imported == False) & (Episode.text_imported == True)).order_by(
        Episode.id):
    print(episode.video_number, episode.title)

    # --- 1. Stitch the episode's lines into one text blob. ---
    # Consecutive lines by the same speaker are joined with a space; a speaker
    # change starts a new paragraph ("\n") so noun chunks cannot span speakers.
    # A single ordered query serves both the progress-bar count and the loop
    # (the original issued two separate, inconsistently ordered queries).
    line_select = Line.select().where(Line.episode == episode).order_by(Line.order)
    person = None
    parts = []
    with alive_bar(line_select.count(), title='Parsing lines') as bar:
        for line in line_select:
            bar()
            if line.person == person:
                parts.append(" " + line.text)
            else:
                person = line.person
                parts.append("\n" + line.text)
    # join() instead of repeated `text +=` — linear, not quadratic.
    text = "".join(parts)

    # Strip quoting/markup characters that would confuse the parser.
    for fragment in ["\"", "--", "(", ")", "[", "]"]:
        text = text.replace(fragment, "")

    print("run nlp")
    doc = nlp(text)
    print("nlp finished")

    # --- 2. Lemmatize each noun chunk and count occurrences. ---
    # Chunks sharing the same lemma key are counted together; the first
    # surface form seen becomes the display name.
    nouns: Dict[str, Noun] = {}
    chunk: Span
    noun_chunks = list(doc.noun_chunks)
    with alive_bar(len(noun_chunks), title='lemmatizing and counting') as bar:
        for chunk in noun_chunks:
            bar()
            noun_chunk = str(chunk).strip()
            if noun_chunk in lemma_cache:
                lemmas = lemma_cache[noun_chunk]
            else:
                # Re-parse the chunk in isolation to get its lemmas; cache the
                # result because the same chunks recur constantly.
                lemmas = "|".join(token.lemma_ for token in nlp(noun_chunk)).lower()
                lemma_cache[noun_chunk] = lemmas
            if lemmas in nouns:
                nouns[lemmas].count += 1
            else:
                nouns[lemmas] = Noun(noun_chunk)

    # --- 3. Persist the counted phrases in one transaction. ---
    with db.atomic():
        phrases = []
        for lemmas, data in nouns.items():
            # Skip chunks that crossed a speaker boundary ("\n" survived the
            # edge strip) and phrases too short to be meaningful.
            if "\n" in data.name or len(data.name) < 4:
                continue
            phrases.append(Phrase(text=data.name, count=data.count, episode=episode))

        # Insert in batches.  Ceiling division gives the exact batch count;
        # the previous `len(phrases) // n + 1` overshot by one whenever the
        # phrase count was an exact multiple of the batch size.
        num_per_chunk = 100
        num_batches = -(-len(phrases) // num_per_chunk)
        with alive_bar(num_batches) as bar:
            # `batch`, not `chunk`, to avoid shadowing the Span loop variable.
            for batch in chunked(phrases, num_per_chunk):
                bar()
                Phrase.bulk_create(batch)

        episode.phrases_imported = True
        episode.save()
|