# cr-search/phrases.py
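"""
Build a per-episode table of noun phrases from imported transcripts.

For every episode whose text has been imported but whose phrases have not,
this script joins consecutive lines by the same speaker, parses the result
with spaCy, extracts noun chunks (minus stop words), counts them, and
stores the counts as Phrase rows.
"""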
from collections import Counter

import spacy
from alive_progress import alive_bar
from spacy.lang.en import English
from spacy.tokens.span import Span
from spacy.tokens.token import Token

from models import Episode, Line, db, Phrase
from stopwords import STOP_WORDS

# Load the small English model; NER and text classification are disabled
# because only the tagger/parser (for noun chunks) is needed here.
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
nlp.Defaults.stop_words = STOP_WORDS
# Process every episode whose transcript is imported but not yet phrase-indexed.
for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)):
    print(f"Campaign {episode.season} Episode {episode.episode_number}")
    person = None
    text = ""
    line_select = Line.select().where(Line.episode == episode)
    with alive_bar(line_select.count(), title='Parsing lines') as bar:
        for line in line_select:
            bar()
            if line.person == person:
                # Same speaker: join consecutive lines into one utterance.
                text += " " + line.text
            else:
                # New speaker: start a fresh paragraph. The line's own text
                # is kept here, which the original version silently dropped.
                person = line.person
                text += "\n" + line.text
    # Remove characters that only add noise to the parse.
    delete = ["\"", "--", "(", ")", "[", "]"]
    for string in delete:
        text = text.replace(string, "")
    print("run nlp")
    doc = nlp(text)
    # Collect noun chunks, dropping stop words from each chunk. A list (not a
    # set) is used so that Counter sees every occurrence; with a set, every
    # phrase would be counted exactly once and the count column would be
    # meaningless.
    nouns = []
    span: Span
    for span in doc.noun_chunks:
        tok: Token
        # e.g. "the old wooden door" becomes "old wooden door" once the
        # stop word is filtered out.
        noun_chunk = "".join([tok.text_with_ws for tok in span if not tok.is_stop]).strip()
        nouns.append(noun_chunk)
    cnt = Counter(nouns)
    # Write all phrases for this episode in a single transaction.
    with db.atomic():
        with alive_bar(len(cnt), title='inserting phrases') as bar:
            for phrase, count in cnt.items():
                bar()
                # Skip chunks that span a speaker change or are too short to
                # be useful search phrases.
                if "\n" in phrase:
                    continue
                if len(phrase) < 4:
                    continue
                Phrase.create(text=phrase, count=count, episode=episode)
    # Mark the episode so it is skipped on the next run.
    episode.phrases_imported = True
    episode.save()
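
# As a quick sanity check after a run, the stored phrases can be inspected;
# a minimal sketch, assuming the peewee models imported above:
#
#     top = (Phrase.select()
#            .where(Phrase.episode == episode)
#            .order_by(Phrase.count.desc())
#            .limit(10))
#     for p in top:
#         print(p.text, p.count)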