# cr-search/main.py

from html import unescape

from peewee import fn
from srt import parse

from models import Person, Line, Episode, db
from typo import fix_typo
from utils import td_to_milliseconds, get_filename

# Campaign (stored as ``Episode.season``) whose subtitle files are imported.
campaign = 2

# Import the subtitle files of episodes 1..94 of the campaign. Each episode is
# written inside its own ``db.atomic()`` block.
for episode_nr in range(1, 95):
    file = get_filename(campaign, episode_nr)
    # NOTE(review): read_text() uses the platform default encoding here;
    # confirm the subtitle files match it (or pass an explicit encoding).
    subtitlelines = parse(file.read_text())
    print(episode_nr)
    # The current speaker carries over from one subtitle to the next until a
    # new "NAME:" prefix, a "(note)" or a "[meta]" line resets it.
    person = None
    episode = Episode.get(season=campaign, episode_number=episode_nr)
    with db.atomic():
        for expected_index, line in enumerate(subtitlelines, start=1):
            # Subtitles must be numbered 1, 2, 3, ... without gaps. This is an
            # explicit check rather than an ``assert`` so that it still runs
            # under ``python -O``.
            if line.index != expected_index:
                raise ValueError(
                    f"{file}: expected subtitle index {expected_index}, "
                    f"got {line.index}"
                )
            text = unescape(line.content)
            dbline = Line()

            # An all-uppercase prefix before the first colon names the speaker.
            speaker_found = False
            if ":" in text:
                name, resttext = text.split(":", maxsplit=1)
                if name.isupper():
                    formatted_name = fix_typo(name.strip()).title()
                    if formatted_name == "San":
                        # Debug aid: show the raw name that normalised to
                        # "San" (presumably a transcription typo -- confirm).
                        print(name.title())
                    person, _ = Person.get_or_create(name=formatted_name)
                    text = resttext.strip()
                    speaker_found = True

            # Lines without a speaker prefix may be "(notes)" or "[meta]"
            # markers. This is checked even when the line contains a colon
            # (e.g. "[break: 15 minutes]"), which would otherwise stay
            # attributed to the previous speaker.
            if not speaker_found:
                if text.startswith("(") and text.endswith(")"):
                    dbline.isnote = True
                    person = None
                elif text.startswith("[") and text.endswith("]"):
                    dbline.ismeta = True
                    person = None

            # Store multi-line subtitles as a single line of text.
            text = text.replace("\n", " ")
            dbline.text = text
            # SQL ``to_tsvector('english', text)`` expression, evaluated by
            # the database when the row is saved.
            dbline.search_text = fn.to_tsvector('english', text)
            dbline.person = person
            dbline.starttime = td_to_milliseconds(line.start)
            dbline.endtime = td_to_milliseconds(line.end)
            dbline.episode = episode
            dbline.order = line.index
            dbline.save()