1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-19 15:23:44 +02:00
cr-search/import.py

72 lines
2.9 KiB
Python
Raw Normal View History

2020-03-08 18:48:14 +01:00
import re
from html import unescape
from peewee import fn
from srt import parse
from models import Person, Line, Episode, db
from typo import fix_typo
from utils import td_to_milliseconds, get_filename
def is_invalid_name(name: str) -> bool:
    """Return True if *name* contains a marker that rules it out as a speaker.

    The check is a case-insensitive substring test against a fixed list of
    markers, so any text merely containing one of them (for example the
    letters "ps" or "ii") is rejected as well.

    Args:
        name: The text found before the first colon of a subtitle line.

    Returns:
        True when any marker occurs in ``name`` (ignoring case), else False.
    """
    # Lower-case once instead of on every comparison.
    lowered = name.lower()
    return any(
        marker.lower() in lowered
        for marker in ("PS", "P.S.", "II", "The US", "Metal Gear", "D&D")
    )
def main():
    """Import subtitle text for every episode not yet marked as imported.

    For campaigns 1 and 2, each ``Episode`` row with ``text_imported == False``
    has its subtitle file parsed and one ``Line`` row saved per subtitle entry.
    A leading ``NAME:`` prefix is turned into a ``Person`` (created on demand);
    lines without a prefix keep the speaker of the preceding line.

    Side effects: writes database rows, overwrites ``names.txt`` in the current
    working directory, and prints progress to stdout.

    NOTE(review): the indentation of this file was lost in the copy under
    review; the nesting below is a reconstruction and should be confirmed
    against the repository, especially the ``else`` branch marked further down.
    """
    # Every individual speaker name seen so far, across all episodes.
    all_people = set()
    for campaign in range(1, 3):  # campaigns 1 and 2
        for episode in Episode.select().where((Episode.text_imported == False) & (Episode.season == campaign)):
            # Rewrite names.txt with the names collected from the episodes
            # processed so far; names from the current episode are only added
            # further down, so they show up on the next iteration.
            with open("names.txt", "w") as f:
                f.write("\n".join(sorted(p for p in all_people if "\n" not in p)))
            file = get_filename(campaign, episode.video_number)
            text = file.read_text()
            subtitlelines = parse(text)
            print(episode.video_number, episode.episode_number)
            # Current speaker; carried over from one subtitle line to the next
            # until a new prefix, a note or a meta line resets it.
            person = None
            # One transaction per episode.
            with db.atomic():
                i = 0
                for line in subtitlelines:
                    i += 1
                    # Subtitle indices are expected to be consecutive from 1.
                    assert i == line.index
                    text = unescape(line.content)
                    dbline = Line()
                    if ":" in text:
                        name, resttext = text.split(":", maxsplit=1)
                        # Only treat the prefix as a speaker if it ends in an
                        # upper-case letter and is not on the exclusion list.
                        if name and name[-1].isupper() and not is_invalid_name(name):
                            people = []
                            name = name.lower()
                            # A prefix may list several speakers separated by
                            # ",", "&", "/", "and" or " an ".
                            for word in re.split('[,&/]|and| an ', name):
                                word = word.strip()
                                word = fix_typo(word).title()
                                word = word.strip()
                                if word:
                                    people.append(word)
                            all_people.update(people)
                            # A group of speakers is stored as one Person whose
                            # name is the comma-separated list.
                            formatted_name = ", ".join(people)
                            person, created = Person.get_or_create(name=formatted_name, season=campaign)
                            # Store the line without its speaker prefix.
                            text = resttext.strip()
                    # NOTE(review): assumed to pair with `if ":" in text`; it
                    # could instead belong to the inner speaker check - confirm.
                    else:
                        # "(...)" lines are flagged as notes and "[...]" lines
                        # as meta; neither is attributed to a speaker.
                        if text.startswith("(") and text.endswith(")"):
                            dbline.isnote = True
                            person = None
                        elif text.startswith("[") and text.endswith("]"):
                            dbline.ismeta = True
                            person = None
                    # Collapse multi-line subtitle entries into a single line.
                    text = text.replace("\n", " ")
                    dbline.text = text
                    # SQL to_tsvector('english', ...) expression, evaluated by
                    # the database when the row is saved.
                    dbline.search_text = fn.to_tsvector('english', text)
                    dbline.person = person
                    # presumably timedelta -> integer milliseconds; see utils
                    dbline.starttime = td_to_milliseconds(line.start)
                    dbline.endtime = td_to_milliseconds(line.end)
                    dbline.episode = episode
                    dbline.order = line.index
                    dbline.save()
# Run the import only when executed as a script, not when imported.
if __name__ == '__main__':
    main()