diff --git a/colors.py b/colors.py index 7df30b5..e63fe65 100644 --- a/colors.py +++ b/colors.py @@ -1,16 +1,6 @@ +from data import colors_c2 from models import Person -colors_c2 = { - "Laura": "#59c3f9", - "Marisha": "#00146e", - "Liam": "#fe8413", - "Taliesin": "#be1c0d", - "Ashley": "#868984", - "Sam": "#dae1dd", - "Travis": "#076708", - "Matt": "#471f0e" # random color -} - p: Person for p in Person.select(): print(p) diff --git a/config.sample.py b/config.sample.py index c5568c3..d1ab2c3 100644 --- a/config.sample.py +++ b/config.sample.py @@ -7,5 +7,3 @@ dbauth = { } sentryDSN = None - -skip_download = False diff --git a/createdb.py b/createdb.py index 656769a..73c3da8 100644 --- a/createdb.py +++ b/createdb.py @@ -1,6 +1,6 @@ from sys import argv -from models import db, Phrase, Episode, Person, Line +from models import db, Series, Phrase, Episode, Person, Line def confirm(message: str) -> None: @@ -14,8 +14,8 @@ mode = argv[1] if mode == "all": confirm("Delete all Data? ") - db.drop_tables([Episode, Person, Line, Phrase]) - db.create_tables([Episode, Person, Line, Phrase]) + db.drop_tables([Series, Episode, Person, Line, Phrase]) + db.create_tables([Series, Episode, Person, Line, Phrase]) elif mode == "phrases": confirm("Delete all Phrases? ") db.drop_tables([Phrase]) diff --git a/fetch.py b/fetch.py index 9ced4d6..fc0f2d0 100644 --- a/fetch.py +++ b/fetch.py @@ -2,25 +2,49 @@ import re from subprocess import run import youtube_dl +from peewee import DoesNotExist -import config -from models import Episode +from models import Episode, Series +# https://www.youtube.com/playlist?list= from utils import srtdir -campaign_playlists = { - 1: "https://www.youtube.com/playlist?list=PL1tiwbzkOjQz7D0l_eLJGAISVtcL7oRu_", - 2: "https://www.youtube.com/playlist?list=PL1tiwbzkOjQxD0jjAE7PsWoaCrs0EkBH2" -} +series_data = [ + { + "name": "Campaign 1", + "playlist_id": "PL1tiwbzkOjQz7D0l_eLJGAISVtcL7oRu_", + }, + { + "name": "Campaign 2", + "playlist_id": "PL1tiwbzkOjQxD0jjAE7PsWoaCrs0EkBH2" + }, + { + "name": "Handbooker Helper", + "playlist_id": "PL1tiwbzkOjQyr6-gqJ8r29j_rJkR49uDN", + "single_speaker": True + } +] def main(): - for campaign in range(1, 3): + for series in series_data: + name = series["name"] + playlist_id = series["playlist_id"] + is_campaign = "Campaign" in name + try: + s = Series.select().where(Series.title == name).get() + except DoesNotExist: + s = Series() + s.title = name + + s.is_campaign = is_campaign + s.single_speaker = "single_speaker" in series and series["single_speaker"] + s.save() ydl_opts = { 'extract_flat': True } with youtube_dl.YoutubeDL(ydl_opts) as ydl: - playlist = ydl.extract_info(campaign_playlists[campaign], download=False) + playlist = ydl.extract_info("https://www.youtube.com/playlist?list=" + playlist_id, download=False) videos = playlist["entries"] print(v["url"] for v in videos) @@ -33,30 +57,42 @@ def main(): regex = re.compile(r"Ep(?:is|si)ode (\d+)") for nr, video in enumerate(videos, 1): - vttfile = srtdir / f"C{campaign}E{nr}" - ydl_opts["outtmpl"] = str(vttfile) - if Episode.select().where((Episode.season == campaign) & (Episode.video_number == nr)).count() == 1: - print(f"already imported {vttfile}") - continue - e = Episode() - e.season = campaign - e.video_number = nr + # if Episode.select().where((Episode.season == campaign) & (Episode.video_number == nr)).count() == 1: + # print(f"already imported {vttfile}") + # continue try: - match = regex.search(video["title"]) - e.episode_number = int(match.group(1)) - except AttributeError: - if campaign == 1: # one-shots at the end of campaign 1 - e.episode_number = e.video_number - 3 - else: - raise + e = Episode.select().where((Episode.series == s) & (Episode.video_number == nr)).get() + except DoesNotExist: + e = Episode() + e.series = s + e.video_number = nr + e.title = video["title"] + if s.is_campaign: + try: + match = regex.search(video["title"]) + e.episode_number = int(match.group(1)) + except AttributeError: + if s.title == "Campaign 1": # one-shots at the end of campaign 1 + e.episode_number = e.video_number - 3 + else: + raise + else: + e.episode_number = e.video_number e.youtube_id = video["url"] e.save() - if config.skip_download: + vttfile = srtdir / str(e.id) + ydl_opts["outtmpl"] = str(vttfile) + if e.downloaded: continue with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download([f'https://www.youtube.com/watch?v={e.youtube_id}']) run(["ffmpeg", "-i", vttfile.with_suffix(".en.vtt"), vttfile.with_suffix(".srt")]) - vttfile.with_suffix(".en.vtt").unlink() + e.downloaded = True + try: + vttfile.with_suffix(".en.vtt").unlink() + except FileNotFoundError: + e.downloaded = False + e.save() if __name__ == '__main__': diff --git a/import.py b/import.py index be7e275..657eea9 100644 --- a/import.py +++ b/import.py @@ -5,9 +5,9 @@ from alive_progress import alive_bar from peewee import fn from srt import parse -from models import Person, Line, Episode, db +from models import Person, Line, Episode, db, Series from typo import fix_typo -from utils import td_to_milliseconds, get_filename +from utils import td_to_milliseconds, srtdir, episode_speaker def is_invalid_name(name: str) -> bool: @@ -19,14 +19,16 @@ def is_invalid_name(name: str) -> bool: def main(): all_people = set() - for campaign in range(1, 3): - for episode in Episode.select().where((Episode.text_imported == False) & (Episode.season == campaign)): + for series in Series.select(): + for episode in Episode.select().where( + (Episode.text_imported == False) & (Episode.series == series) & (Episode.downloaded) + ): with open("names.txt", "w") as f: f.write("\n".join(sorted(p for p in all_people if "\n" not in p))) - file = get_filename(campaign, episode.video_number) + file = srtdir / f"{episode.id}.srt" text = file.read_text() subtitlelines = list(parse(text)) - print(episode.video_number, episode.episode_number) + print(episode.video_number, episode.title) person = None with db.atomic(): with alive_bar(len(subtitlelines)) as bar: @@ -37,28 +39,32 @@ def main(): assert i == line.index text = unescape(line.content) dbline = Line() - if ":" in text: - name, resttext = text.split(":", maxsplit=1) - if name and name[-1].isupper() and not is_invalid_name(name): - people = [] - name = name.lower() - for word in re.split('[,&/]|and| an ', name): - word = word.strip() - word = fix_typo(word).title() - word = word.strip() - if word: - people.append(word) - all_people.update(people) - formatted_name = ", ".join(people) - person, created = Person.get_or_create(name=formatted_name, season=campaign) - text = resttext.strip() + if not series.single_speaker: + if ":" in text: + name, resttext = text.split(":", maxsplit=1) + if name and name[-1].isupper() and not is_invalid_name(name): + people = [] + name = name.lower() + for word in re.split('[,&/]|and| an ', name): + word = word.strip() + word = fix_typo(word).title() + word = word.strip() + if word: + people.append(word) + all_people.update(people) + formatted_name = ", ".join(people) + person, created = Person.get_or_create(name=formatted_name, series=series) + text = resttext.strip() + else: + if text.startswith("(") and text.endswith(")"): + dbline.isnote = True + person = None + elif text.startswith("[") and text.endswith("]"): + dbline.ismeta = True + person = None else: - if text.startswith("(") and text.endswith(")"): - dbline.isnote = True - person = None - elif text.startswith("[") and text.endswith("]"): - dbline.ismeta = True - person = None + person_name = episode_speaker(series.title, episode.video_number) + person,created = Person.get_or_create(name=person_name, series=series) text = text.replace("\n", " ") dbline.text = text dbline.search_text = fn.to_tsvector('english', text) diff --git a/models.py b/models.py index 5b8e584..e7c5624 100644 --- a/models.py +++ b/models.py @@ -12,16 +12,24 @@ class BaseModel(Model): database = db +class Series(BaseModel): + title = CharField(max_length=100) + is_campaign = BooleanField() + single_speaker = BooleanField() + + class Episode(BaseModel): - season = IntegerField() + series = ForeignKeyField(Series, backref="episodes") episode_number = IntegerField() video_number = IntegerField() youtube_id = CharField(max_length=11) + title = CharField(max_length=100) + downloaded = BooleanField(default=False) text_imported = BooleanField(default=False) phrases_imported = BooleanField(default=False) class Meta: - indexes = ((("season", "video_number"), True),) + indexes = ((("series", "video_number"), True),) @property def name(self) -> str: @@ -31,9 +39,10 @@ class Episode(BaseModel): class Person(BaseModel): name = CharField() color = CharField(null=True) - season = IntegerField() + series = ForeignKeyField(Series) + class Meta: - indexes = ((("name", "season"), True),) + indexes = ((("name", "series"), True),) FULL_TEXT_SEARCH = '''SELECT id, text, ts_rank_cd(search_text, query) AS rank diff --git a/phrases.py b/phrases.py index 86e558d..b6299cf 100644 --- a/phrases.py +++ b/phrases.py @@ -12,7 +12,7 @@ from stopwords import STOP_WORDS nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"]) nlp.Defaults.stop_words = STOP_WORDS for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)): - print(f"Campaign {episode.season} Episode {episode.episode_number}") + print(episode.video_number, episode.title) person = None text = "" line_select = Line.select().where(Line.episode == episode) diff --git a/server.py b/server.py index fc7d622..1e9ad7b 100644 --- a/server.py +++ b/server.py @@ -19,20 +19,20 @@ def add_cors(response): return response -global_excludes = [Line.search_text, Episode.phrases_imported, Episode.text_imported] +global_excludes = [Line.search_text, Episode.phrases_imported, Episode.text_imported, Person.series] @app.route("/api/suggest") def question(): query: str = request.args.get('query') until = request.args.get('until') - season = request.args.get('season') - if not query or not until or not season: + series = request.args.get('series') + if not query or not until or not series: return "no suggest query", 400 if len(query) > 50: return "too long query", 400 phrases = Phrase.select(Phrase.text, Alias(fn.SUM(Phrase.count), "total_count")).join(Episode).where( - (Episode.season == season) & + (Episode.series == series) & (Episode.episode_number <= until) & (Phrase.text.contains(query)) ).group_by(Phrase.text).order_by(SQL("total_count DESC")).limit(10) @@ -43,8 +43,8 @@ def question(): def search(): query = request.args.get('query') until = request.args.get('until') - season = request.args.get('season') - if not query or not until or not season: + series = request.args.get('series') + if not query or not until or not series: return "no suggest query", 400 if len(query) > 50: return "too long query", 400 @@ -56,7 +56,7 @@ def search(): & (Episode.episode_number <= until) & - (Episode.season == season) + (Episode.series == series) ).order_by(SQL("rank DESC")).join(Person).switch(Line).join(Episode).limit(20) if len(results) == 0: diff --git a/utils.py b/utils.py index a70b798..dff7022 100644 --- a/utils.py +++ b/utils.py @@ -1,5 +1,8 @@ from datetime import timedelta from pathlib import Path +from typing import Optional + +from data import single_speaker srtdir = Path("./data/subtitles/") @@ -12,5 +15,8 @@ def milliseconds_to_td(ms: int) -> timedelta: return timedelta(milliseconds=ms) -def get_filename(campaign: int, episode: int) -> Path: - return srtdir / f"C{campaign}E{episode}.srt" +def episode_speaker(series_title: str, episode: int) -> Optional[str]: + series = single_speaker[series_title] + if episode in series: + return series[episode] + return None diff --git a/web/src/interfaces.ts b/web/src/interfaces.ts index 9874a6f..2b7ecb0 100644 --- a/web/src/interfaces.ts +++ b/web/src/interfaces.ts @@ -4,10 +4,18 @@ export interface Person { "color": string; } +export interface Series { + "id": number; + "is_campaign": boolean; + "title": string; +} + export interface Episode { "episode_number": number; "id": number; - "season": number; + "series": Series; + "title": string; + "video_number": number; "youtube_id": string; } @@ -34,3 +42,13 @@ export interface ServerMessage { status: string; message: string; } + +export interface SeriesNames { + "id": number; + "title": string; +} + +export interface ServerData { + "series": SeriesNames[]; + +} diff --git a/web/src/router.ts b/web/src/router.ts index 4d564e3..7170728 100644 --- a/web/src/router.ts +++ b/web/src/router.ts @@ -21,7 +21,7 @@ export default new Router({ redirect: "/search/2/10/", }, { - path: "/search/:season/:episode/:keyword?", + path: "/search/:series/:episode/:keyword?", name: "search", component: Home, // props: true, diff --git a/web/src/views/Home.vue b/web/src/views/Home.vue index 73cbb5f..b605865 100644 --- a/web/src/views/Home.vue +++ b/web/src/views/Home.vue @@ -78,15 +78,16 @@ class="form-control" type="number" v-model="episode" min="1" max="300"> in - + {{error.message}}
-
{{episodeName(firstLine(result))}} {{formatTimestamp(firstLine(result).starttime)}}
+
{{formatTimestamp(firstLine(result).starttime)}} {{episodeName(firstLine(result))}}