1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-19 15:23:44 +02:00

generalise data model to series

This commit is contained in:
Lukas Winkler 2020-04-15 18:11:45 +02:00
parent 3cd78c3514
commit df3c1b62eb
Signed by: lukas
GPG key ID: 54DE4D798D244853
12 changed files with 165 additions and 97 deletions

View file

@ -1,16 +1,6 @@
from data import colors_c2
from models import Person from models import Person
colors_c2 = {
"Laura": "#59c3f9",
"Marisha": "#00146e",
"Liam": "#fe8413",
"Taliesin": "#be1c0d",
"Ashley": "#868984",
"Sam": "#dae1dd",
"Travis": "#076708",
"Matt": "#471f0e" # random color
}
p: Person p: Person
for p in Person.select(): for p in Person.select():
print(p) print(p)

View file

@ -7,5 +7,3 @@ dbauth = {
} }
sentryDSN = None sentryDSN = None
skip_download = False

View file

@ -1,6 +1,6 @@
from sys import argv from sys import argv
from models import db, Phrase, Episode, Person, Line from models import db, Series, Phrase, Episode, Person, Line
def confirm(message: str) -> None: def confirm(message: str) -> None:
@ -14,8 +14,8 @@ mode = argv[1]
if mode == "all": if mode == "all":
confirm("Delete all Data? ") confirm("Delete all Data? ")
db.drop_tables([Episode, Person, Line, Phrase]) db.drop_tables([Series, Episode, Person, Line, Phrase])
db.create_tables([Episode, Person, Line, Phrase]) db.create_tables([Series, Episode, Person, Line, Phrase])
elif mode == "phrases": elif mode == "phrases":
confirm("Delete all Phrases? ") confirm("Delete all Phrases? ")
db.drop_tables([Phrase]) db.drop_tables([Phrase])

View file

@ -2,25 +2,49 @@ import re
from subprocess import run from subprocess import run
import youtube_dl import youtube_dl
from peewee import DoesNotExist
import config from models import Episode, Series
from models import Episode # https://www.youtube.com/playlist?list=
from utils import srtdir from utils import srtdir
campaign_playlists = { series_data = [
1: "https://www.youtube.com/playlist?list=PL1tiwbzkOjQz7D0l_eLJGAISVtcL7oRu_", {
2: "https://www.youtube.com/playlist?list=PL1tiwbzkOjQxD0jjAE7PsWoaCrs0EkBH2" "name": "Campaign 1",
} "playlist_id": "PL1tiwbzkOjQz7D0l_eLJGAISVtcL7oRu_",
},
{
"name": "Campaign 2",
"playlist_id": "PL1tiwbzkOjQxD0jjAE7PsWoaCrs0EkBH2"
},
{
"name": "Handbooker Helper",
"playlist_id": "PL1tiwbzkOjQyr6-gqJ8r29j_rJkR49uDN",
"single_speaker": True
}
]
def main(): def main():
for campaign in range(1, 3): for series in series_data:
name = series["name"]
playlist_id = series["playlist_id"]
is_campaign = "Campaign" in name
try:
s = Series.select().where(Series.title == name).get()
except DoesNotExist:
s = Series()
s.title = name
s.is_campaign = is_campaign
s.single_speaker = "single_speaker" in series and series["single_speaker"]
s.save()
ydl_opts = { ydl_opts = {
'extract_flat': True 'extract_flat': True
} }
with youtube_dl.YoutubeDL(ydl_opts) as ydl: with youtube_dl.YoutubeDL(ydl_opts) as ydl:
playlist = ydl.extract_info(campaign_playlists[campaign], download=False) playlist = ydl.extract_info("https://www.youtube.com/playlist?list=" + playlist_id, download=False)
videos = playlist["entries"] videos = playlist["entries"]
print(v["url"] for v in videos) print(v["url"] for v in videos)
@ -33,30 +57,42 @@ def main():
regex = re.compile(r"Ep(?:is|si)ode (\d+)") regex = re.compile(r"Ep(?:is|si)ode (\d+)")
for nr, video in enumerate(videos, 1): for nr, video in enumerate(videos, 1):
vttfile = srtdir / f"C{campaign}E{nr}" # if Episode.select().where((Episode.season == campaign) & (Episode.video_number == nr)).count() == 1:
ydl_opts["outtmpl"] = str(vttfile) # print(f"already imported {vttfile}")
if Episode.select().where((Episode.season == campaign) & (Episode.video_number == nr)).count() == 1: # continue
print(f"already imported {vttfile}")
continue
e = Episode()
e.season = campaign
e.video_number = nr
try: try:
match = regex.search(video["title"]) e = Episode.select().where((Episode.series == s) & (Episode.video_number == nr)).get()
e.episode_number = int(match.group(1)) except DoesNotExist:
except AttributeError: e = Episode()
if campaign == 1: # one-shots at the end of campaign 1 e.series = s
e.episode_number = e.video_number - 3 e.video_number = nr
else: e.title = video["title"]
raise if s.is_campaign:
try:
match = regex.search(video["title"])
e.episode_number = int(match.group(1))
except AttributeError:
if s.title == "Campaign 1": # one-shots at the end of campaign 1
e.episode_number = e.video_number - 3
else:
raise
else:
e.episode_number = e.video_number
e.youtube_id = video["url"] e.youtube_id = video["url"]
e.save() e.save()
if config.skip_download: vttfile = srtdir / str(e.id)
ydl_opts["outtmpl"] = str(vttfile)
if e.downloaded:
continue continue
with youtube_dl.YoutubeDL(ydl_opts) as ydl: with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([f'https://www.youtube.com/watch?v={e.youtube_id}']) ydl.download([f'https://www.youtube.com/watch?v={e.youtube_id}'])
run(["ffmpeg", "-i", vttfile.with_suffix(".en.vtt"), vttfile.with_suffix(".srt")]) run(["ffmpeg", "-i", vttfile.with_suffix(".en.vtt"), vttfile.with_suffix(".srt")])
vttfile.with_suffix(".en.vtt").unlink() e.downloaded = True
try:
vttfile.with_suffix(".en.vtt").unlink()
except FileNotFoundError:
e.downloaded = False
e.save()
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -5,9 +5,9 @@ from alive_progress import alive_bar
from peewee import fn from peewee import fn
from srt import parse from srt import parse
from models import Person, Line, Episode, db from models import Person, Line, Episode, db, Series
from typo import fix_typo from typo import fix_typo
from utils import td_to_milliseconds, get_filename from utils import td_to_milliseconds, srtdir, episode_speaker
def is_invalid_name(name: str) -> bool: def is_invalid_name(name: str) -> bool:
@ -19,14 +19,16 @@ def is_invalid_name(name: str) -> bool:
def main(): def main():
all_people = set() all_people = set()
for campaign in range(1, 3): for series in Series.select():
for episode in Episode.select().where((Episode.text_imported == False) & (Episode.season == campaign)): for episode in Episode.select().where(
(Episode.text_imported == False) & (Episode.series == series) & (Episode.downloaded)
):
with open("names.txt", "w") as f: with open("names.txt", "w") as f:
f.write("\n".join(sorted(p for p in all_people if "\n" not in p))) f.write("\n".join(sorted(p for p in all_people if "\n" not in p)))
file = get_filename(campaign, episode.video_number) file = srtdir / f"{episode.id}.srt"
text = file.read_text() text = file.read_text()
subtitlelines = list(parse(text)) subtitlelines = list(parse(text))
print(episode.video_number, episode.episode_number) print(episode.video_number, episode.title)
person = None person = None
with db.atomic(): with db.atomic():
with alive_bar(len(subtitlelines)) as bar: with alive_bar(len(subtitlelines)) as bar:
@ -37,28 +39,32 @@ def main():
assert i == line.index assert i == line.index
text = unescape(line.content) text = unescape(line.content)
dbline = Line() dbline = Line()
if ":" in text: if not series.single_speaker:
name, resttext = text.split(":", maxsplit=1) if ":" in text:
if name and name[-1].isupper() and not is_invalid_name(name): name, resttext = text.split(":", maxsplit=1)
people = [] if name and name[-1].isupper() and not is_invalid_name(name):
name = name.lower() people = []
for word in re.split('[,&/]|and| an ', name): name = name.lower()
word = word.strip() for word in re.split('[,&/]|and| an ', name):
word = fix_typo(word).title() word = word.strip()
word = word.strip() word = fix_typo(word).title()
if word: word = word.strip()
people.append(word) if word:
all_people.update(people) people.append(word)
formatted_name = ", ".join(people) all_people.update(people)
person, created = Person.get_or_create(name=formatted_name, season=campaign) formatted_name = ", ".join(people)
text = resttext.strip() person, created = Person.get_or_create(name=formatted_name, series=series)
text = resttext.strip()
else:
if text.startswith("(") and text.endswith(")"):
dbline.isnote = True
person = None
elif text.startswith("[") and text.endswith("]"):
dbline.ismeta = True
person = None
else: else:
if text.startswith("(") and text.endswith(")"): person_name = episode_speaker(series.title, episode.video_number)
dbline.isnote = True person,created = Person.get_or_create(name=person_name, series=series)
person = None
elif text.startswith("[") and text.endswith("]"):
dbline.ismeta = True
person = None
text = text.replace("\n", " ") text = text.replace("\n", " ")
dbline.text = text dbline.text = text
dbline.search_text = fn.to_tsvector('english', text) dbline.search_text = fn.to_tsvector('english', text)

View file

@ -12,16 +12,24 @@ class BaseModel(Model):
database = db database = db
class Series(BaseModel):
title = CharField(max_length=100)
is_campaign = BooleanField()
single_speaker = BooleanField()
class Episode(BaseModel): class Episode(BaseModel):
season = IntegerField() series = ForeignKeyField(Series, backref="episodes")
episode_number = IntegerField() episode_number = IntegerField()
video_number = IntegerField() video_number = IntegerField()
youtube_id = CharField(max_length=11) youtube_id = CharField(max_length=11)
title = CharField(max_length=100)
downloaded = BooleanField(default=False)
text_imported = BooleanField(default=False) text_imported = BooleanField(default=False)
phrases_imported = BooleanField(default=False) phrases_imported = BooleanField(default=False)
class Meta: class Meta:
indexes = ((("season", "video_number"), True),) indexes = ((("series", "video_number"), True),)
@property @property
def name(self) -> str: def name(self) -> str:
@ -31,9 +39,10 @@ class Episode(BaseModel):
class Person(BaseModel): class Person(BaseModel):
name = CharField() name = CharField()
color = CharField(null=True) color = CharField(null=True)
season = IntegerField() series = ForeignKeyField(Series)
class Meta: class Meta:
indexes = ((("name", "season"), True),) indexes = ((("name", "series"), True),)
FULL_TEXT_SEARCH = '''SELECT id, text, ts_rank_cd(search_text, query) AS rank FULL_TEXT_SEARCH = '''SELECT id, text, ts_rank_cd(search_text, query) AS rank

View file

@ -12,7 +12,7 @@ from stopwords import STOP_WORDS
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"]) nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
nlp.Defaults.stop_words = STOP_WORDS nlp.Defaults.stop_words = STOP_WORDS
for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)): for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)):
print(f"Campaign {episode.season} Episode {episode.episode_number}") print(episode.video_number, episode.title)
person = None person = None
text = "" text = ""
line_select = Line.select().where(Line.episode == episode) line_select = Line.select().where(Line.episode == episode)

View file

@ -19,20 +19,20 @@ def add_cors(response):
return response return response
global_excludes = [Line.search_text, Episode.phrases_imported, Episode.text_imported] global_excludes = [Line.search_text, Episode.phrases_imported, Episode.text_imported, Person.series]
@app.route("/api/suggest") @app.route("/api/suggest")
def question(): def question():
query: str = request.args.get('query') query: str = request.args.get('query')
until = request.args.get('until') until = request.args.get('until')
season = request.args.get('season') series = request.args.get('series')
if not query or not until or not season: if not query or not until or not series:
return "no suggest query", 400 return "no suggest query", 400
if len(query) > 50: if len(query) > 50:
return "too long query", 400 return "too long query", 400
phrases = Phrase.select(Phrase.text, Alias(fn.SUM(Phrase.count), "total_count")).join(Episode).where( phrases = Phrase.select(Phrase.text, Alias(fn.SUM(Phrase.count), "total_count")).join(Episode).where(
(Episode.season == season) & (Episode.series == series) &
(Episode.episode_number <= until) & (Episode.episode_number <= until) &
(Phrase.text.contains(query)) (Phrase.text.contains(query))
).group_by(Phrase.text).order_by(SQL("total_count DESC")).limit(10) ).group_by(Phrase.text).order_by(SQL("total_count DESC")).limit(10)
@ -43,8 +43,8 @@ def question():
def search(): def search():
query = request.args.get('query') query = request.args.get('query')
until = request.args.get('until') until = request.args.get('until')
season = request.args.get('season') series = request.args.get('series')
if not query or not until or not season: if not query or not until or not series:
return "no suggest query", 400 return "no suggest query", 400
if len(query) > 50: if len(query) > 50:
return "too long query", 400 return "too long query", 400
@ -56,7 +56,7 @@ def search():
& &
(Episode.episode_number <= until) (Episode.episode_number <= until)
& &
(Episode.season == season) (Episode.series == series)
).order_by(SQL("rank DESC")).join(Person).switch(Line).join(Episode).limit(20) ).order_by(SQL("rank DESC")).join(Person).switch(Line).join(Episode).limit(20)
if len(results) == 0: if len(results) == 0:

View file

@ -1,5 +1,8 @@
from datetime import timedelta from datetime import timedelta
from pathlib import Path from pathlib import Path
from typing import Optional
from data import single_speaker
srtdir = Path("./data/subtitles/") srtdir = Path("./data/subtitles/")
@ -12,5 +15,8 @@ def milliseconds_to_td(ms: int) -> timedelta:
return timedelta(milliseconds=ms) return timedelta(milliseconds=ms)
def get_filename(campaign: int, episode: int) -> Path: def episode_speaker(series_title: str, episode: int) -> Optional[str]:
return srtdir / f"C{campaign}E{episode}.srt" series = single_speaker[series_title]
if episode in series:
return series[episode]
return None

View file

@ -4,10 +4,18 @@ export interface Person {
"color": string; "color": string;
} }
export interface Series {
"id": number;
"is_campaign": boolean;
"title": string;
}
export interface Episode { export interface Episode {
"episode_number": number; "episode_number": number;
"id": number; "id": number;
"season": number; "series": Series;
"title": string;
"video_number": number;
"youtube_id": string; "youtube_id": string;
} }
@ -34,3 +42,13 @@ export interface ServerMessage {
status: string; status: string;
message: string; message: string;
} }
export interface SeriesNames {
"id": number;
"title": string;
}
export interface ServerData {
"series": SeriesNames[];
}

View file

@ -21,7 +21,7 @@ export default new Router({
redirect: "/search/2/10/", redirect: "/search/2/10/",
}, },
{ {
path: "/search/:season/:episode/:keyword?", path: "/search/:series/:episode/:keyword?",
name: "search", name: "search",
component: Home, component: Home,
// props: true, // props: true,

View file

@ -78,15 +78,16 @@
class="form-control" type="number" v-model="episode" class="form-control" type="number" v-model="episode"
min="1" max="300"> min="1" max="300">
<span>in</span> <span>in</span>
<select title="campaign selection" class="custom-select" v-model="season"> <select title="campaign selection" class="custom-select" v-model="series">
<option value="1">Campaign 1</option> <option v-for="series in serverData.series" v-bind:value="series.id">
<option value="2">Campaign 2</option> {{ series.title }}
</option>
</select> </select>
</div> </div>
<b-alert v-if="error" show :variant="error.status">{{error.message}}</b-alert> <b-alert v-if="error" show :variant="error.status">{{error.message}}</b-alert>
<div class="entry" v-for="result in searchResult"> <div class="entry" v-for="result in searchResult">
<div class="title"> <div class="title">
<div>{{episodeName(firstLine(result))}} {{formatTimestamp(firstLine(result).starttime)}}</div> <div>{{formatTimestamp(firstLine(result).starttime)}} {{episodeName(firstLine(result))}}</div>
<div class="buttons"> <div class="buttons">
<button class="btn" @click="playVideo(result)" title="View video on YouTube"> <button class="btn" @click="playVideo(result)" title="View video on YouTube">
<b-icon-play-fill></b-icon-play-fill> <b-icon-play-fill></b-icon-play-fill>
@ -114,7 +115,7 @@
// @ts-ignore // @ts-ignore
import Autocomplete from "@trevoreyre/autocomplete-vue"; import Autocomplete from "@trevoreyre/autocomplete-vue";
// import "@trevoreyre/autocomplete-vue/dist/style.css"; // import "@trevoreyre/autocomplete-vue/dist/style.css";
import {Line, Result, ServerMessage} from "@/interfaces"; import {Line, Result, ServerData, ServerMessage} from "@/interfaces";
import {BAlert, BIcon, BIconPlayFill} from "bootstrap-vue"; import {BAlert, BIcon, BIconPlayFill} from "bootstrap-vue";
// @ts-ignore // @ts-ignore
import VueYoutube from "vue-youtube"; import VueYoutube from "vue-youtube";
@ -134,9 +135,10 @@
}, },
data() { data() {
return { return {
serverData : require("../../data.json") as ServerData,
searchResult: [] as Result[], searchResult: [] as Result[],
keyword: this.$route.params.keyword, keyword: this.$route.params.keyword,
season: this.$route.params.season, series: this.$route.params.series,
episode: this.$route.params.episode, episode: this.$route.params.episode,
error: undefined as ServerMessage | undefined, error: undefined as ServerMessage | undefined,
ytOptIn: false, ytOptIn: false,
@ -155,8 +157,8 @@
if (localStorage.ytOptIn) { if (localStorage.ytOptIn) {
this.ytOptIn = localStorage.ytOptIn; this.ytOptIn = localStorage.ytOptIn;
} }
if (this.season == null) { if (this.series == null) {
this.season = "2"; this.series = "2";
} }
if (this.episode == null) { if (this.episode == null) {
this.episode = "10"; this.episode = "10";
@ -172,7 +174,7 @@
}, },
methods: { methods: {
suggest(input: string) { suggest(input: string) {
const url = baseURL + "suggest?query=" + input + "&until=" + this.episode + "&season=" + this.season; const url = baseURL + "suggest?query=" + input + "&until=" + this.episode + "&series=" + this.series;
return new Promise((resolve) => { return new Promise((resolve) => {
if (input.length < 1) { if (input.length < 1) {
@ -193,7 +195,7 @@
if (!this.keyword) { if (!this.keyword) {
return; return;
} }
const url = baseURL + "search?query=" + this.keyword + "&until=" + this.episode + "&season=" + this.season; const url = baseURL + "search?query=" + this.keyword + "&until=" + this.episode + "&series=" + this.series;
fetch(url) fetch(url)
.then((response) => response.json()) .then((response) => response.json())
@ -225,7 +227,10 @@
return result.lines[0]; return result.lines[0];
}, },
episodeName(line: Line): string { episodeName(line: Line): string {
return `C${line.episode.season}E${line.episode.episode_number}`; if (line.episode.series.is_campaign) {
return `Episode ${line.episode.episode_number}`;
}
return line.episode.title;
}, },
formatTimestamp(ts: number) { formatTimestamp(ts: number) {
return new Date(ts).toISOString().substr(11, 8); return new Date(ts).toISOString().substr(11, 8);
@ -301,8 +306,8 @@
// @ts-ignore // @ts-ignore
this.$router.replace({params: {...this.$route.params, episode: val}}); this.$router.replace({params: {...this.$route.params, episode: val}});
}, 300), }, 300),
season(val: string): void { series(val: string): void {
this.$router.replace({params: {...this.$route.params, season: val}}); this.$router.replace({params: {...this.$route.params, series: val}});
}, },
keyword(val: string): void { keyword(val: string): void {
this.$router.replace({params: {...this.$route.params, keyword: val}}); this.$router.replace({params: {...this.$route.params, keyword: val}});