1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-11 06:03:45 +02:00

generalise data model to series

This commit is contained in:
Lukas Winkler 2020-04-15 18:11:45 +02:00
parent 3cd78c3514
commit df3c1b62eb
Signed by: lukas
GPG key ID: 54DE4D798D244853
12 changed files with 165 additions and 97 deletions

View file

@ -1,16 +1,6 @@
from data import colors_c2
from models import Person
# Display colour per speaker first name, stored as "#rrggbb" hex strings.
colors_c2 = {
    "Laura": "#59c3f9",
    "Marisha": "#00146e",
    "Liam": "#fe8413",
    "Taliesin": "#be1c0d",
    "Ashley": "#868984",
    "Sam": "#dae1dd",
    "Travis": "#076708",
    "Matt": "#471f0e",  # placeholder: randomly chosen color
}
p: Person
for p in Person.select():
print(p)

View file

@ -7,5 +7,3 @@ dbauth = {
}
sentryDSN = None
skip_download = False

View file

@ -1,6 +1,6 @@
from sys import argv
from models import db, Phrase, Episode, Person, Line
from models import db, Series, Phrase, Episode, Person, Line
def confirm(message: str) -> None:
@ -14,8 +14,8 @@ mode = argv[1]
if mode == "all":
confirm("Delete all Data? ")
db.drop_tables([Episode, Person, Line, Phrase])
db.create_tables([Episode, Person, Line, Phrase])
db.drop_tables([Series, Episode, Person, Line, Phrase])
db.create_tables([Series, Episode, Person, Line, Phrase])
elif mode == "phrases":
confirm("Delete all Phrases? ")
db.drop_tables([Phrase])

View file

@ -2,25 +2,49 @@ import re
from subprocess import run
import youtube_dl
from peewee import DoesNotExist
import config
from models import Episode
from models import Episode, Series
# https://www.youtube.com/playlist?list=
from utils import srtdir
# Full YouTube playlist URL for each campaign, keyed by campaign number.
campaign_playlists = {
    1: "https://www.youtube.com/playlist?list=PL1tiwbzkOjQz7D0l_eLJGAISVtcL7oRu_",
    2: "https://www.youtube.com/playlist?list=PL1tiwbzkOjQxD0jjAE7PsWoaCrs0EkBH2",
}
# One entry per series to import.
#   "name"           - series title; main() also treats any name containing
#                      "Campaign" as a campaign.
#   "playlist_id"    - YouTube playlist id; main() appends it to
#                      "https://www.youtube.com/playlist?list=".
#   "single_speaker" - optional flag; main() treats a missing key as false.
series_data = [
    {
        "name": "Campaign 1",
        "playlist_id": "PL1tiwbzkOjQz7D0l_eLJGAISVtcL7oRu_",
    },
    {
        "name": "Campaign 2",
        "playlist_id": "PL1tiwbzkOjQxD0jjAE7PsWoaCrs0EkBH2",
    },
    {
        "name": "Handbooker Helper",
        "playlist_id": "PL1tiwbzkOjQyr6-gqJ8r29j_rJkR49uDN",
        "single_speaker": True,
    },
]
def main():
for campaign in range(1, 3):
for series in series_data:
name = series["name"]
playlist_id = series["playlist_id"]
is_campaign = "Campaign" in name
try:
s = Series.select().where(Series.title == name).get()
except DoesNotExist:
s = Series()
s.title = name
s.is_campaign = is_campaign
s.single_speaker = "single_speaker" in series and series["single_speaker"]
s.save()
ydl_opts = {
'extract_flat': True
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
playlist = ydl.extract_info(campaign_playlists[campaign], download=False)
playlist = ydl.extract_info("https://www.youtube.com/playlist?list=" + playlist_id, download=False)
videos = playlist["entries"]
print(v["url"] for v in videos)
@ -33,30 +57,42 @@ def main():
regex = re.compile(r"Ep(?:is|si)ode (\d+)")
for nr, video in enumerate(videos, 1):
vttfile = srtdir / f"C{campaign}E{nr}"
ydl_opts["outtmpl"] = str(vttfile)
if Episode.select().where((Episode.season == campaign) & (Episode.video_number == nr)).count() == 1:
print(f"already imported {vttfile}")
continue
e = Episode()
e.season = campaign
e.video_number = nr
# if Episode.select().where((Episode.season == campaign) & (Episode.video_number == nr)).count() == 1:
# print(f"already imported {vttfile}")
# continue
try:
match = regex.search(video["title"])
e.episode_number = int(match.group(1))
except AttributeError:
if campaign == 1: # one-shots at the end of campaign 1
e.episode_number = e.video_number - 3
else:
raise
e = Episode.select().where((Episode.series == s) & (Episode.video_number == nr)).get()
except DoesNotExist:
e = Episode()
e.series = s
e.video_number = nr
e.title = video["title"]
if s.is_campaign:
try:
match = regex.search(video["title"])
e.episode_number = int(match.group(1))
except AttributeError:
if s.title == "Campaign 1": # one-shots at the end of campaign 1
e.episode_number = e.video_number - 3
else:
raise
else:
e.episode_number = e.video_number
e.youtube_id = video["url"]
e.save()
if config.skip_download:
vttfile = srtdir / str(e.id)
ydl_opts["outtmpl"] = str(vttfile)
if e.downloaded:
continue
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([f'https://www.youtube.com/watch?v={e.youtube_id}'])
run(["ffmpeg", "-i", vttfile.with_suffix(".en.vtt"), vttfile.with_suffix(".srt")])
vttfile.with_suffix(".en.vtt").unlink()
e.downloaded = True
try:
vttfile.with_suffix(".en.vtt").unlink()
except FileNotFoundError:
e.downloaded = False
e.save()
if __name__ == '__main__':

View file

@ -5,9 +5,9 @@ from alive_progress import alive_bar
from peewee import fn
from srt import parse
from models import Person, Line, Episode, db
from models import Person, Line, Episode, db, Series
from typo import fix_typo
from utils import td_to_milliseconds, get_filename
from utils import td_to_milliseconds, srtdir, episode_speaker
def is_invalid_name(name: str) -> bool:
@ -19,14 +19,16 @@ def is_invalid_name(name: str) -> bool:
def main():
all_people = set()
for campaign in range(1, 3):
for episode in Episode.select().where((Episode.text_imported == False) & (Episode.season == campaign)):
for series in Series.select():
for episode in Episode.select().where(
(Episode.text_imported == False) & (Episode.series == series) & (Episode.downloaded)
):
with open("names.txt", "w") as f:
f.write("\n".join(sorted(p for p in all_people if "\n" not in p)))
file = get_filename(campaign, episode.video_number)
file = srtdir / f"{episode.id}.srt"
text = file.read_text()
subtitlelines = list(parse(text))
print(episode.video_number, episode.episode_number)
print(episode.video_number, episode.title)
person = None
with db.atomic():
with alive_bar(len(subtitlelines)) as bar:
@ -37,28 +39,32 @@ def main():
assert i == line.index
text = unescape(line.content)
dbline = Line()
if ":" in text:
name, resttext = text.split(":", maxsplit=1)
if name and name[-1].isupper() and not is_invalid_name(name):
people = []
name = name.lower()
for word in re.split('[,&/]|and| an ', name):
word = word.strip()
word = fix_typo(word).title()
word = word.strip()
if word:
people.append(word)
all_people.update(people)
formatted_name = ", ".join(people)
person, created = Person.get_or_create(name=formatted_name, season=campaign)
text = resttext.strip()
if not series.single_speaker:
if ":" in text:
name, resttext = text.split(":", maxsplit=1)
if name and name[-1].isupper() and not is_invalid_name(name):
people = []
name = name.lower()
for word in re.split('[,&/]|and| an ', name):
word = word.strip()
word = fix_typo(word).title()
word = word.strip()
if word:
people.append(word)
all_people.update(people)
formatted_name = ", ".join(people)
person, created = Person.get_or_create(name=formatted_name, series=series)
text = resttext.strip()
else:
if text.startswith("(") and text.endswith(")"):
dbline.isnote = True
person = None
elif text.startswith("[") and text.endswith("]"):
dbline.ismeta = True
person = None
else:
if text.startswith("(") and text.endswith(")"):
dbline.isnote = True
person = None
elif text.startswith("[") and text.endswith("]"):
dbline.ismeta = True
person = None
person_name = episode_speaker(series.title, episode.video_number)
person,created = Person.get_or_create(name=person_name, series=series)
text = text.replace("\n", " ")
dbline.text = text
dbline.search_text = fn.to_tsvector('english', text)

View file

@ -12,16 +12,24 @@ class BaseModel(Model):
database = db
class Series(BaseModel):
    """A named collection of episodes that episodes and persons refer to."""

    # Series name; the download script looks an existing row up by this value.
    title = CharField(max_length=100)
    # The download script sets this from `"Campaign" in name`; when true it
    # parses the episode number out of the video title instead of reusing
    # the video number.
    is_campaign = BooleanField()
    # When true, the subtitle importer does not parse "NAME:" prefixes and
    # takes the speaker from episode_speaker() instead.
    single_speaker = BooleanField()
class Episode(BaseModel):
season = IntegerField()
series = ForeignKeyField(Series, backref="episodes")
episode_number = IntegerField()
video_number = IntegerField()
youtube_id = CharField(max_length=11)
title = CharField(max_length=100)
downloaded = BooleanField(default=False)
text_imported = BooleanField(default=False)
phrases_imported = BooleanField(default=False)
class Meta:
indexes = ((("season", "video_number"), True),)
indexes = ((("series", "video_number"), True),)
@property
def name(self) -> str:
@ -31,9 +39,10 @@ class Episode(BaseModel):
class Person(BaseModel):
name = CharField()
color = CharField(null=True)
season = IntegerField()
series = ForeignKeyField(Series)
class Meta:
indexes = ((("name", "season"), True),)
indexes = ((("name", "series"), True),)
FULL_TEXT_SEARCH = '''SELECT id, text, ts_rank_cd(search_text, query) AS rank

View file

@ -12,7 +12,7 @@ from stopwords import STOP_WORDS
nlp: English = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
nlp.Defaults.stop_words = STOP_WORDS
for episode in Episode.select().where((Episode.phrases_imported == False) & (Episode.text_imported == True)):
print(f"Campaign {episode.season} Episode {episode.episode_number}")
print(episode.video_number, episode.title)
person = None
text = ""
line_select = Line.select().where(Line.episode == episode)

View file

@ -19,20 +19,20 @@ def add_cors(response):
return response
global_excludes = [Line.search_text, Episode.phrases_imported, Episode.text_imported]
global_excludes = [Line.search_text, Episode.phrases_imported, Episode.text_imported, Person.series]
@app.route("/api/suggest")
def question():
query: str = request.args.get('query')
until = request.args.get('until')
season = request.args.get('season')
if not query or not until or not season:
series = request.args.get('series')
if not query or not until or not series:
return "no suggest query", 400
if len(query) > 50:
return "too long query", 400
phrases = Phrase.select(Phrase.text, Alias(fn.SUM(Phrase.count), "total_count")).join(Episode).where(
(Episode.season == season) &
(Episode.series == series) &
(Episode.episode_number <= until) &
(Phrase.text.contains(query))
).group_by(Phrase.text).order_by(SQL("total_count DESC")).limit(10)
@ -43,8 +43,8 @@ def question():
def search():
query = request.args.get('query')
until = request.args.get('until')
season = request.args.get('season')
if not query or not until or not season:
series = request.args.get('series')
if not query or not until or not series:
return "no suggest query", 400
if len(query) > 50:
return "too long query", 400
@ -56,7 +56,7 @@ def search():
&
(Episode.episode_number <= until)
&
(Episode.season == season)
(Episode.series == series)
).order_by(SQL("rank DESC")).join(Person).switch(Line).join(Episode).limit(20)
if len(results) == 0:

View file

@ -1,5 +1,8 @@
from datetime import timedelta
from pathlib import Path
from typing import Optional
from data import single_speaker
srtdir = Path("./data/subtitles/")
@ -12,5 +15,8 @@ def milliseconds_to_td(ms: int) -> timedelta:
return timedelta(milliseconds=ms)
def get_filename(campaign: int, episode: int) -> Path:
    """Return the subtitle path ``C<campaign>E<episode>.srt`` inside ``srtdir``."""
    filename = f"C{campaign}E{episode}.srt"
    return srtdir / filename
def episode_speaker(series_title: str, episode: int) -> Optional[str]:
    """Look up the speaker configured for one episode of a series.

    Indexes the ``single_speaker`` table (imported from ``data``) by
    ``series_title`` and then by ``episode``.

    Returns the stored value, or ``None`` when the series entry does not
    contain ``episode``.

    A ``series_title`` missing from ``single_speaker`` is not handled here;
    the lookup error propagates (a ``KeyError`` if the table is a dict —
    its definition is not visible from this module).
    """
    speakers = single_speaker[series_title]
    return speakers[episode] if episode in speakers else None

View file

@ -4,10 +4,18 @@ export interface Person {
"color": string;
}
/** Series record as nested inside an `Episode` (see `Episode.series`). */
export interface Series {
    "id": number;
    /** When true, the UI labels an episode "Episode N"; otherwise it shows the episode title. */
    "is_campaign": boolean;
    "title": string;
}
export interface Episode {
"episode_number": number;
"id": number;
"season": number;
"series": Series;
"title": string;
"video_number": number;
"youtube_id": string;
}
@ -34,3 +42,13 @@ export interface ServerMessage {
status: string;
message: string;
}
/**
 * Id/title pair for one series, as listed in `ServerData.series`.
 * The series selector uses `id` as the option value and `title` as its label.
 */
export interface SeriesNames {
    "id": number;
    "title": string;
}
/** Shape of the static `data.json` file that the search component loads via `require`. */
export interface ServerData {
    /** Series offered in the series selector. */
    "series": SeriesNames[];
}

View file

@ -21,7 +21,7 @@ export default new Router({
redirect: "/search/2/10/",
},
{
path: "/search/:season/:episode/:keyword?",
path: "/search/:series/:episode/:keyword?",
name: "search",
component: Home,
// props: true,

View file

@ -78,15 +78,16 @@
class="form-control" type="number" v-model="episode"
min="1" max="300">
<span>in</span>
<select title="campaign selection" class="custom-select" v-model="season">
<option value="1">Campaign 1</option>
<option value="2">Campaign 2</option>
<select title="campaign selection" class="custom-select" v-model="series">
<option v-for="series in serverData.series" v-bind:value="series.id">
{{ series.title }}
</option>
</select>
</div>
<b-alert v-if="error" show :variant="error.status">{{error.message}}</b-alert>
<div class="entry" v-for="result in searchResult">
<div class="title">
<div>{{episodeName(firstLine(result))}} {{formatTimestamp(firstLine(result).starttime)}}</div>
<div>{{formatTimestamp(firstLine(result).starttime)}} {{episodeName(firstLine(result))}}</div>
<div class="buttons">
<button class="btn" @click="playVideo(result)" title="View video on YouTube">
<b-icon-play-fill></b-icon-play-fill>
@ -114,7 +115,7 @@
// @ts-ignore
import Autocomplete from "@trevoreyre/autocomplete-vue";
// import "@trevoreyre/autocomplete-vue/dist/style.css";
import {Line, Result, ServerMessage} from "@/interfaces";
import {Line, Result, ServerData, ServerMessage} from "@/interfaces";
import {BAlert, BIcon, BIconPlayFill} from "bootstrap-vue";
// @ts-ignore
import VueYoutube from "vue-youtube";
@ -134,9 +135,10 @@
},
data() {
return {
serverData : require("../../data.json") as ServerData,
searchResult: [] as Result[],
keyword: this.$route.params.keyword,
season: this.$route.params.season,
series: this.$route.params.series,
episode: this.$route.params.episode,
error: undefined as ServerMessage | undefined,
ytOptIn: false,
@ -155,8 +157,8 @@
if (localStorage.ytOptIn) {
this.ytOptIn = localStorage.ytOptIn;
}
if (this.season == null) {
this.season = "2";
if (this.series == null) {
this.series = "2";
}
if (this.episode == null) {
this.episode = "10";
@ -172,7 +174,7 @@
},
methods: {
suggest(input: string) {
const url = baseURL + "suggest?query=" + input + "&until=" + this.episode + "&season=" + this.season;
const url = baseURL + "suggest?query=" + input + "&until=" + this.episode + "&series=" + this.series;
return new Promise((resolve) => {
if (input.length < 1) {
@ -193,7 +195,7 @@
if (!this.keyword) {
return;
}
const url = baseURL + "search?query=" + this.keyword + "&until=" + this.episode + "&season=" + this.season;
const url = baseURL + "search?query=" + this.keyword + "&until=" + this.episode + "&series=" + this.series;
fetch(url)
.then((response) => response.json())
@ -225,7 +227,10 @@
return result.lines[0];
},
episodeName(line: Line): string {
return `C${line.episode.season}E${line.episode.episode_number}`;
if (line.episode.series.is_campaign) {
return `Episode ${line.episode.episode_number}`;
}
return line.episode.title;
},
formatTimestamp(ts: number) {
return new Date(ts).toISOString().substr(11, 8);
@ -301,8 +306,8 @@
// @ts-ignore
this.$router.replace({params: {...this.$route.params, episode: val}});
}, 300),
season(val: string): void {
this.$router.replace({params: {...this.$route.params, season: val}});
series(val: string): void {
this.$router.replace({params: {...this.$route.params, series: val}});
},
keyword(val: string): void {
this.$router.replace({params: {...this.$route.params, keyword: val}});