cr-search/fetch.py

import argparse
import hashlib
import os
from datetime import datetime
from pathlib import Path
from shutil import move
from subprocess import run

import requests
import youtube_dl
from peewee import DoesNotExist

from data import series_data
from models import Episode, Series, Line, Phrase
from utils import srtdir, pretty_title, title_to_episodenumber, clear_cache

# Directory (relative to the current working directory) into which the
# per-series thumbnail images (`<slug>.webp`) are written.
static_path = Path("static")


def _upsert_series(series, order: int) -> "Series":
    """Create or update the ``Series`` row for one ``series_data`` entry and return it."""
    try:
        s = Series.select().where(Series.title == series.name).get()
    except DoesNotExist:
        s = Series()
        s.title = series.name

    s.is_campaign = "Campaign" in series.name
    s.single_speaker = series.single_speaker
    s.slug = series.slug
    s.order = order
    s.save()
    return s


def _video_ids(series, info_ydl: "youtube_dl.YoutubeDL") -> list:
    """Return the video ids of a series, from its playlist or its explicit video list.

    Raises:
        ValueError: if the series defines neither a playlist id nor videos.
    """
    if series.playlist_id:
        playlist = info_ydl.extract_info(
            "https://www.youtube.com/playlist?list=" + series.playlist_id, download=False
        )
        return [v["url"] for v in playlist["entries"]]
    if series.videos:
        return series.videos
    raise ValueError("either set playlist or videos for series")


def _ensure_thumbnail(slug: str, video_id: str) -> None:
    """Download the thumbnail of ``video_id`` to ``static/<slug>.webp`` unless it exists."""
    target = static_path / f"{slug}.webp"
    if target.exists():
        return
    # A timeout keeps one stalled connection from blocking the whole run.
    r = requests.get(f"https://i.ytimg.com/vi_webp/{video_id}/maxresdefault.webp", timeout=30)
    r.raise_for_status()
    with target.open("wb") as f:
        f.write(r.content)


def _fetch_srt(youtube_id: str, base: Path) -> None:
    """Download the English subtitles of a video to ``base`` and convert them to SRT.

    Nothing is raised when the video has no subtitles or ffmpeg fails; the
    caller detects that through the missing ``.en.vtt`` / ``.srt`` files.
    """
    download_opts = {
        "writesubtitles": True,
        "subtitleslangs": ["en", "en-US"],
        "skip_download": True,
        "outtmpl": str(base),
    }
    with youtube_dl.YoutubeDL(download_opts) as ydl:
        ydl.download([f'https://www.youtube.com/watch?v={youtube_id}'])

    vtt = base.with_suffix(".en.vtt")
    vtt_us = base.with_suffix(".en-US.vtt")
    if vtt_us.exists():
        # few videos have en-US as language code instead of en
        move(vtt_us, vtt)
    # Output is captured only to keep ffmpeg's chatter off the console.
    run(["ffmpeg", "-y", "-i", vtt, base.with_suffix(".srt")], capture_output=True)


def _sha256_of(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``, read in 8 KiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _sync_episode(args: argparse.Namespace, s: "Series", nr: int, video_id: str,
                  info_ydl: "youtube_dl.YoutubeDL") -> bool:
    """Create/update the ``Episode`` for one video and refresh its subtitles.

    Args:
        args: parsed command line; ``args.skip_existing`` skips episodes that
            are already marked as downloaded.
        s: the series the video belongs to.
        nr: 1-based position of the video within the series.
        video_id: YouTube id of the video.
        info_ydl: extractor used to look up the video's title and upload date.

    Returns:
        True when a new episode was stored or its subtitle hash changed (the
        caller should clear the cache), False otherwise, including when the
        video is skipped.
    """
    try:
        e = Episode.select().where((Episode.youtube_id == video_id)).get()
    except DoesNotExist:
        e = Episode()
        e.series = s
        e.video_number = nr
        changed = True
    else:
        if args.skip_existing and e.downloaded:
            return False
        changed = False

    e.youtube_id = video_id
    video_info = info_ydl.extract_info(f'https://www.youtube.com/watch?v={e.youtube_id}', download=False)
    e.upload_date = datetime.strptime(video_info["upload_date"], "%Y%m%d")
    e.title = video_info["title"]
    if e.title in ["Exandria Unlimited: Meet the Players"]:
        return False
    e.pretty_title = pretty_title(video_info["title"])
    if s.is_campaign or "Exandria" in e.title:
        # One-shots and "Search For Bob" videos in series 1 are not imported.
        if e.series.id == 1 and ("One-Shot" in e.title or "Search For Bob" in e.title):
            return False
        e.episode_number = title_to_episodenumber(e.title, e.video_number)
    else:
        e.episode_number = e.video_number
    # Save now so that e.id exists; it names the subtitle files below.
    e.save()
    print(e.series.id, e.episode_number, e.pretty_title)

    base = srtdir / str(e.id)
    _fetch_srt(e.youtube_id, base)
    e.downloaded = True
    try:
        base.with_suffix(".en.vtt").unlink()
        new_hash = _sha256_of(base.with_suffix(".srt"))
    except FileNotFoundError:
        # No subtitles were downloaded or the conversion produced no file.
        e.downloaded = False
    else:
        if e.subtitle_hash != new_hash:
            print("subtitle hash changed, deleting imported data")
            # peewee delete queries are lazy: nothing is removed until execute().
            Line.delete().where(Line.episode == e).execute()
            Phrase.delete().where(Phrase.episode == e).execute()
            e.phrases_imported = False
            e.text_imported = False
            e.subtitle_hash = new_hash
            e.last_updated = datetime.now()
            changed = True
    e.save()
    return changed


def main(args: argparse.Namespace) -> None:
    """Synchronise all series in ``series_data`` with YouTube.

    For each series the database row is created or updated, the thumbnail is
    downloaded if missing, and every video gets an ``Episode`` row plus a
    freshly downloaded SRT subtitle file. When a subtitle file's hash differs
    from the stored one, the episode's imported lines and phrases are deleted
    and its import flags reset. The cache is cleared after each episode that
    was added or whose subtitles changed.

    Args:
        args: parsed command line; only ``skip_existing`` is read.

    Raises:
        ValueError: if a series defines neither a playlist id nor videos.
    """
    # Raise the niceness so this batch job yields CPU to other processes.
    os.nice(15)
    # One extractor serves playlist listings and per-video metadata for the
    # whole run, so the metadata lookup never depends on a leftover variable.
    with youtube_dl.YoutubeDL({'extract_flat': True}) as info_ydl:
        for order, series in enumerate(series_data):
            s = _upsert_series(series, order)
            for nr, video_id in enumerate(_video_ids(series, info_ydl), 1):
                if nr == 1:
                    # The first video of a series provides its thumbnail.
                    _ensure_thumbnail(s.slug, video_id)
                if _sync_episode(args, s, nr, video_id, info_ydl):
                    clear_cache()


if __name__ == '__main__':
    # Script entry point: build the command-line interface, parse it and run.
    cli = argparse.ArgumentParser(description="fetch episode data from YouTube")
    cli.add_argument(
        "--skip-existing",
        action="store_true",
        dest="skip_existing",
        help="don't check for update on existing videos",
    )
    args = cli.parse_args()
    main(args)
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`import argparse`
allow updated subtitles to invalidate existing data 2020-08-08 15:18:14 +02:00			`import hashlib`
be nice 2020-08-08 15:26:13 +02:00			`import os`
update last_updated 2020-08-11 12:19:01 +02:00			`from datetime import datetime`
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`from pathlib import Path`
also allow en-US subtitles 2020-08-11 11:53:23 +02:00			`from shutil import move`
initial commit 2020-03-07 10:45:39 +01:00			`from subprocess import run`

add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`import requests`
initial commit 2020-03-07 10:45:39 +01:00			`import youtube_dl`
generalise data model to series 2020-04-15 18:11:45 +02:00			`from peewee import DoesNotExist`
initial commit 2020-03-07 10:45:39 +01:00
add Mini Primetime 2020-08-13 17:11:18 +02:00			`from data import series_data`
allow updated subtitles to invalidate existing data 2020-08-08 15:18:14 +02:00			`from models import Episode, Series, Line, Phrase`
add redis caching and better database 2021-07-07 17:27:21 +02:00			`from utils import srtdir, pretty_title, title_to_episodenumber, clear_cache`
initial commit 2020-03-07 10:45:39 +01:00
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`static_path = Path("static")`
many major changes 2020-03-08 18:48:14 +01:00
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00
typing 2021-07-15 20:26:33 +02:00			`def main(args: argparse.Namespace) -> None:`
be nice 2020-08-08 15:26:13 +02:00			`os.nice(15)`
add more series 2021-10-26 15:30:14 +02:00			`for order, series in enumerate(series_data):`
better typing 2020-08-30 22:11:28 +02:00			`name = series.name`
			`playlist_id = series.playlist_id`
generalise data model to series 2020-04-15 18:11:45 +02:00			`is_campaign = "Campaign" in name`
			`try:`
			`s = Series.select().where(Series.title == name).get()`
			`except DoesNotExist:`
			`s = Series()`
			`s.title = name`

			`s.is_campaign = is_campaign`
better typing 2020-08-30 22:11:28 +02:00			`s.single_speaker = series.single_speaker`
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`s.slug = series.slug`
add more series 2021-10-26 15:30:14 +02:00			`s.order = order`
generalise data model to series 2020-04-15 18:11:45 +02:00			`s.save()`
many major changes 2020-03-08 18:48:14 +01:00			`ydl_opts = {`
			`'extract_flat': True`
			`}`
typing 2021-07-15 20:26:33 +02:00			`if playlist_id:`
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`with youtube_dl.YoutubeDL(ydl_opts) as ydl:`
			`playlist = ydl.extract_info("https://www.youtube.com/playlist?list=" + playlist_id, download=False)`
			`videos = playlist["entries"]`
many major changes 2020-03-08 18:48:14 +01:00
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`urls = [v["url"] for v in videos]`
typing 2021-07-15 20:26:33 +02:00			`elif series.videos:`
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`urls = series.videos`
typing 2021-07-15 20:26:33 +02:00			`else:`
			`raise ValueError("either set playlist or videos for series")`
better typing 2020-08-30 22:11:28 +02:00			`ydl_opts_download = {`
many major changes 2020-03-08 18:48:14 +01:00			`"writesubtitles": True,`
also allow en-US subtitles 2020-08-11 11:53:23 +02:00			`"subtitleslangs": ["en", "en-US"],`
many major changes 2020-03-08 18:48:14 +01:00			`"skip_download": True,`
			`}`

add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`for nr, url in enumerate(urls, 1):`
always check for image download 2021-07-17 18:58:18 +02:00			`if nr == 1:`
			`file = static_path / f"{s.slug}.webp"`
			`if not file.exists():`
			`r = requests.get(f"https://i.ytimg.com/vi_webp/{url}/maxresdefault.webp")`
			`r.raise_for_status()`
add more series 2021-10-26 15:30:14 +02:00			`with file.open("wb") as f:`
always check for image download 2021-07-17 18:58:18 +02:00			`f.write(r.content)`
only clear cache if things changed 2021-07-07 20:08:23 +02:00			`changed = False`
many major changes 2020-03-08 18:48:14 +01:00			`try:`
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`e = Episode.select().where((Episode.youtube_id == url)).get()`
			`if args.skip_existing and e.downloaded:`
			`continue`
generalise data model to series 2020-04-15 18:11:45 +02:00			`except DoesNotExist:`
			`e = Episode()`
			`e.series = s`
			`e.video_number = nr`
only clear cache if things changed 2021-07-07 20:08:23 +02:00			`changed = True`
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`e.youtube_id = url`
			`video_info = ydl.extract_info(f'https://www.youtube.com/watch?v={e.youtube_id}', download=False)`
			`e.upload_date = datetime.strptime(video_info["upload_date"], "%Y%m%d")`
			`e.title = video_info["title"]`
EU import fix 2021-07-28 14:37:27 +02:00			`if e.title in ["Exandria Unlimited: Meet the Players"]:`
			`continue`
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`e.pretty_title = pretty_title(video_info["title"])`
			`if s.is_campaign or "Exandria" in e.title:`
			`if e.series.id == 1 and ("One-Shot" in e.title or "Search For Bob" in e.title):`
skip season 1 one-shots 2021-06-06 18:55:09 +02:00			`continue`
some minor updates 2021-05-25 20:53:09 +02:00			`e.episode_number = title_to_episodenumber(e.title, e.video_number)`
generalise data model to series 2020-04-15 18:11:45 +02:00			`else:`
			`e.episode_number = e.video_number`
many major changes 2020-03-08 18:48:14 +01:00			`e.save()`
improve fetching data (and logging) 2020-08-30 21:33:23 +02:00			`print(e.series.id, e.episode_number, e.pretty_title)`

generalise data model to series 2020-04-15 18:11:45 +02:00			`vttfile = srtdir / str(e.id)`
better typing 2020-08-30 22:11:28 +02:00			`ydl_opts_download["outtmpl"] = str(vttfile)`
			`with youtube_dl.YoutubeDL(ydl_opts_download) as ydl:`
many major changes 2020-03-08 18:48:14 +01:00			`ydl.download([f'https://www.youtube.com/watch?v={e.youtube_id}'])`
also allow en-US subtitles 2020-08-11 11:53:23 +02:00			`if vttfile.with_suffix(".en-US.vtt").exists():`
			`# few videos have en-US as language code instead of en`
			`move(vttfile.with_suffix(".en-US.vtt"), vttfile.with_suffix(".en.vtt"))`
improve fetching data (and logging) 2020-08-30 21:33:23 +02:00			`output = run(`
			`["ffmpeg", "-y", "-i", vttfile.with_suffix(".en.vtt"), vttfile.with_suffix(".srt")],`
			`capture_output=True`
			`)`
allow updated subtitles to invalidate existing data 2020-08-08 15:18:14 +02:00			`e.downloaded = True`
			`try:`
			`vttfile.with_suffix(".en.vtt").unlink()`
			`with vttfile.with_suffix(".srt").open("rb") as f:`
			`file_hash = hashlib.sha256()`
			`while True:`
			`chunk = f.read(8192)`
			`if not chunk:`
			`break`
			`file_hash.update(chunk)`
			`if e.subtitle_hash != file_hash.hexdigest():`
improve fetching data (and logging) 2020-08-30 21:33:23 +02:00			`print("subtitle hash changed, deleting imported data")`
allow updated subtitles to invalidate existing data 2020-08-08 15:18:14 +02:00			`Line.delete().where(Line.episode == e)`
			`Phrase.delete().where(Phrase.episode == e)`
			`e.phrases_imported = False`
			`e.text_imported = False`
			`e.subtitle_hash = file_hash.hexdigest()`
update last_updated 2020-08-11 12:19:01 +02:00			`e.last_updated = datetime.now()`
only clear cache if things changed 2021-07-07 20:08:23 +02:00			`changed = True`
allow updated subtitles to invalidate existing data 2020-08-08 15:18:14 +02:00			`except FileNotFoundError:`
			`e.downloaded = False`
			`e.save()`
only clear cache if things changed 2021-07-07 20:08:23 +02:00			`if changed:`
			`clear_cache()`

many major changes 2020-03-08 18:48:14 +01:00
			`if __name__ == '__main__':`
add one-shot support and lots of additional series 2021-07-04 22:24:51 +02:00			`parser = argparse.ArgumentParser(description="fetch episode data from YouTube")`
			`parser.add_argument("--skip-existing", dest="skip_existing", action="store_true",`
			`help="don't check for update on existing videos")`
			`args = parser.parse_args()`
			`main(args)`