1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-19 15:23:44 +02:00
cr-search/fetch.py

133 lines
5.1 KiB
Python
Raw Normal View History

import argparse
import hashlib
2020-08-08 15:26:13 +02:00
import os
2020-08-11 12:19:01 +02:00
from datetime import datetime
from pathlib import Path
2020-08-11 11:53:23 +02:00
from shutil import move
2020-03-07 10:45:39 +01:00
from subprocess import run
import requests
2020-03-07 10:45:39 +01:00
import youtube_dl
2020-04-15 18:11:45 +02:00
from peewee import DoesNotExist
2020-03-07 10:45:39 +01:00
2020-08-13 17:11:18 +02:00
from data import series_data
from models import Episode, Series, Line, Phrase
2021-07-07 17:27:21 +02:00
from utils import srtdir, pretty_title, title_to_episodenumber, clear_cache
2020-03-07 10:45:39 +01:00
static_path = Path("static")
2020-03-08 18:48:14 +01:00
2021-07-15 20:26:33 +02:00
def _sha256_hexdigest(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        # read in 8 KiB chunks so the whole file never has to sit in memory
        for chunk in iter(lambda: f.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _save_thumbnail(slug: str, video_id: str) -> None:
    """Download the thumbnail of *video_id* to ``static/<slug>.webp`` unless that file already exists."""
    file = static_path / f"{slug}.webp"
    if file.exists():
        return
    # a timeout keeps a stalled connection from blocking the whole run forever
    r = requests.get(f"https://i.ytimg.com/vi_webp/{video_id}/maxresdefault.webp", timeout=30)
    r.raise_for_status()
    with file.open("wb") as f:
        f.write(r.content)


def _update_episode(args: argparse.Namespace, info_ydl: "youtube_dl.YoutubeDL", s: "Series", nr: int,
                    url: str) -> None:
    """Create or refresh the Episode row for one video and (re-)download its subtitles.

    :param args: parsed command line arguments; only ``skip_existing`` is read
    :param info_ydl: YoutubeDL instance used for the metadata lookup
    :param s: the series the video belongs to
    :param nr: 1-based position of the video within the series
    :param url: YouTube video id (it is inserted into ``watch?v=...`` URLs)
    """
    changed = False
    try:
        e = Episode.select().where((Episode.youtube_id == url)).get()
    except DoesNotExist:
        e = Episode()
        e.series = s
        e.video_number = nr
        changed = True
    else:
        if args.skip_existing and e.downloaded:
            return
    e.youtube_id = url
    video_info = info_ydl.extract_info(f'https://www.youtube.com/watch?v={e.youtube_id}', download=False)
    e.upload_date = datetime.strptime(video_info["upload_date"], "%Y%m%d")
    e.title = video_info["title"]
    if e.title in ["Exandria Unlimited: Meet the Players"]:
        # this video is never stored as an episode
        return
    e.pretty_title = pretty_title(video_info["title"])
    if s.is_campaign or "Exandria" in e.title:
        # NOTE(review): series id 1 is hard-coded here; its playlist apparently
        # contains extra videos that are not regular episodes -- confirm
        if e.series.id == 1 and ("One-Shot" in e.title or "Search For Bob" in e.title):
            return
        e.episode_number = title_to_episodenumber(e.title, e.video_number)
    else:
        e.episode_number = e.video_number
    # saving assigns e.id for new episodes; the subtitle file names depend on it
    e.save()
    print(e.series.id, e.episode_number, e.pretty_title)

    vttfile = srtdir / str(e.id)
    vtt_en = vttfile.with_suffix(".en.vtt")
    vtt_en_us = vttfile.with_suffix(".en-US.vtt")
    srt = vttfile.with_suffix(".srt")
    ydl_opts_download = {
        "writesubtitles": True,
        "subtitleslangs": ["en", "en-US"],
        "skip_download": True,
        "outtmpl": str(vttfile),
    }
    with youtube_dl.YoutubeDL(ydl_opts_download) as download_ydl:
        download_ydl.download([f'https://www.youtube.com/watch?v={e.youtube_id}'])
    if vtt_en_us.exists():
        # few videos have en-US as language code instead of en
        move(vtt_en_us, vtt_en)
    # convert WebVTT to SRT; a failed conversion is detected below through the missing .srt file
    run(
        ["ffmpeg", "-y", "-i", vtt_en, srt],
        capture_output=True
    )
    e.downloaded = True
    try:
        vtt_en.unlink()
        new_hash = _sha256_hexdigest(srt)
    except FileNotFoundError:
        # no subtitles were downloaded or the conversion produced no output
        e.downloaded = False
    else:
        if e.subtitle_hash != new_hash:
            print("subtitle hash changed, deleting imported data")
            # peewee queries are lazy: without .execute() nothing is deleted
            Line.delete().where(Line.episode == e).execute()
            Phrase.delete().where(Phrase.episode == e).execute()
            e.phrases_imported = False
            e.text_imported = False
            e.subtitle_hash = new_hash
            e.last_updated = datetime.now()
            changed = True
    e.save()
    if changed:
        clear_cache()


def main(args: argparse.Namespace) -> None:
    """Synchronise all series from ``series_data`` with the database and fetch their subtitles.

    For every configured series the Series row is created or updated, the list
    of video ids is read from the YouTube playlist (or taken from the
    configured ``videos``), the thumbnail of the first video is stored and
    every video is processed by ``_update_episode``.

    :param args: parsed command line arguments; ``args.skip_existing`` skips
        videos whose subtitles were already downloaded
    :raises ValueError: if a series defines neither a playlist nor videos
    """
    # raise the niceness so the run does not compete with other processes
    os.nice(15)
    ydl_opts = {
        'extract_flat': True
    }
    for order, series in enumerate(series_data):
        name = series.name
        playlist_id = series.playlist_id
        try:
            s = Series.select().where(Series.title == name).get()
        except DoesNotExist:
            s = Series()
        s.title = name
        s.is_campaign = "Campaign" in name
        s.single_speaker = series.single_speaker
        s.slug = series.slug
        s.order = order
        s.save()

        # one dedicated extractor per series for playlist listing and metadata
        # lookups, instead of relying on a name leaked from another with-block
        with youtube_dl.YoutubeDL(ydl_opts) as info_ydl:
            if playlist_id:
                playlist = info_ydl.extract_info(
                    "https://www.youtube.com/playlist?list=" + playlist_id, download=False
                )
                urls = [v["url"] for v in playlist["entries"]]
            elif series.videos:
                urls = series.videos
            else:
                raise ValueError("either set playlist or videos for series")

            for nr, url in enumerate(urls, 1):
                if nr == 1:
                    # the first video provides the thumbnail of the series
                    _save_thumbnail(s.slug, url)
                _update_episode(args, info_ydl, s, nr, url)
2020-03-08 18:48:14 +01:00
if __name__ == '__main__':
    # command line entry point: build the CLI, parse it and hand over to main()
    cli = argparse.ArgumentParser(description="fetch episode data from YouTube")
    cli.add_argument(
        "--skip-existing",
        dest="skip_existing",
        action="store_true",
        help="don't check for update on existing videos",
    )
    args = cli.parse_args()
    main(args)