From 051dac5411bbc43bad4bbd0b0108e302ddcf322d Mon Sep 17 00:00:00 2001
From: Lukas Winkler
Date: Mon, 19 Oct 2020 17:43:08 +0200
Subject: [PATCH] add library code

---
 .gitignore                       |  1 +
 paperlibrary/api/api.py          |  6 ++-
 paperlibrary/api/models.py       | 11 ++++-
 paperlibrary/library/__init__.py |  1 +
 paperlibrary/library/library.py  | 78 ++++++++++++++++++++++++++++++++
 5 files changed, 94 insertions(+), 3 deletions(-)
 create mode 100644 paperlibrary/library/__init__.py
 create mode 100644 paperlibrary/library/library.py

diff --git a/.gitignore b/.gitignore
index c33e88c..31cafbd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .idea/
 __pycache__/
 library/
+!paperlibrary/library
 paperlibrary/config.py
diff --git a/paperlibrary/api/api.py b/paperlibrary/api/api.py
index 58d42ad..e5c49b8 100644
--- a/paperlibrary/api/api.py
+++ b/paperlibrary/api/api.py
@@ -2,7 +2,7 @@ from typing import List
 
 from requests import Session
 
-from paperlibrary.api.models import Author, PDF
+from paperlibrary.api.models import Author, PDF, Keyword
 
 
 class PaperLibraryAPI:
@@ -17,6 +17,10 @@ class PaperLibraryAPI:
         r = self.s.get(self.baseURL + "authors/")
         return Author.schema().loads(r.text, many=True)
 
+    def fetch_keywords(self) -> List[Keyword]:
+        r = self.s.get(self.baseURL + "keywords/")
+        return Keyword.schema().loads(r.text, many=True)
+
     def fetch_pdfs(self) -> List[PDF]:
         r = self.s.get(self.baseURL + "pdfs/")
         return PDF.schema().loads(r.text, many=True)
diff --git a/paperlibrary/api/models.py b/paperlibrary/api/models.py
index d49764c..48facf5 100644
--- a/paperlibrary/api/models.py
+++ b/paperlibrary/api/models.py
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Optional, List
 
-from dataclasses_json import DataClassJsonMixin, dataclass_json, Undefined, config
+from dataclasses_json import DataClassJsonMixin, config
 from marshmallow import fields
 
 
@@ -35,7 +35,6 @@ class Paper(DataClassJsonMixin):
         return self.pdfs[0]
 
 
-@dataclass_json()
 @dataclass
 class Author(DataClassJsonMixin):
     url: str
@@ -43,3 +42,13 @@ class Author(DataClassJsonMixin):
     name: str
     affiliation: Optional[str]
     orcid_id: Optional[str]
+
+
+@dataclass
+class Keyword(DataClassJsonMixin):
+    url: str
+    papers: List[Paper]
+    name: str
+    # NOTE(review): this field shares its name with the schema() classmethod
+    # that the API client calls as Keyword.schema() - confirm they do not clash.
+    schema: str
diff --git a/paperlibrary/library/__init__.py b/paperlibrary/library/__init__.py
new file mode 100644
index 0000000..9a324e5
--- /dev/null
+++ b/paperlibrary/library/__init__.py
@@ -0,0 +1 @@
+from .library import *
diff --git a/paperlibrary/library/library.py b/paperlibrary/library/library.py
new file mode 100644
index 0000000..3cde676
--- /dev/null
+++ b/paperlibrary/library/library.py
@@ -0,0 +1,115 @@
+import hashlib
+import os
+import shutil
+import string
+from datetime import datetime
+from pathlib import Path
+
+from tzlocal import get_localzone
+
+from paperlibrary.api import PaperLibraryAPI
+from paperlibrary.config import basedir
+
+
+def format_filename(s: str) -> str:
+    """Return ``s`` with every character that is unsafe in a filename removed.
+
+    ASCII letters, digits, German umlauts, spaces and ``-_.()`` are kept;
+    all other characters are dropped (not replaced).
+    """
+    additional_letters = "äÄöÖüÜ"
+    # A set gives O(1) membership tests instead of scanning a string per character.
+    valid_chars = set(f"-_.() {string.ascii_letters}{string.digits}{additional_letters}")
+    return "".join(c for c in s if c in valid_chars)
+
+
+def write_symlinks(api: PaperLibraryAPI):
+    """Rebuild ``basedir/by_author`` as a tree of symlinks into ``basedir/pdfs``.
+
+    The directory is wiped first and then gets one sub-directory per author
+    holding one ``<title>.pdf`` link per paper that points at the paper's
+    main PDF.
+    """
+    pdf_dir = basedir / "pdfs"
+    author_dir = basedir / "by_author"
+    shutil.rmtree(author_dir, ignore_errors=True)
+    # exist_ok: rmtree ignores errors, so the directory may still be there.
+    author_dir.mkdir(parents=True, exist_ok=True)
+
+    for author in api.fetch_authors():
+        author_subdir = author_dir / format_filename(author.name)
+        # Two authors can end up with the same name once it is sanitized.
+        author_subdir.mkdir(exist_ok=True)
+        for paper in author.papers:
+            pdf_id = paper.main_pdf.id
+            sourcefile = pdf_dir / f"{pdf_id}.pdf"
+            title = format_filename(paper.title)
+            targetfile = author_subdir / f"{title}.pdf"
+            if os.path.lexists(targetfile):
+                # Sanitized titles can collide; fall back to a name that
+                # the PDF id makes unique.
+                targetfile = author_subdir / f"{title} ({pdf_id}).pdf"
+            if not os.path.lexists(targetfile):
+                targetfile.symlink_to(sourcefile)
+
+
+def download_file(api: PaperLibraryAPI, url: str, target_file: Path):
+    """Download ``url`` through the API's session and store it at ``target_file``.
+
+    The body is streamed into a ``.part`` file next to the target and moved
+    into place only after the transfer has finished, so an interrupted
+    download never leaves a truncated ``target_file`` behind.
+
+    Raises whatever ``raise_for_status()`` raises for an HTTP error response.
+    """
+    partial_file = target_file.with_name(target_file.name + ".part")
+    # stream=True keeps the whole response from being loaded into memory.
+    with api.s.get(url, stream=True) as r:
+        r.raise_for_status()
+        with partial_file.open("wb") as f:
+            for chunk in r.iter_content(chunk_size=65536):
+                f.write(chunk)
+    partial_file.replace(target_file)
+
+
+def hash_file(file: Path, buffer_size=65536) -> str:
+    """Return the hex SHA-256 digest of ``file``, read ``buffer_size`` bytes at a time."""
+    sha256 = hashlib.sha256()
+    with file.open("rb") as f:
+        for data in iter(lambda: f.read(buffer_size), b""):
+            sha256.update(data)
+    return sha256.hexdigest()
+
+
+def update_pdfs(api: PaperLibraryAPI):
+    """Download every PDF listed by the API that is missing in ``basedir/pdfs``.
+
+    A file that already exists but whose SHA-256 differs from the remote hash
+    is not synced yet; it is reported as an error instead.
+
+    Raises:
+        ValueError: if a local file differs from the remote one; the message
+            states which side is newer.
+    """
+    pdf_dir = basedir / "pdfs"
+    pdf_dir.mkdir(parents=True, exist_ok=True)
+
+    for pdf in api.fetch_pdfs():
+        pdf_file = pdf_dir / f"{pdf.id}.pdf"
+        if not pdf_file.exists():
+            download_file(api, pdf.file, pdf_file)
+            continue
+        # NOTE(review): "sha265" looks like a typo for sha256, but it has to
+        # match the attribute name on the PDF model - confirm there.
+        if hash_file(pdf_file) == pdf.sha265:
+            continue
+        modification_date = datetime.fromtimestamp(
+            os.path.getmtime(pdf_file),
+            get_localzone(),
+        )
+        newer = "local" if modification_date > pdf.updated_at else "remote"
+        # TODO: check if file should be uploaded or downloaded
+        raise ValueError(
+            f"{newer} file is newer ({pdf_file}: "
+            f"local {modification_date}, remote {pdf.updated_at})"
+        )