add library code

2024-09-20 17:03:46 +02:00 · 2020-10-19 17:43:08 +02:00 · 2020-10-19 17:43:08 +02:00 · 051dac5411
commit 051dac5411
parent aa495bb0ca
5 changed files with 94 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .idea/
 __pycache__/
 library/
+!paperlibrary/library
 paperlibrary/config.py
--- a/paperlibrary/api/api.py
+++ b/paperlibrary/api/api.py
@@ -2,7 +2,7 @@ from typing import List

 from requests import Session

-from paperlibrary.api.models import Author, PDF
+from paperlibrary.api.models import Author, PDF, Keyword


 class PaperLibraryAPI:
@@ -17,6 +17,10 @@ class PaperLibraryAPI:
        r = self.s.get(self.baseURL + "authors/")
        return Author.schema().loads(r.text, many=True)

+    def fetch_keywords(self) -> List[Keyword]:
+        """Fetch all keywords from the ``keywords/`` endpoint.
+
+        :return: the response body deserialized into ``Keyword`` objects
+        """
+        r = self.s.get(self.baseURL + "keywords/")
+        return Keyword.schema().loads(r.text, many=True)
+
    def fetch_pdfs(self) -> List[PDF]:
        r = self.s.get(self.baseURL + "pdfs/")
        return PDF.schema().loads(r.text, many=True)
--- a/paperlibrary/api/models.py
+++ b/paperlibrary/api/models.py
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Optional, List

-from dataclasses_json import DataClassJsonMixin, dataclass_json, Undefined, config
+from dataclasses_json import DataClassJsonMixin, config
 from marshmallow import fields


@@ -35,7 +35,6 @@ class Paper(DataClassJsonMixin):
        return self.pdfs[0]


-@dataclass_json()
@dataclass
 class Author(DataClassJsonMixin):
    url: str
@@ -43,3 +42,11 @@ class Author(DataClassJsonMixin):
    name: str
    affiliation: Optional[str]
    orcid_id: Optional[str]
+
+
+@dataclass
+class Keyword(DataClassJsonMixin):
+    """Keyword record as deserialized from the API's ``keywords/`` endpoint."""
+    url: str
+    # Papers attached to this keyword, nested as full ``Paper`` objects.
+    papers: List[Paper]
+    name: str
+    # NOTE(review): this field shares its name with the ``schema()``
+    # classmethod that the API client calls on the model classes
+    # (``Keyword.schema().loads(...)``) - confirm the two do not interfere.
+    schema: str
--- a/paperlibrary/library/__init__.py
+++ b/paperlibrary/library/__init__.py
@@ -0,0 +1 @@
+from .library import *
--- a/paperlibrary/library/library.py
+++ b/paperlibrary/library/library.py
@@ -0,0 +1,78 @@
+import hashlib
+import os
+import shutil
+import string
+from datetime import datetime
+from pathlib import Path
+
+from tzlocal import get_localzone
+
+from paperlibrary.api import PaperLibraryAPI
+from paperlibrary.config import basedir
+
+
+def format_filename(s: str) -> str:
+    """Return *s* reduced to the characters allowed in a file name.
+
+    ASCII letters, digits, the German umlauts, spaces and the punctuation
+    ``-_.()`` are kept; every other character is dropped. Spaces are kept
+    as they are.
+    """
+    umlauts = "äÄöÖüÜ"
+    allowed = "-_.() " + string.ascii_letters + string.digits + umlauts
+    kept = [char for char in s if char in allowed]
+    return "".join(kept)
+
+
+def write_symlinks(api: PaperLibraryAPI):
+    """Rebuild the ``by_author`` tree of symlinks below ``basedir``.
+
+    An existing ``by_author`` directory is deleted first. Afterwards one
+    sub-directory per author is created; for each of the author's papers it
+    holds a symlink named after the paper title that points to
+    ``pdfs/<main pdf id>.pdf``.
+
+    :param api: client used to fetch the authors and their papers
+    """
+    pdf_dir = basedir / "pdfs"
+    author_dir = basedir / "by_author"
+    shutil.rmtree(author_dir, ignore_errors=True)
+    # parents=True: basedir itself may not exist yet on a first run.
+    author_dir.mkdir(parents=True)
+
+    for author in api.fetch_authors():
+        author_subdir = author_dir / format_filename(author.name)
+        # Two authors can sanitize to the same directory name; share the
+        # directory instead of failing with FileExistsError.
+        author_subdir.mkdir(exist_ok=True)
+        for paper in author.papers:
+            pdf_id = paper.main_pdf.id
+            sourcefile = pdf_dir / f"{pdf_id}.pdf"
+            title = format_filename(paper.title)
+            targetfile = author_subdir / f"{title}.pdf"
+            # is_symlink() is checked as well because exists() follows the
+            # link and reports False while the PDF is not downloaded yet.
+            if targetfile.is_symlink() or targetfile.exists():
+                # Different titles collapsed to the same file name: make the
+                # link name unique with the PDF id.
+                targetfile = author_subdir / f"{title} ({pdf_id}).pdf"
+            targetfile.symlink_to(sourcefile)
+
+
+def download_file(api: PaperLibraryAPI, url: str, target_file: Path):
+    """Download *url* through the API's session and store it at *target_file*.
+
+    :param api: client whose ``s`` session performs the request
+    :param url: address of the file to fetch
+    :param target_file: path the response body is written to
+    :raises: whatever ``raise_for_status()`` raises for an error response;
+        in that case *target_file* is not opened
+    """
+    # stream=True defers fetching the body, so iter_content() reads it chunk
+    # by chunk instead of iterating over an already fully loaded download.
+    with api.s.get(url, stream=True) as r:
+        r.raise_for_status()
+        with target_file.open("wb") as f:
+            for chunk in r.iter_content(1024):
+                f.write(chunk)
+
+
+def hash_file(file: Path, buffer_size=65536) -> str:
+    """Return the hexadecimal SHA-256 digest of the contents of *file*.
+
+    The file is read in pieces of at most *buffer_size* bytes, so it never
+    has to be held in memory as a whole.
+    """
+    digest = hashlib.sha256()
+    with file.open("rb") as stream:
+        # iter() with a sentinel keeps reading until read() returns b"".
+        for block in iter(lambda: stream.read(buffer_size), b""):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def update_pdfs(api: PaperLibraryAPI):
+    """Download every PDF that is not yet present in ``basedir / "pdfs"``.
+
+    Files that already exist are compared with the checksum reported by the
+    API. Synchronising differing files is not implemented yet, so a mismatch
+    aborts with an error that states which side is newer.
+
+    :param api: client used to list and download the PDFs
+    :raises ValueError: if a local file's hash differs from the remote one
+    """
+    pdf_dir = basedir / "pdfs"
+    # parents=True: basedir itself may not exist yet on a first run.
+    pdf_dir.mkdir(parents=True, exist_ok=True)
+
+    for pdf in api.fetch_pdfs():
+        pdf_file = pdf_dir / f"{pdf.id}.pdf"
+        if not pdf_file.exists():
+            download_file(api, pdf.file, pdf_file)
+            continue
+        if hash_file(pdf_file) == pdf.sha265:
+            continue
+
+        # Local modification time as a timezone-aware datetime.
+        modification_date = datetime.fromtimestamp(
+            os.path.getmtime(pdf_file),
+            get_localzone()
+        )
+        newer = "local" if modification_date > pdf.updated_at else "remote"
+        # TODO: check if file should be uploaded or downloaded
+        raise ValueError(
+            f"{newer} file is newer: {pdf_file} "
+            f"(local {modification_date}, remote {pdf.updated_at})"
+        )