diff --git a/paperlibrary/api/api.py b/paperlibrary/api/api.py index e5c49b8..ccda532 100644 --- a/paperlibrary/api/api.py +++ b/paperlibrary/api/api.py @@ -1,8 +1,9 @@ -from typing import List +from pathlib import Path +from typing import List, Dict from requests import Session -from paperlibrary.api.models import Author, PDF, Keyword +from paperlibrary.api.models import Author, PDF, Keyword, PaperComplete class PaperLibraryAPI: @@ -13,14 +14,35 @@ class PaperLibraryAPI: self.s = Session() self.s.headers.update({"Authorization": "Token " + auth_token}) + def fetch_papers(self) -> List[PaperComplete]: + r = self.s.get(self.baseURL + "papers/") + return PaperComplete.schema().loads(r.text, many=True) + def fetch_authors(self) -> List[Author]: r = self.s.get(self.baseURL + "authors/") return Author.schema().loads(r.text, many=True) - def fetch_keywords(self) -> List[Author]: + def fetch_keywords(self) -> List[Keyword]: r = self.s.get(self.baseURL + "keywords/") return Keyword.schema().loads(r.text, many=True) + def fetch_papers_by_year(self) -> Dict[int, List[PaperComplete]]: + papers = self.fetch_papers() + years: Dict[int, List[PaperComplete]] = {} + for paper in papers: + if paper.year in years: + years[paper.year].append(paper) + else: + years[paper.year] = [paper] + return years + def fetch_pdfs(self) -> List[PDF]: r = self.s.get(self.baseURL + "pdfs/") return PDF.schema().loads(r.text, many=True) + + def upload_pdf(self, pdf, file: Path) -> PDF: + with file.open("rb") as f: + r = self.s.put(pdf.url, files={ + "file": f, + }) + return PDF.schema().loads(r.text) diff --git a/paperlibrary/api/models.py b/paperlibrary/api/models.py index 48facf5..023d3c3 100644 --- a/paperlibrary/api/models.py +++ b/paperlibrary/api/models.py @@ -6,13 +6,23 @@ from dataclasses_json import DataClassJsonMixin, config from marshmallow import fields +@dataclass +class Note(DataClassJsonMixin): + paper: int + recommended_by: List[str] + custom_title: str + notes_md: str + notes_html: str + + @dataclass class PDF(DataClassJsonMixin): id: int + url: str file: str - sha265: str + sha256: str type: str - preview: str + preview: Optional[str] updated_at: datetime = field( metadata=config( encoder=datetime.isoformat, @@ -24,29 +34,52 @@ class PDF(DataClassJsonMixin): @dataclass class Paper(DataClassJsonMixin): - id: int + # id: int url: str title: str pdfs: List[PDF] - doi: str + doi: Optional[str] + note: Optional[Note] @property - def main_pdf(self) -> PDF: + def main_pdf(self) -> Optional[PDF]: + if not self.pdfs: + return None return self.pdfs[0] +@dataclass +class PaperComplete(Paper): + keywords: List[str] + authors: List[str] + first_author: str + publication: str + doctype: str + arxiv_id: str + bibcode: str + year: int + pubdate: str # TODO: to datetime + entry_date: str # TODO: to datetime + citation_count: int + + @dataclass class Author(DataClassJsonMixin): url: str papers: List[Paper] name: str + pretty_name: Optional[str] affiliation: Optional[str] orcid_id: Optional[str] + @property + def display_name(self): + return self.pretty_name if self.pretty_name else self.name + @dataclass class Keyword(DataClassJsonMixin): url: str papers: List[Paper] name: str - schema: str + kw_schema: str diff --git a/paperlibrary/library/library.py b/paperlibrary/library/library.py index 3cde676..8932938 100644 --- a/paperlibrary/library/library.py +++ b/paperlibrary/library/library.py @@ -1,10 +1,10 @@ import hashlib import os import shutil -import string from datetime import datetime from pathlib import Path +from alive_progress import alive_bar from tzlocal import get_localzone from paperlibrary.api import PaperLibraryAPI @@ -12,46 +12,94 @@ from paperlibrary.config import basedir def format_filename(s: str) -> str: - additional_letters = ["ä", "Ä", "ö", "Ö", "ü", "Ü"] - valid_chars = f"-_.() {string.ascii_letters}{string.digits}{''.join(additional_letters)}" - filename = ''.join(c for c in s if c in valid_chars) + invalid_chars = {"/"} + filename = ''.join(c for c in s if c not in invalid_chars) # filename = filename.replace(' ', '_') # I don't like spaces in filenames. + if not filename: + raise ValueError("empty filename") return filename def write_symlinks(api: PaperLibraryAPI): ... pdf_dir = basedir / "pdfs" + pdf_dir.mkdir(exist_ok=True) + author_dir = basedir / "by_author" - shutil.rmtree(author_dir, ignore_errors=True) - author_dir.mkdir() + keyword_dir = basedir / "by_keyword" + year_dir = basedir / "by_year" + title_dir = basedir / "by_title" + custom_title_dir = basedir / "by_custom_title" + + for directory in [author_dir, keyword_dir, year_dir, title_dir, custom_title_dir]: + shutil.rmtree(directory, ignore_errors=True) + directory.mkdir() for author in api.fetch_authors(): - author_subdir = author_dir / format_filename(author.name) + author_subdir = author_dir / format_filename(author.display_name) author_subdir.mkdir() for paper in author.papers: + if not paper.main_pdf: + continue sourcefile = pdf_dir / f"{paper.main_pdf.id}.pdf" targetfile = author_subdir / "{}.pdf".format(format_filename(paper.title)) targetfile.symlink_to(sourcefile) + for keyword in api.fetch_keywords(): + keyword_subdir = keyword_dir / format_filename(keyword.name) + keyword_subdir.mkdir() + for paper in keyword.papers: + if not paper.main_pdf: + continue + sourcefile = pdf_dir / f"{paper.main_pdf.id}.pdf" + targetfile = keyword_subdir / "{}.pdf".format(format_filename(paper.title)) + targetfile.symlink_to(sourcefile) + + for paper in api.fetch_papers(): + if not paper.main_pdf: + continue + + sourcefile = pdf_dir / f"{paper.main_pdf.id}.pdf" + targetfile = title_dir / "{}.pdf".format(format_filename(paper.title)) + targetfile.symlink_to(sourcefile) + + if not paper.note: + continue + sourcefile = pdf_dir / f"{paper.main_pdf.id}.pdf" + targetfile = custom_title_dir / "{}.pdf".format(format_filename(paper.note.custom_title)) + targetfile.symlink_to(sourcefile) + + for year, papers in api.fetch_papers_by_year().items(): + year_subdir = year_dir / str(year) + year_subdir.mkdir() + for paper in papers: + if not paper.main_pdf: + continue + sourcefile = pdf_dir / f"{paper.main_pdf.id}.pdf" + targetfile = year_subdir / "{}.pdf".format(format_filename(paper.title)) + targetfile.symlink_to(sourcefile) + def download_file(api: PaperLibraryAPI, url: str, target_file: Path): r = api.s.get(url) r.raise_for_status() - with target_file.open("wb") as f: - for chunk in r.iter_content(1024): - f.write(chunk) + with alive_bar(int(r.headers["Content-Length"])) as bar: + with target_file.open("wb") as f: + for chunk in r.iter_content(1024): + for _ in range(1024): + bar() + f.write(chunk) def hash_file(file: Path, buffer_size=65536) -> str: - sha265 = hashlib.sha256() + sha256 = hashlib.sha256() with file.open("rb") as f: while True: data = f.read(buffer_size) if not data: break - sha265.update(data) - return sha265.hexdigest() + sha256.update(data) + return sha256.hexdigest() def update_pdfs(api: PaperLibraryAPI): @@ -63,16 +111,14 @@ def update_pdfs(api: PaperLibraryAPI): if not pdf_file.exists(): download_file(api, pdf.file, pdf_file) continue - if hash_file(pdf_file) != pdf.sha265: + if hash_file(pdf_file) != pdf.sha256: modification_date = datetime.fromtimestamp( os.path.getmtime(pdf_file), get_localzone() ) - print(modification_date) - print(pdf.updated_at) - # print(modification_date - pdf.updated_at) if modification_date > pdf.updated_at: - raise ValueError("local file is newer") + print("local file is newer") + api.upload_pdf(pdf, pdf_file) else: - raise ValueError("remote file is newer") - # TODO: check if file should be uploaded or downloaded + print("remote file is newer") + download_file(api, pdf.file, pdf_file) diff --git a/paperlibrary/pap.py b/paperlibrary/pap.py index bc344fa..c1b4326 100644 --- a/paperlibrary/pap.py +++ b/paperlibrary/pap.py @@ -9,12 +9,20 @@ from paperlibrary.library import write_symlinks, update_pdfs def cli(): pass + @cli.command() def update(): - api=PaperLibraryAPI(url,auth_token=auth_token) + api = PaperLibraryAPI(url, auth_token=auth_token) write_symlinks(api) update_pdfs(api) +@cli.command() +def test(): + api = PaperLibraryAPI(url, auth_token=auth_token) + + print(api.fetch_papers()) + + if __name__ == '__main__': cli() diff --git a/poetry.lock b/poetry.lock index efdea81..36e2193 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,3 +1,11 @@ +[[package]] +name = "alive-progress" +version = "1.6.1" +description = "A new kind of Progress Bar, with real-time throughput, eta and very cool animations!" +category = "main" +optional = false +python-versions = ">=2.7, <4" + [[package]] name = "certifi" version = "2020.6.20" @@ -161,9 +169,13 @@ socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"] [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "55c1b4123bca6c85380c21f1e6f5faa129bb1171aa2c9a45497e5916e8c4ac2e" +content-hash = "8bb7dbfcc0d218d22ff3dcafae140daff4799bfeb988731a3759412a256d0e29" [metadata.files] +alive-progress = [ + {file = "alive-progress-1.6.1.tar.gz", hash = "sha256:2a0d7516ec0f596d5ce53755c0913a909eb1c91854e1d782e511ef5e1dd53218"}, + {file = "alive_progress-1.6.1-py3-none-any.whl", hash = "sha256:9a0fae6b94fb4e4bcd9fb51760506d29a33358ebbfef2c6516dce3e359a661b5"}, +] certifi = [ {file = "certifi-2020.6.20-py2.py3-none-any.whl", hash = "sha256:8fc0819f1f30ba15bdb34cceffb9ef04d99f420f68eb75d901e9560b8749fc41"}, {file = "certifi-2020.6.20.tar.gz", hash = "sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3"}, diff --git a/pyproject.toml b/pyproject.toml index e64cd18..5eef815 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ requests = "^2.24.0" click = "^7.1.2" dataclasses-json = "^0.5.2" tzlocal = "^2.1" +alive-progress = "^1.6.1" [build-system] requires = ["poetry-core>=1.0.0"]