1
0
Fork 0
mirror of https://github.com/Findus23/PaperLibrary-cli.git synced 2024-09-20 17:03:46 +02:00
PaperLibrary-cli/paperlibrary/library/library.py

210 lines
7 KiB
Python
Raw Normal View History

2020-10-19 17:43:08 +02:00
import hashlib
import os
import shutil
2023-06-27 22:38:59 +02:00
from datetime import datetime, timezone
2020-10-19 17:43:08 +02:00
from pathlib import Path
2020-11-01 13:48:40 +01:00
from alive_progress import alive_bar
2020-10-19 17:43:08 +02:00
from tzlocal import get_localzone
from paperlibrary.api import PaperLibraryAPI
2020-11-08 22:25:56 +01:00
from paperlibrary.api.models import Paper
2020-12-29 17:16:16 +01:00
from paperlibrary.config import Config
2020-10-19 17:43:08 +02:00
def format_filename(s: str) -> str:
    """Return *s* with every ``/`` removed so it can serve as a file name.

    All other characters, including spaces, are kept as they are.

    Raises:
        ValueError: if no characters remain after the removal.
    """
    forbidden = "/"
    # Mapping a code point to None makes str.translate drop that character.
    cleaned = s.translate({ord(ch): None for ch in forbidden})
    if cleaned:
        return cleaned
    raise ValueError("empty filename")
2020-11-08 22:25:56 +01:00
def link_file(pdf_dir: Path, directory: Path, paper: Paper, filename: str = None) -> None:
    """Create the symlinks for *paper* inside *directory*.

    Three links sharing one base name are created:

    * ``<name>.pdf``  -> ``<pdf_dir>/<paper.main_pdf.id>.pdf``
    * ``<name>.md``   -> ``<pdf_dir>/../notes/<paper.id>.md``
    * ``<name>.json`` -> ``<pdf_dir>/../meta/<paper.id>.json``

    ``<name>`` is *filename* (the paper title when *filename* is None) passed
    through ``format_filename``. Papers without a main PDF are skipped
    entirely. The link targets are not checked for existence.
    """
    if not paper.main_pdf:
        return

    name = paper.title if filename is None else filename
    library_root = pdf_dir.parent

    pdf_link = directory / f"{format_filename(name)}.pdf"
    pdf_link.symlink_to(pdf_dir / f"{paper.main_pdf.id}.pdf")

    # Notes and metadata get sibling links that differ only in their suffix.
    sidecars = (
        (".md", library_root / "notes" / f"{paper.id}.md"),
        (".json", library_root / "meta" / f"{paper.id}.json"),
    )
    for suffix, source in sidecars:
        pdf_link.with_suffix(suffix).symlink_to(source)
2020-11-08 22:25:56 +01:00
2020-12-29 17:16:16 +01:00
def write_symlinks(api: PaperLibraryAPI, config: Config):
    """Rebuild every symlink index directory below ``config.basedir_path``.

    The ``by_*`` directories are deleted and recreated on each call, then
    filled via ``link_file`` with links grouped by author, keyword, title,
    tag, custom title, citation key, citename and year. Finally a
    bibliography is written for every tag that occurred on any paper.

    The ``pdfs`` directory is created if missing but never cleared.
    """
    basedir = config.basedir_path

    pdf_dir = basedir / "pdfs"
    pdf_dir.mkdir(exist_ok=True)

    author_dir = basedir / "by_author"
    keyword_dir = basedir / "by_keyword"
    year_dir = basedir / "by_year"
    tags_dir = basedir / "by_tags"
    title_dir = basedir / "by_title"
    custom_title_dir = basedir / "by_custom_title"
    citation_key_dir = basedir / "by_citation_key"
    citename_dir = basedir / "by_citename"

    tags = set()

    # Start from empty index directories so stale links never survive a run.
    for directory in [author_dir, keyword_dir, year_dir, title_dir, tags_dir, custom_title_dir, citation_key_dir,
                      citename_dir]:
        shutil.rmtree(directory, ignore_errors=True)
        directory.mkdir()

    for author in api.fetch_authors():
        if not author.papers:
            continue
        author_subdir = author_dir / format_filename(author.display_name)
        # exist_ok mirrors the keyword loop below: two authors may end up with
        # the same sanitised directory name, which must not abort the rebuild.
        author_subdir.mkdir(exist_ok=True)
        for paper in author.papers:
            link_file(pdf_dir, author_subdir, paper)

    for keyword in api.fetch_keywords():
        if not keyword.papers:
            continue
        keyword_subdir = keyword_dir / format_filename(keyword.name)
        keyword_subdir.mkdir(exist_ok=True)
        for paper in keyword.papers:
            link_file(pdf_dir, keyword_subdir, paper)

    for paper in api.fetch_papers():
        link_file(pdf_dir, title_dir, paper, paper.title)

        for tag in paper.tags:
            # The tag is used as a path as-is (not sanitised); parents=True
            # lets a tag containing "/" map to nested directories.
            tag_dir = tags_dir / tag
            tag_dir.mkdir(exist_ok=True, parents=True)
            link_file(pdf_dir, tag_dir, paper, paper.title)
            tags.add(tag)

        if paper.custom_title:
            link_file(pdf_dir, custom_title_dir, paper, paper.custom_title)

        if paper.citation_key:
            link_file(pdf_dir, citation_key_dir, paper, paper.citation_key)
        if paper.citename:
            link_file(pdf_dir, citename_dir, paper, paper.citename)

    for year, papers in api.fetch_papers_by_year().items():
        year_subdir = year_dir / str(year)
        year_subdir.mkdir()
        for paper in papers:
            link_file(pdf_dir, year_subdir, paper)

    # The tag directories were created above, so each bibliography has a home.
    for tag in tags:
        write_bibliography(api, config, tag)
2020-10-19 17:43:08 +02:00
def download_file(api: PaperLibraryAPI, url: str, target_file: Path):
    """Download *url* via ``api.s`` and store the body at *target_file*.

    The body is first written to ``<target_file name>.part`` in the same
    directory and only moved over *target_file* once it is complete, so an
    interrupted download never leaves a truncated file under the final name.
    Downloads whose ``Content-Length`` exceeds 30 MiB show a progress bar.

    Raises whatever ``raise_for_status()`` raises for an HTTP error status;
    in that case no file is touched.
    """
    print("downloading", url, end=" ", flush=True)
    # Same directory as the target, so the final replace() is a plain rename.
    partial_file = target_file.with_name(target_file.name + ".part")
    # NOTE(review): assumes api.s is a requests.Session (the response is used
    # via iter_content/raise_for_status). stream=True defers fetching the body
    # so the progress bar reflects the actual transfer.
    with api.s.get(url, stream=True) as r:
        r.raise_for_status()
        # The header may be missing (e.g. chunked responses); treat the size
        # as unknown/small instead of failing with a KeyError.
        file_size = int(r.headers.get("Content-Length", 0))
        try:
            with partial_file.open("wb") as f:
                if file_size > 30 * 1024 * 1024:
                    with alive_bar(file_size) as bar:
                        for chunk in r.iter_content(1024):
                            if not chunk:
                                continue
                            f.write(chunk)
                            # Count real bytes: the last chunk is usually short.
                            bar(len(chunk))
                else:
                    f.write(r.content)
            partial_file.replace(target_file)
        except BaseException:
            # Do not leave half-written data behind, then re-raise unchanged.
            if partial_file.exists():
                partial_file.unlink()
            raise
    print("done")
2020-10-19 17:43:08 +02:00
def hash_file(file: Path, buffer_size=65536) -> str:
    """Return the hexadecimal SHA-256 digest of the contents of *file*.

    The file is read in pieces of at most *buffer_size* bytes, so it never
    has to be held in memory as a whole.
    """
    digest = hashlib.sha256()
    with file.open("rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for block in iter(lambda: stream.read(buffer_size), b""):
            digest.update(block)
    return digest.hexdigest()
2020-10-19 17:43:08 +02:00
2020-12-29 17:16:16 +01:00
def update_pdfs(api: PaperLibraryAPI, config: Config):
    """Bring the local ``pdfs`` directory in line with the server.

    For every PDF reported by ``api.fetch_pdfs()``:

    * no local copy yet      -> download it;
    * SHA-256 matches        -> nothing to do;
    * hashes differ          -> the side with the newer timestamp wins: the
      local file is uploaded if its mtime is later than ``pdf.updated_at``,
      otherwise the remote version is downloaded over it.
    """
    pdf_dir = config.basedir_path / "pdfs"
    pdf_dir.mkdir(exist_ok=True)

    for pdf in api.fetch_pdfs():
        local_copy = pdf_dir / f"{pdf.id}.pdf"

        if not local_copy.exists():
            download_file(api, pdf.file, local_copy)
            continue

        if hash_file(local_copy) == pdf.sha256:
            continue

        # Attach the local zone to the mtime so it is timezone-aware.
        # NOTE(review): presumably pdf.updated_at is timezone-aware too - confirm.
        local_mtime = datetime.fromtimestamp(os.path.getmtime(local_copy), get_localzone())
        if local_mtime > pdf.updated_at:
            print("local file is newer")
            api.upload_pdf(pdf, local_copy)
            continue

        print("remote file is newer")
        download_file(api, pdf.file, local_copy)
2020-11-08 22:25:56 +01:00
2023-06-27 22:38:59 +02:00
def update_meta(api: PaperLibraryAPI, config: Config):
    """Write one read-only ``meta/<paper.id>.json`` file per paper.

    Each file holds ``paper.to_json(indent=2, ensure_ascii=False)`` and is
    rewritten on every call. Files are stored as UTF-8 and left with mode
    0o444 afterwards.
    """
    meta_dir = config.basedir_path / "meta"
    meta_dir.mkdir(exist_ok=True)
    for paper in api.fetch_papers():
        meta_file = meta_dir / f"{paper.id}.json"
        if meta_file.exists():
            # A previous run left the file read-only; unlock it before rewriting.
            meta_file.chmod(0o644)
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be fixed to UTF-8 instead of following the locale default.
        with meta_file.open("w", encoding="utf-8") as f:
            f.write(paper.to_json(indent=2, ensure_ascii=False))
        meta_file.chmod(0o444)
2023-06-27 22:38:59 +02:00
def update_notes(api: PaperLibraryAPI, config: Config):
    """Synchronise the Markdown notes in ``notes/<paper.id>.md`` with the server.

    For every paper from ``api.fetch_papers()``:

    * no local file        -> create it from the server text (empty if none);
    * texts are equal      -> nothing to do;
    * server has no note timestamp -> create the note online from the local text;
    * otherwise the newer side wins, comparing the server timestamp with the
      local file's mtime.

    Note files are read and written as UTF-8.
    """
    notes_dir = config.basedir_path / "notes"
    notes_dir.mkdir(exist_ok=True)
    for paper in api.fetch_papers():
        # A missing server-side note is treated like an empty one.
        remote_text = "" if paper.notes_md is None else paper.notes_md
        notes_file = notes_dir / f"{paper.id}.md"
        if not notes_file.exists():
            notes_file.write_text(remote_text, encoding="utf-8")
            continue
        file_text = notes_file.read_text(encoding="utf-8")
        if file_text == remote_text:
            continue
        print(repr(file_text), repr(remote_text))
        if paper.notes_updated_at is None:
            api.create_note(paper.id, file_text)
            continue
        # NOTE(review): assumes notes_updated_at is an ISO-8601 string carrying
        # a UTC offset; a naive value would make the comparison below raise
        # TypeError - confirm against the API.
        online_change_date = datetime.fromisoformat(paper.notes_updated_at)
        local_change_date = datetime.fromtimestamp(notes_file.stat().st_mtime, tz=timezone.utc)
        print(online_change_date, local_change_date)
        print(local_change_date - online_change_date)
        if online_change_date > local_change_date:
            print("fetching from online")
            notes_file.write_text(remote_text, encoding="utf-8")
            continue
        print("updating online")
        api.update_note(paper.id, file_text)
2023-08-08 11:04:42 +02:00
def write_bibliography(api: PaperLibraryAPI, config: Config, tag: str = None):
    """Fetch a bibliography and store it as a read-only ``bibliography.bib``.

    With a non-empty *tag* the file is written to ``by_tags/<tag>/`` below the
    base directory, otherwise directly into the base directory. *tag* is
    passed on unchanged to ``api.fetch_bibliography``. The destination
    directory is expected to exist already.

    The file is stored as UTF-8 and left with mode 0o444 afterwards.
    """
    if tag:
        target_dir = config.basedir_path / "by_tags" / tag
    else:
        target_dir = config.basedir_path
    bib = api.fetch_bibliography(tag)
    target_file = target_dir / "bibliography.bib"
    if target_file.exists():
        # A previous run left the file read-only; unlock it before rewriting.
        target_file.chmod(0o644)
    # Fixed encoding: entries may contain non-ASCII characters, and the
    # locale default is not UTF-8 everywhere.
    with target_file.open("w", encoding="utf-8") as f:
        f.write(bib)
    target_file.chmod(0o444)