This allows me to add only articles whose title contains a certain string.
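For illustration, a sites.yaml entry using the new filter key could look like this (the site name, URL, and filter string are made up; the keys match the Site dataclass below):

    Example Blog:
      url: https://example.com/feed.xml
      tags:
        - blog
      filter: Weekly Roundup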
import calendar
import logging
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urljoin

import feedparser
import requests
import yaml
from feedparser import FeedParserDict

from api import WallabagAPI


@dataclass
class WallabagConfig:
    # OAuth client credentials and user login for the Wallabag API.
    host: str
    client_id: str
    client_secret: str
    username: str
    password: str


class Config:
    def __init__(self):
        with open("config.yaml", 'r') as stream:
            data = yaml.safe_load(stream)
        self.wallabag = WallabagConfig(**data["wallabag"])
        if data["github_username"]:
            self.github_username = data["github_username"]
        else:
            self.github_username = None
        self.debug = data["debug"]

    @property
    def production(self):
        return not self.debug
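# For reference, config.yaml is expected to look roughly like this (keys taken
# from WallabagConfig and Config above; all values are placeholders):
#
#   wallabag:
#     host: https://wallabag.example.com
#     client_id: your-client-id
#     client_secret: your-client-secret
#     username: your-username
#     password: your-password
#   github_username: your-github-user   # leave empty to skip starred repos
#   debug: true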


@dataclass
class Site:
    title: str
    url: str
    github: bool
    tags: List[str]
    latest_article: Optional[str]
    filter: Optional[str]  # only add articles whose title contains this string


def load_sites() -> Dict[str, Site]:
    with open("sites.yaml", 'r') as stream:
        data = yaml.safe_load(stream)
    sites: Dict[str, Site] = {}
    for title, entry in data.items():
        # Older entries may lack the optional keys, so default them before
        # constructing the dataclass.
        if "latest_article" not in entry:
            entry["latest_article"] = None
        if "github" not in entry:
            entry["github"] = None
        if "filter" not in entry:
            entry["filter"] = None
        sites[title] = Site(title, **entry)
    return sites


def get_starred_repos(username, sites: Dict[str, Site]):
    # Add each starred repo's releases feed as a site, unless it is already tracked.
    r = requests.get(f"https://api.github.com/users/{username}/starred")
    r.raise_for_status()
    stars = r.json()
    for repo in stars:
        if repo["full_name"] not in sites:
            sites[repo["full_name"]] = Site(
                url=repo["html_url"] + "/releases.atom",
                tags=["github", repo["name"]],
                github=True,
                title=repo["full_name"],
                latest_article=None,
                filter=None,  # Site has no field defaults, so filter must be passed too
            )
    return sites
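# Note: the GitHub starred endpoint is paginated (30 repos per page by
# default); get_starred_repos() only fetches the first page.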


def main():
    sites = load_sites()
    config = Config()

    # Log to stdout and to debug.log; both only show warnings in production.
    logger = logging.getLogger()
    logger.handlers = []
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)

    ch = logging.StreamHandler(stream=sys.stdout)
    ch.setLevel(logging.WARNING if config.production else logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    fh = logging.FileHandler('debug.log')
    fh.setFormatter(formatter)
    fh.setLevel(logging.WARNING if config.production else logging.DEBUG)
    logger.addHandler(fh)

    wallabag_config = config.wallabag
    api = WallabagAPI(host=wallabag_config.host)
    api.auth(client_secret=wallabag_config.client_secret, client_id=wallabag_config.client_id,
             username=wallabag_config.username, password=wallabag_config.password)

    if config.github_username:
        sites = get_starred_repos(config.github_username, sites)

    new_sites: Dict[str, Dict] = {}
    for title, site in sites.items():
        new_site = handle_feed(api, site, logger, config)
        # Drop the title from the serialized entry; it doubles as the YAML key.
        entry = asdict(new_site)
        del entry["title"]
        new_sites[title] = entry
    # Write the state back so the next run skips everything already added.
    if config.production:
        with open("sites.yaml", 'w') as stream:
            yaml.dump(new_sites, stream, default_flow_style=False)


def handle_feed(api: WallabagAPI, site: Site, logger: logging.Logger, config: Config) -> Site:
    logger.info("Downloading feed: " + site.title)
    r = api.s.get(site.url)
    # Tolerate a 404 (the error page simply parses to zero entries below);
    # anything else is raised.
    if r.status_code != 404:
        r.raise_for_status()
    rss = r.text
    logger.info("Parsing feed: " + site.title)
    f = feedparser.parse(rss)
    logger.debug("finished parsing: " + site.title)

    articles: List[FeedParserDict] = f.entries
    for article in articles:
        # Entries come newest first; stop at the last article seen on a previous run.
        if article.title == site.latest_article:
            logger.debug("already added: " + article.title)
            break
        if site.filter and site.filter not in article.title:
            logger.debug("article filtered: " + article.title)
            continue
        logger.info("article found: " + article.title)
        taglist = [site.title]
        if site.tags:
            taglist.extend(site.tags)
        # feedparser's *_parsed fields are struct_times in UTC; calendar.timegm
        # reads them as UTC, where time.mktime would wrongly assume local time.
        if "published_parsed" in article:
            published = datetime.fromtimestamp(calendar.timegm(article.published_parsed))
        elif "updated_parsed" in article:
            published = datetime.fromtimestamp(calendar.timegm(article.updated_parsed))
        else:
            published = None
        logger.info("add to wallabag: " + article.title)
        if site.github:
            # Prefix release entries with the repo name so they are identifiable in Wallabag.
            title = site.title + ": " + article.title
        else:
            title = article.title
        if not hasattr(article, 'link'):
            logger.info("no link, skipping!")
            continue
        url = urljoin(site.url, article.link)
        if api.check_exist(url):
            logger.info("already found in wallabag: " + article.title)
            continue
        if config.production:
            api.add_entry(url=url, title=title, tags=taglist, published=published)
        else:
            logger.info("warning: running in debug mode - not adding links to wallabag")
    if articles:
        # Remember the newest entry so the next run stops there.
        site.latest_article = articles[0].title

    return site


if __name__ == '__main__':
    main()