rss2wallabag/main.py
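"""Sync articles from RSS/Atom feeds into a wallabag instance.

Feeds are read from sites.yaml; optionally, the release feeds of a GitHub
user's starred repositories are added as well. New articles are pushed to
wallabag through the WallabagAPI wrapper imported from the local api module.
"""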

import logging
import sys
from dataclasses import dataclass
from datetime import datetime
from time import mktime
from typing import List, Optional, Dict
from urllib.parse import urljoin

import feedparser
import requests
import yaml
from feedparser import FeedParserDict

from api import WallabagAPI

@dataclass
class WallabagConfig:
    host: str
    client_id: str
    client_secret: str
    username: str
    password: str


class Config:
    def __init__(self):
        with open("config.yaml", 'r') as stream:
            data = yaml.safe_load(stream)
        self.wallabag = WallabagConfig(**data["wallabag"])
        if data["github_username"]:
            self.github_username = data["github_username"]
        else:
            self.github_username = None
        self.debug = data["debug"]

    @property
    def production(self):
        return not self.debug

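# The config.yaml read above is expected to provide these keys; a minimal
# sketch (values are placeholders, inferred from the fields accessed in
# Config and WallabagConfig):
#
#   wallabag:
#     host: https://wallabag.example.org
#     client_id: <client id>
#     client_secret: <client secret>
#     username: <user>
#     password: <password>
#   github_username: <user, or empty to skip the GitHub sync>
#   debug: false
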
@dataclass
class Site:
    title: str
    url: str
    github: bool
    tags: List[str]
    latest_article: Optional[str]
    filter: Optional[str]

def load_sites() -> Dict[str, Site]:
    with open("sites.yaml", 'r') as stream:
        data = yaml.safe_load(stream)
    sites: Dict[str, Site] = {}
    for title, entry in data.items():
        if "latest_article" not in entry:
            entry["latest_article"] = None
        if "github" not in entry:
            entry["github"] = None
        if "filter" not in entry:
            entry["filter"] = None

        sites[title] = Site(title, **entry)
    return sites

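# A sites.yaml entry as consumed by load_sites might look like the sketch
# below (inferred from the Site fields: url and tags are required, the
# remaining keys default to None):
#
#   Example Blog:
#     url: https://blog.example.org/feed.xml
#     tags:
#       - blog
#     latest_article: Title of the newest article already synced    # optional
#     filter: only add articles whose title contains this string    # optional
#     github: false                                                 # optional
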
def get_starred_repos(username, sites: Dict[str, Site]):
    r = requests.get("https://api.github.com/users/{user}/starred".format(user=username))
    stars = r.json()
    for repo in stars:
        if repo["full_name"] not in sites:
            sites[repo["full_name"]] = Site(
                url=repo["html_url"] + "/releases.atom",
                tags=["github", repo["name"]],
                github=True,
                title=repo["full_name"],
                latest_article=None,
                # filter has no default on Site, so it must be passed explicitly
                filter=None
            )
    return sites

def main():
    sites = load_sites()
    config = Config()

    logger = logging.getLogger()
    logger.handlers = []
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(stream=sys.stdout)
    ch.setLevel(logging.WARNING if config.production else logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler('debug.log')
    fh.setFormatter(formatter)
    fh.setLevel(logging.WARNING if config.production else logging.DEBUG)
    logger.addHandler(fh)

    wallabag_config = config.wallabag
    api = WallabagAPI(host=wallabag_config.host)
    api.auth(client_secret=wallabag_config.client_secret, client_id=wallabag_config.client_id,
             username=wallabag_config.username, password=wallabag_config.password)

    if config.github_username:
        sites = get_starred_repos(config.github_username, sites)

    new_sites: Dict[str, Dict] = {}
    for title, site in sites.items():
        new_site = handle_feed(api, site, logger, config)
        new_sites[title] = new_site.__dict__
        del new_sites[title]["title"]

    if config.production:
        with open("sites.yaml", 'w') as stream:
            yaml.dump(new_sites, stream, default_flow_style=False)

def handle_feed(api: WallabagAPI, site: Site, logger: logging.Logger, config: Config) -> Site:
    logger.info("Downloading feed: " + site.title)
    r = api.s.get(site.url)
    # a missing feed (404) is tolerated; any other HTTP error is raised
    if r.status_code != 404:
        r.raise_for_status()
    rss = r.text

    logger.info("Parsing feed: " + site.title)
    f = feedparser.parse(rss)
    logger.debug("finished parsing: " + site.title)

    articles: List[FeedParserDict] = f.entries
    for article in articles:
        # stop once we reach the article recorded from the previous run
        if article.title == site.latest_article:
            logger.debug("already added: " + article.title)
            break
        if site.filter and site.filter not in article.title:
            logger.debug("article filtered: " + article.title)
            continue

        logger.info("article found: " + article.title)
        taglist = [site.title]
        if site.tags:
            taglist.extend(site.tags)
        if "published_parsed" in article:
            published = datetime.fromtimestamp(mktime(article.published_parsed))
        elif "updated_parsed" in article:
            published = datetime.fromtimestamp(mktime(article.updated_parsed))
        else:
            published = None
        logger.info("add to wallabag: " + article.title)
        if site.github:
            title = site.title + ": " + article.title
        else:
            title = article.title
        if not hasattr(article, 'link'):
            logger.info("no link, skipping!")
            continue
        url = urljoin(site.url, article.link)
        if api.check_exist(url):
            logger.info("already found in wallabag: " + article.title)
            continue

        if config.production:
            api.add_entry(url=url, title=title, tags=taglist, published=published)
        else:
            logger.info("warning: running in debug mode - not adding links to wallabag")

    # remember the newest article so the next run can stop there
    if articles:
        site.latest_article = articles[0].title
    return site

if __name__ == '__main__':
    main()
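
# Typical usage (an assumption, not stated in this file): fill in config.yaml
# and sites.yaml next to this script and run it periodically, e.g. from cron:
#
#   python main.py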