
improved importer

Lukas Winkler 2018-03-16 22:52:34 +01:00
parent bb3b691caa
commit 8edc3420cc
3 changed files with 51 additions and 5 deletions


@@ -2,14 +2,45 @@ import glob
 import os
 import shutil
 import subprocess
+from datetime import datetime
 
+from models import *
 from parsexml import parse_posts, parse_comments, parse_usernames
 from utils import *
+files = get_files()
 # os.chdir("/mydir")
 for file in glob.glob("downloads/**/*.7z"):
-    if not "raspberry" in file:
+    if "meta" in file:
         continue
-    code = os.path.basename(os.path.splitext(file)[0])
+    filename = os.path.basename(file)
+    code = os.path.splitext(filename)[0]
+    if filename not in files:
+        print("{file} doesn't exist on archive.org".format(file=file))
+        continue
+    meta = files[filename]
+    sha1 = file_hash(file)
+    if sha1 != meta["sha1"]:
+        print("{file}: hashes don't match".format(file=filename))
+        continue
+    alias = Alias.select().where(Alias.url == code)
+    if len(alias) != 0:
+        site_id = alias[0].site_id
+        print(site_id)
+        site = Site.select().where(Site.id == site_id)[0]
+    else:
+        db_element = Site().select().where(Site.url == code)
+        if len(db_element) == 0:
+            print("{site} not found in database".format(site=code))
+            continue
+        site = db_element[0]
+    mtime = datetime.fromtimestamp(int(meta["mtime"]))
+    if site.last_download == mtime:
+        print("{site} is up to date".format(site=filename))
+        continue
+    else:
+        site.last_download = mtime
+        site.save()
     print(code)
     currentdir = os.getcwd()
     rawdir = "raw/" + code

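The core of this change is the skip logic above: a dump is only imported when it exists on archive.org, its SHA-1 digest matches the item metadata, and its upload time differs from the site's last recorded download. Below is a minimal standalone sketch of that decision, reusing the get_files() and file_hash() helpers this commit adds to utils; needs_import() is a hypothetical name, and the Site model and the surrounding importer loop are assumed:

    import os
    from datetime import datetime

    from utils import file_hash, get_files

    files = get_files()  # archive.org metadata, keyed by file name

    def needs_import(path, site):  # hypothetical helper, not part of the commit
        """Return True if the local .7z dump should be (re)imported."""
        filename = os.path.basename(path)
        meta = files.get(filename)
        if meta is None:
            return False  # not part of the archive.org item
        if file_hash(path) != meta["sha1"]:
            return False  # corrupt or incomplete download
        mtime = datetime.fromtimestamp(int(meta["mtime"]))
        if site.last_download == mtime:
            return False  # this upload was already imported
        site.last_download = mtime
        site.save()  # record the upload we are about to import
        return True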

@ -1,11 +1,11 @@
from internetarchive import get_item, download from internetarchive import get_item, download
from models import * from models import *
from utils import *
ignored_se_sites = ["cs50.stackexchange.com"] ignored_se_sites = ["cs50.stackexchange.com"]
ia = get_item("stackexchange") files = get_files()
files = {x["name"]: x for x in ia.files}
for site in Site.select()[1:]: for site in Site.select()[1:]:
if site.url in ignored_se_sites: if site.url in ignored_se_sites:
continue continue
@ -27,5 +27,4 @@ for site in Site.select()[1:]:
if sizeMB < 50: if sizeMB < 50:
print(file) print(file)
print(sizeMB) print(sizeMB)
download("stackexchange", files=file["name"], verbose=True)
exit() exit()

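With get_files() moved into utils, the downloader and the importer now share a single archive.org metadata lookup instead of each building the name-to-metadata dict from get_item("stackexchange") themselves. A short usage sketch; the file name is made up, and only the "sha1" and "mtime" fields are confirmed by the diffs above:

    from utils import get_files

    files = get_files()  # one metadata dict shared by importer and downloader
    meta = files.get("writers.stackexchange.com.7z")  # hypothetical file name
    if meta is not None:
        print(meta["sha1"], meta["mtime"])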

@@ -1,8 +1,10 @@
+import hashlib
 import sys
 import resource
 
 from bs4 import BeautifulSoup
+from internetarchive import get_item
 
 
 def print_stats(i, skipped=None):
@@ -23,6 +25,20 @@ def html2text(body):
     return soup.get_text()
 
 
+def get_files():
+    ia = get_item("stackexchange")
+    return {x["name"]: x for x in ia.files}
+
+
+def file_hash(filename):
+    """from https://stackoverflow.com/a/44873382/4398037"""
+    h = hashlib.sha1()
+    with open(filename, 'rb', buffering=0) as f:
+        for b in iter(lambda: f.read(128 * 1024), b''):
+            h.update(b)
+    return h.hexdigest()
+
+
 def get_settings(count):
     if len(sys.argv) != count + 1:
         if count == 1:
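file_hash() reads the file in 128 KiB chunks, so memory use stays constant no matter how large the dump archive is. A quick self-check against a known digest could look like this; both the path and the expected digest are placeholders:

    from utils import file_hash

    expected = "da39a3ee5e6b4b0d3255bfef95601890afd80709"  # placeholder digest
    if file_hash("downloads/example.7z") == expected:  # placeholder path
        print("download verified")
    else:
        print("hash mismatch, re-download")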