mirror of
https://github.com/Findus23/se-simulator.git
synced 2024-09-19 15:53:45 +02:00
improved importer
This commit is contained in:
parent
bb3b691caa
commit
8edc3420cc
3 changed files with 51 additions and 5 deletions
35
consume.py
35
consume.py
|
@ -2,14 +2,45 @@ import glob
|
|||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
|
||||
from models import *
|
||||
from parsexml import parse_posts, parse_comments, parse_usernames
|
||||
from utils import *
|
||||
|
||||
files = get_files()
|
||||
# os.chdir("/mydir")
|
||||
for file in glob.glob("downloads/**/*.7z"):
|
||||
if not "raspberry" in file:
|
||||
if "meta" in file:
|
||||
continue
|
||||
code = os.path.basename(os.path.splitext(file)[0])
|
||||
filename = os.path.basename(file)
|
||||
code = os.path.splitext(filename)[0]
|
||||
if filename not in files:
|
||||
print("{file} doesn't exist on archive.org".format(file=file))
|
||||
continue
|
||||
meta = files[filename]
|
||||
sha1 = file_hash(file)
|
||||
if sha1 != meta["sha1"]:
|
||||
print("{file}: hashes don't match".format(file=filename))
|
||||
continue
|
||||
alias = Alias.select().where(Alias.url == code)
|
||||
if len(alias) != 0:
|
||||
site_id = alias[0].site_id
|
||||
print(site_id)
|
||||
site = Site.select().where(Site.id == site_id)[0]
|
||||
else:
|
||||
db_element = Site().select().where(Site.url == code)
|
||||
if len(db_element) == 0:
|
||||
print("{site} not found in database".format(site=code))
|
||||
continue
|
||||
site = db_element[0]
|
||||
mtime = datetime.fromtimestamp(int(meta["mtime"]))
|
||||
if site.last_download == mtime:
|
||||
print("{site} is up to date".format(site=filename))
|
||||
continue
|
||||
else:
|
||||
site.last_download = mtime
|
||||
site.save()
|
||||
print(code)
|
||||
currentdir = os.getcwd()
|
||||
rawdir = "raw/" + code
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
from internetarchive import get_item, download
|
||||
|
||||
from models import *
|
||||
from utils import *
|
||||
|
||||
ignored_se_sites = ["cs50.stackexchange.com"]
|
||||
|
||||
ia = get_item("stackexchange")
|
||||
files = {x["name"]: x for x in ia.files}
|
||||
files = get_files()
|
||||
for site in Site.select()[1:]:
|
||||
if site.url in ignored_se_sites:
|
||||
continue
|
||||
|
@ -27,5 +27,4 @@ for site in Site.select()[1:]:
|
|||
if sizeMB < 50:
|
||||
print(file)
|
||||
print(sizeMB)
|
||||
download("stackexchange", files=file["name"], verbose=True)
|
||||
exit()
|
||||
|
|
16
utils.py
16
utils.py
|
@ -1,8 +1,10 @@
|
|||
import hashlib
|
||||
import sys
|
||||
|
||||
import resource
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from internetarchive import get_item
|
||||
|
||||
|
||||
def print_stats(i, skipped=None):
|
||||
|
@ -23,6 +25,20 @@ def html2text(body):
|
|||
return soup.get_text()
|
||||
|
||||
|
||||
def get_files():
    """Fetch metadata for the archive.org "stackexchange" item.

    Returns a dict that maps each remote file's name to its full
    metadata record (size, mtime, sha1, ...), so callers can look up
    a dump by filename in O(1).
    """
    archive_item = get_item("stackexchange")
    return {entry["name"]: entry for entry in archive_item.files}
|
||||
|
||||
|
||||
def file_hash(filename):
    """Return the SHA-1 hex digest of the file at *filename*.

    Streams the file in 128 KiB chunks so even multi-gigabyte dump
    archives never need to fit in memory.
    (adapted from https://stackoverflow.com/a/44873382/4398037)
    """
    digest = hashlib.sha1()
    chunk_size = 128 * 1024
    # buffering=0 gives an unbuffered raw stream; we do our own chunking.
    with open(filename, 'rb', buffering=0) as stream:
        chunk = stream.read(chunk_size)
        while chunk:
            digest.update(chunk)
            chunk = stream.read(chunk_size)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def get_settings(count):
|
||||
if len(sys.argv) != count + 1:
|
||||
if count == 1:
|
||||
|
|
Loading…
Reference in a new issue