mirror of
https://github.com/Findus23/se-simulator.git
synced 2024-09-19 15:53:45 +02:00
improved importer
This commit is contained in:
parent
bb3b691caa
commit
8edc3420cc
3 changed files with 51 additions and 5 deletions
35
consume.py
35
consume.py
|
@ -2,14 +2,45 @@ import glob
|
|||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
|
||||
from models import *
|
||||
from parsexml import parse_posts, parse_comments, parse_usernames
|
||||
from utils import *
|
||||
|
||||
files = get_files()
|
||||
# os.chdir("/mydir")
|
||||
for file in glob.glob("downloads/**/*.7z"):
|
||||
if not "raspberry" in file:
|
||||
if "meta" in file:
|
||||
continue
|
||||
code = os.path.basename(os.path.splitext(file)[0])
|
||||
filename = os.path.basename(file)
|
||||
code = os.path.splitext(filename)[0]
|
||||
if filename not in files:
|
||||
print("{file} doesn't exist on archive.org".format(file=file))
|
||||
continue
|
||||
meta = files[filename]
|
||||
sha1 = file_hash(file)
|
||||
if sha1 != meta["sha1"]:
|
||||
print("{file}: hashes don't match".format(file=filename))
|
||||
continue
|
||||
alias = Alias.select().where(Alias.url == code)
|
||||
if len(alias) != 0:
|
||||
site_id = alias[0].site_id
|
||||
print(site_id)
|
||||
site = Site.select().where(Site.id == site_id)[0]
|
||||
else:
|
||||
db_element = Site().select().where(Site.url == code)
|
||||
if len(db_element) == 0:
|
||||
print("{site} not found in database".format(site=code))
|
||||
continue
|
||||
site = db_element[0]
|
||||
mtime = datetime.fromtimestamp(int(meta["mtime"]))
|
||||
if site.last_download == mtime:
|
||||
print("{site} is up to date".format(site=filename))
|
||||
continue
|
||||
else:
|
||||
site.last_download = mtime
|
||||
site.save()
|
||||
print(code)
|
||||
currentdir = os.getcwd()
|
||||
rawdir = "raw/" + code
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
from internetarchive import get_item, download
|
||||
|
||||
from models import *
|
||||
from utils import *
|
||||
|
||||
ignored_se_sites = ["cs50.stackexchange.com"]
|
||||
|
||||
ia = get_item("stackexchange")
|
||||
files = {x["name"]: x for x in ia.files}
|
||||
files = get_files()
|
||||
for site in Site.select()[1:]:
|
||||
if site.url in ignored_se_sites:
|
||||
continue
|
||||
|
@ -27,5 +27,4 @@ for site in Site.select()[1:]:
|
|||
if sizeMB < 50:
|
||||
print(file)
|
||||
print(sizeMB)
|
||||
download("stackexchange", files=file["name"], verbose=True)
|
||||
exit()
|
||||
|
|
16
utils.py
16
utils.py
|
@ -1,8 +1,10 @@
|
|||
import hashlib
|
||||
import sys
|
||||
|
||||
import resource
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from internetarchive import get_item
|
||||
|
||||
|
||||
def print_stats(i, skipped=None):
|
||||
|
@ -23,6 +25,20 @@ def html2text(body):
|
|||
return soup.get_text()
|
||||
|
||||
|
||||
def get_files():
    """Fetch metadata for the archive.org "stackexchange" item.

    Returns a dict that maps each remote file's name to its full
    metadata record (size, mtime, sha1, ...), so callers can look up
    a dump by filename in O(1).
    """
    archive_item = get_item("stackexchange")
    return {entry["name"]: entry for entry in archive_item.files}
|
||||
|
||||
|
||||
def file_hash(filename):
    """Return the SHA-1 hex digest of the file at *filename*.

    Streams the file in 128 KiB chunks so even multi-gigabyte dump
    archives never need to fit in memory.
    (adapted from https://stackoverflow.com/a/44873382/4398037)
    """
    digest = hashlib.sha1()
    chunk_size = 128 * 1024
    # buffering=0 gives an unbuffered raw stream; we do our own chunking.
    with open(filename, 'rb', buffering=0) as stream:
        chunk = stream.read(chunk_size)
        while chunk:
            digest.update(chunk)
            chunk = stream.read(chunk_size)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def get_settings(count):
|
||||
if len(sys.argv) != count + 1:
|
||||
if count == 1:
|
||||
|
|
Loading…
Reference in a new issue