
improved importer

Lukas Winkler 2018-03-16 22:52:34 +01:00
parent bb3b691caa
commit 8edc3420cc
3 changed files with 51 additions and 5 deletions


@@ -2,14 +2,45 @@ import glob
 import os
 import shutil
 import subprocess
+from datetime import datetime
 
+from models import *
 from parsexml import parse_posts, parse_comments, parse_usernames
 from utils import *
+files = get_files()
 # os.chdir("/mydir")
 for file in glob.glob("downloads/**/*.7z"):
-    if not "raspberry" in file:
+    if "meta" in file:
         continue
-    code = os.path.basename(os.path.splitext(file)[0])
+    filename = os.path.basename(file)
+    code = os.path.splitext(filename)[0]
+    if filename not in files:
+        print("{file} doesn't exist on archive.org".format(file=file))
+        continue
+    meta = files[filename]
+    sha1 = file_hash(file)
+    if sha1 != meta["sha1"]:
+        print("{file}: hashes don't match".format(file=filename))
+        continue
+    alias = Alias.select().where(Alias.url == code)
+    if len(alias) != 0:
+        site_id = alias[0].site_id
+        print(site_id)
+        site = Site.select().where(Site.id == site_id)[0]
+    else:
+        db_element = Site().select().where(Site.url == code)
+        if len(db_element) == 0:
+            print("{site} not found in database".format(site=code))
+            continue
+        site = db_element[0]
+    mtime = datetime.fromtimestamp(int(meta["mtime"]))
+    if site.last_download == mtime:
+        print("{site} is up to date".format(site=filename))
+        continue
+    else:
+        site.last_download = mtime
+        site.save()
     print(code)
     currentdir = os.getcwd()
     rawdir = "raw/" + code

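The core of this change is the skip logic above: a dump is only imported when it exists on archive.org, its SHA-1 digest matches the item metadata, and its upload time differs from the site's last recorded download. Below is a minimal standalone sketch of that decision, reusing the get_files() and file_hash() helpers this commit adds to utils; needs_import() is a hypothetical name, and the Site model and the surrounding importer loop are assumed:

    import os
    from datetime import datetime

    from utils import file_hash, get_files

    files = get_files()  # archive.org metadata, keyed by file name

    def needs_import(path, site):  # hypothetical helper, not part of the commit
        """Return True if the local .7z dump should be (re)imported."""
        filename = os.path.basename(path)
        meta = files.get(filename)
        if meta is None:
            return False  # not part of the archive.org item
        if file_hash(path) != meta["sha1"]:
            return False  # corrupt or incomplete download
        mtime = datetime.fromtimestamp(int(meta["mtime"]))
        if site.last_download == mtime:
            return False  # this upload was already imported
        site.last_download = mtime
        site.save()  # record the upload we are about to import
        return True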

@ -1,11 +1,11 @@
from internetarchive import get_item, download from internetarchive import get_item, download
from models import * from models import *
from utils import *
ignored_se_sites = ["cs50.stackexchange.com"] ignored_se_sites = ["cs50.stackexchange.com"]
ia = get_item("stackexchange") files = get_files()
files = {x["name"]: x for x in ia.files}
for site in Site.select()[1:]: for site in Site.select()[1:]:
if site.url in ignored_se_sites: if site.url in ignored_se_sites:
continue continue
@ -27,5 +27,4 @@ for site in Site.select()[1:]:
if sizeMB < 50: if sizeMB < 50:
print(file) print(file)
print(sizeMB) print(sizeMB)
download("stackexchange", files=file["name"], verbose=True)
exit() exit()

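With get_files() moved into utils, the downloader and the importer now share a single archive.org metadata lookup instead of each building the name-to-metadata dict from get_item("stackexchange") themselves. A short usage sketch; the file name is made up, and only the "sha1" and "mtime" fields are confirmed by the diffs above:

    from utils import get_files

    files = get_files()  # one metadata dict shared by importer and downloader
    meta = files.get("writers.stackexchange.com.7z")  # hypothetical file name
    if meta is not None:
        print(meta["sha1"], meta["mtime"])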

@@ -1,8 +1,10 @@
+import hashlib
 import sys
 import resource
 
 from bs4 import BeautifulSoup
+from internetarchive import get_item
 
 
 def print_stats(i, skipped=None):
@@ -23,6 +25,20 @@ def html2text(body):
     return soup.get_text()
 
 
+def get_files():
+    ia = get_item("stackexchange")
+    return {x["name"]: x for x in ia.files}
+
+
+def file_hash(filename):
+    """from https://stackoverflow.com/a/44873382/4398037"""
+    h = hashlib.sha1()
+    with open(filename, 'rb', buffering=0) as f:
+        for b in iter(lambda: f.read(128 * 1024), b''):
+            h.update(b)
+    return h.hexdigest()
+
+
 def get_settings(count):
     if len(sys.argv) != count + 1:
         if count == 1:
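file_hash() reads the file in 128 KiB chunks, so memory use stays constant no matter how large the dump archive is. A quick self-check against a known digest could look like this; both the path and the expected digest are placeholders:

    from utils import file_hash

    expected = "da39a3ee5e6b4b0d3255bfef95601890afd80709"  # placeholder digest
    if file_hash("downloads/example.7z") == expected:  # placeholder path
        print("download verified")
    else:
        print("hash mismatch, re-download")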