mirror of
https://github.com/Findus23/se-simulator.git
synced 2024-09-19 15:53:45 +02:00
improved importer
This commit is contained in:
parent
bb3b691caa
commit
8edc3420cc
3 changed files with 51 additions and 5 deletions
35
consume.py
35
consume.py
|
@ -2,14 +2,45 @@ import glob
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from models import *
|
||||||
from parsexml import parse_posts, parse_comments, parse_usernames
|
from parsexml import parse_posts, parse_comments, parse_usernames
|
||||||
from utils import *
|
from utils import *
|
||||||
|
|
||||||
|
files = get_files()
|
||||||
# os.chdir("/mydir")
|
# os.chdir("/mydir")
|
||||||
for file in glob.glob("downloads/**/*.7z"):
|
for file in glob.glob("downloads/**/*.7z"):
|
||||||
if not "raspberry" in file:
|
if "meta" in file:
|
||||||
continue
|
continue
|
||||||
code = os.path.basename(os.path.splitext(file)[0])
|
filename = os.path.basename(file)
|
||||||
|
code = os.path.splitext(filename)[0]
|
||||||
|
if filename not in files:
|
||||||
|
print("{file} doesn't exist on archive.org".format(file=file))
|
||||||
|
continue
|
||||||
|
meta = files[filename]
|
||||||
|
sha1 = file_hash(file)
|
||||||
|
if sha1 != meta["sha1"]:
|
||||||
|
print("{file}: hashes don't match".format(file=filename))
|
||||||
|
continue
|
||||||
|
alias = Alias.select().where(Alias.url == code)
|
||||||
|
if len(alias) != 0:
|
||||||
|
site_id = alias[0].site_id
|
||||||
|
print(site_id)
|
||||||
|
site = Site.select().where(Site.id == site_id)[0]
|
||||||
|
else:
|
||||||
|
db_element = Site().select().where(Site.url == code)
|
||||||
|
if len(db_element) == 0:
|
||||||
|
print("{site} not found in database".format(site=code))
|
||||||
|
continue
|
||||||
|
site = db_element[0]
|
||||||
|
mtime = datetime.fromtimestamp(int(meta["mtime"]))
|
||||||
|
if site.last_download == mtime:
|
||||||
|
print("{site} is up to date".format(site=filename))
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
site.last_download = mtime
|
||||||
|
site.save()
|
||||||
print(code)
|
print(code)
|
||||||
currentdir = os.getcwd()
|
currentdir = os.getcwd()
|
||||||
rawdir = "raw/" + code
|
rawdir = "raw/" + code
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from internetarchive import get_item, download
|
from internetarchive import get_item, download
|
||||||
|
|
||||||
from models import *
|
from models import *
|
||||||
|
from utils import *
|
||||||
|
|
||||||
ignored_se_sites = ["cs50.stackexchange.com"]
|
ignored_se_sites = ["cs50.stackexchange.com"]
|
||||||
|
|
||||||
ia = get_item("stackexchange")
|
files = get_files()
|
||||||
files = {x["name"]: x for x in ia.files}
|
|
||||||
for site in Site.select()[1:]:
|
for site in Site.select()[1:]:
|
||||||
if site.url in ignored_se_sites:
|
if site.url in ignored_se_sites:
|
||||||
continue
|
continue
|
||||||
|
@ -27,5 +27,4 @@ for site in Site.select()[1:]:
|
||||||
if sizeMB < 50:
|
if sizeMB < 50:
|
||||||
print(file)
|
print(file)
|
||||||
print(sizeMB)
|
print(sizeMB)
|
||||||
download("stackexchange", files=file["name"], verbose=True)
|
|
||||||
exit()
|
exit()
|
||||||
|
|
16
utils.py
16
utils.py
|
@ -1,8 +1,10 @@
|
||||||
|
import hashlib
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import resource
|
import resource
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from internetarchive import get_item
|
||||||
|
|
||||||
|
|
||||||
def print_stats(i, skipped=None):
|
def print_stats(i, skipped=None):
|
||||||
|
@ -23,6 +25,20 @@ def html2text(body):
|
||||||
return soup.get_text()
|
return soup.get_text()
|
||||||
|
|
||||||
|
|
||||||
|
def get_files():
    """Return the file listing of the "stackexchange" archive.org item.

    Fetches the item via ``internetarchive.get_item`` and indexes its
    ``files`` entries by their ``"name"`` field, so callers can look up the
    metadata of a dump by file name.

    Returns:
        dict: maps each file name to that file's metadata entry as provided
        by ``ia.files``.
        NOTE(review): callers appear to rely on ``"sha1"`` and ``"mtime"``
        keys being present in each entry -- confirm against the
        internetarchive metadata schema.
    """
    ia = get_item("stackexchange")
    return {x["name"]: x for x in ia.files}
|
||||||
|
|
||||||
|
|
||||||
|
def file_hash(filename):
    """Return the hexadecimal SHA-1 digest of the file at *filename*.

    The file is read in 128 KiB chunks, so the whole file is never held in
    memory at once.

    Adapted from https://stackoverflow.com/a/44873382/4398037

    Args:
        filename: path of the file to hash.

    Returns:
        str: the SHA-1 digest as a lowercase hex string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    h = hashlib.sha1()
    # buffering=0: we already read in large chunks, so Python's own buffer
    # layer would only add an extra copy.
    with open(filename, 'rb', buffering=0) as f:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF).
        for b in iter(lambda: f.read(128 * 1024), b''):
            h.update(b)
    return h.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def get_settings(count):
|
def get_settings(count):
|
||||||
if len(sys.argv) != count + 1:
|
if len(sys.argv) != count + 1:
|
||||||
if count == 1:
|
if count == 1:
|
||||||
|
|
Loading…
Reference in a new issue