mirror of
https://github.com/Findus23/se-simulator.git
synced 2024-09-19 15:53:45 +02:00
database storage
This commit is contained in:
parent
8edc3420cc
commit
ad0a671cba
5 changed files with 108 additions and 16 deletions
|
@ -1,12 +1,15 @@
|
|||
import html
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from models import *
|
||||
|
||||
# for i in [Alias, Site]:
|
||||
# mdls = [Question, Answer, Title, User, Alias, Site]
|
||||
# for i in mdls:
|
||||
# i.drop_table()
|
||||
# for i in [Site, Alias]:
|
||||
# for i in reversed(mdls):
|
||||
# print(i)
|
||||
# i.create_table()
|
||||
|
||||
r = requests.get("https://api.stackexchange.com/2.2/sites?pagesize=500")
|
||||
|
|
30
models.py
30
models.py
|
@ -12,9 +12,37 @@ class Site(BaseModel):
|
|||
tag_background_color = CharField(max_length=7)
|
||||
tag_foreground_color = CharField(max_length=7)
|
||||
link_color = CharField(max_length=7)
|
||||
enabled=BooleanField(default=True)
|
||||
enabled = BooleanField(default=True)
|
||||
|
||||
|
||||
class Alias(BaseModel):
    """Alternative URL under which a Site is reachable."""
    site = ForeignKeyField(Site)
    # one alias URL maps to exactly one site
    url = CharField(unique=True, max_length=50)
|
||||
|
||||
|
||||
class Title(BaseModel):
    """Generated question title belonging to a Site."""
    text = CharField()
    # URL-safe form of text (built with slugify when rows are created)
    slug = CharField()
    site = ForeignKeyField(Site)
|
||||
|
||||
|
||||
class User(BaseModel):
    """Generated user account belonging to a Site."""
    username = CharField()
    site = ForeignKeyField(Site)
|
||||
|
||||
|
||||
class Question(BaseModel):
    """Generated question: body text plus links to its title, author and site."""
    text = TextField()
    upvotes = IntegerField(default=0)
    downvotes = IntegerField(default=0)
    title = ForeignKeyField(Title)
    user = ForeignKeyField(User)
    site = ForeignKeyField(Site)
|
||||
|
||||
|
||||
class Answer(BaseModel):
    """Generated answer: body text plus links to its author and site."""
    text = TextField()
    # default=0 for consistency with Question; a bare IntegerField() would
    # require callers to always pass vote counts explicitly
    upvotes = IntegerField(default=0)
    downvotes = IntegerField(default=0)
    user = ForeignKeyField(User)
    site = ForeignKeyField(Site)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
|
||||
import jsonlines
|
||||
import markovify
|
||||
import os
|
||||
from nltk.tokenize.moses import MosesDetokenizer, MosesTokenizer
|
||||
|
||||
from markov import MarkovText, MarkovUserName
|
||||
from utils import *
|
||||
|
@ -26,12 +26,12 @@ def load_chain(chainfile, mode):
|
|||
return markov.from_json(data)
|
||||
|
||||
|
||||
def generate_chain(basedir, mode):
|
||||
def generate_chain(sourcedir, chainfile, mode):
|
||||
combined_cains = None
|
||||
chainlist = []
|
||||
markov = get_markov(mode)
|
||||
i = 0
|
||||
with jsonlines.open(basedir + "/{type}.jsonl".format(type=mode), mode="r") as content:
|
||||
with jsonlines.open(sourcedir + "/{type}.jsonl".format(type=mode), mode="r") as content:
|
||||
for text in content:
|
||||
text = text.strip()
|
||||
try:
|
||||
|
@ -57,17 +57,28 @@ def generate_chain(basedir, mode):
|
|||
return chain
|
||||
|
||||
|
||||
def get_chain(url, mode):
    """Return the markov chain for *mode* (e.g. "Titles", "Usernames") of a site.

    Loads the pre-built chain from ``sites/<url>/<mode>.chain.json`` when it
    exists; otherwise generates it from the site's source data (which also
    writes the chain file for next time — see generate_chain).
    """
    sourcedir = 'sites/{url}'.format(url=url)
    chainfile = 'sites/{url}/{type}.chain.json'.format(url=url, type=mode)
    if os.path.exists(chainfile):
        return load_chain(chainfile, mode)
    return generate_chain(sourcedir, chainfile, mode)
|
||||
|
||||
|
||||
def generate_text(chain: markovify.Text, model):
    """Produce one generated sentence from *chain*.

    Titles are capped at 70 characters; every other model gets a full sentence.
    """
    return chain.make_short_sentence(70) if model == "Titles" else chain.make_sentence()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
basedir, mode = get_settings(2)
|
||||
if mode not in ["Questions", "Answers", "Titles", "Usernames"]:
|
||||
print("error")
|
||||
exit()
|
||||
chainfile = basedir + '/{type}.chain.json'.format(type=mode)
|
||||
if os.path.exists(chainfile):
|
||||
chain = load_chain(chainfile, mode)
|
||||
else:
|
||||
chain = generate_chain(basedir, mode)
|
||||
|
||||
chain = get_chain("sites/astronomy.stackexchange.com", mode)
|
||||
for _ in range(10):
|
||||
# walk = []
|
||||
# for text in chain.gen():
|
||||
|
|
45
todb.py
Normal file
45
todb.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
from slugify import slugify
|
||||
|
||||
from models import *
|
||||
from text_generator import get_chain, generate_text
|
||||
|
||||
|
||||
def add_username(site, count=100):
    """Generate *count* random usernames for *site* and store them as Users.

    :type site: Site
    :param count: number of usernames to create
    """
    chain = get_chain(site.url, "Usernames")
    for _ in range(count):
        username = generate_text(chain, "Usernames")
        User.create(username=username, site=site)
|
||||
|
||||
|
||||
def add_title(site, count=100):
    """Generate *count* question titles for *site* and store them with slugs.

    :type site: Site
    :param count: number of titles to create
    """
    chain = get_chain(site.url, "Titles")
    for _ in range(count):
        title = generate_text(chain, "Titles")
        # URL-safe slug, truncated at a word boundary
        slug = slugify(title, max_length=70, word_boundary=True)
        Title.create(text=title, slug=slug, site=site)
|
||||
|
||||
|
||||
def add_question(site, count=100):
    """Generate *count* questions for *site*, pairing each with a stored user and title.

    NOTE(review): assumes add_username() and add_title() have already created
    at least *count* rows for this site; otherwise the indexing below raises
    IndexError.

    :type site: Site
    :param count: number of questions to create
    """
    users = User.select().where(User.site == site).limit(count)
    titles = Title.select().where(Title.site == site).limit(count)
    chain = get_chain(site.url, "Questions")

    for i in range(count):
        text = generate_text(chain, "Questions")
        title = titles[i]
        print(title.text)  # progress output
        user = users[i]
        print(user.username)
        # pass model instances to the field kwargs (title=..., not title_id=...)
        # for consistency with add_username()/add_title()
        Question.create(text=text, title=title, user=user, site=site)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Pick one site whose data has already been downloaded.
    # NOTE(review): .get() raises DoesNotExist when no such row exists —
    # intentional fail-fast for this script.
    query = Site.select().where(Site.last_download.is_null(False)).limit(1)
    s = query.get()
    add_username(s)
    add_title(s)
    add_question(s)
|
9
utils.py
9
utils.py
|
@ -1,7 +1,8 @@
|
|||
import hashlib
|
||||
import sys
|
||||
|
||||
import random
|
||||
import resource
|
||||
import string
|
||||
import sys
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from internetarchive import get_item
|
||||
|
@ -51,3 +52,7 @@ def get_settings(count):
|
|||
return sys.argv[1]
|
||||
elif count == 2:
|
||||
return sys.argv[1], sys.argv[2]
|
||||
|
||||
|
||||
def get_random_string(length):
    """Return *length* random characters drawn from A-Z and 0-9."""
    alphabet = string.ascii_uppercase + string.digits
    return "".join(random.choices(alphabet, k=length))
|
||||
|
|
Loading…
Reference in a new issue