mirror of https://github.com/Findus23/se-simulator.git

improve generator

Lukas Winkler 2018-03-16 20:31:43 +01:00
parent 6dd5438973
commit bb3b691caa
5 changed files with 64 additions and 32 deletions
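
In short: the main parsing loop now processes the raspberry pi dump instead of worldbuilding; markov.py renames POSifiedText to MarkovText and adds a character-level MarkovUserName class; parsexml.py delegates HTML stripping to a new html2text() helper in utils.py that also removes <code> blocks; and the generator script is refactored into get_markov(), get_state_size(), load_chain() and generate_chain() behind an if __name__ == "__main__": block, gaining a "Usernames" mode.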


@@ -7,7 +7,7 @@ from parsexml import parse_posts, parse_comments, parse_usernames
 from utils import *
 # os.chdir("/mydir")
 for file in glob.glob("downloads/**/*.7z"):
-    if not "worldbuilding" in file:
+    if not "raspberry" in file:
         continue
     code = os.path.basename(os.path.splitext(file)[0])
     print(code)
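
The change above only swaps which downloaded Stack Exchange dump gets processed (worldbuilding to raspberry pi). A minimal sketch of the same selection loop, using the idiomatic "not in" spelling; the downloads/ layout is assumed from the glob pattern:

import glob
import os

# Select one site's .7z data dumps from the downloads tree.
# Note: "**" only matches across directory levels with recursive=True;
# with the default it behaves like a single "*".
for file in glob.glob("downloads/**/*.7z", recursive=True):
    if "raspberry" not in file:  # skip every dump except the chosen site
        continue
    code = os.path.basename(os.path.splitext(file)[0])  # e.g. the site's domain name
    print(code)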

markov.py

@@ -5,9 +5,17 @@ tokenizer = MosesTokenizer()
 detokenizer = MosesDetokenizer()


-class POSifiedText(markovify.Text):
+class MarkovText(markovify.Text):
     def word_split(self, sentence):
         return tokenizer.tokenize(sentence)

     def word_join(self, words):
         return detokenizer.detokenize(words, return_str=True)
+
+
+class MarkovUserName(markovify.Text):
+    def word_split(self, word):
+        return list(word)
+
+    def word_join(self, characters):
+        return "".join(characters)

parsexml.py

@@ -1,7 +1,6 @@
 from xml.etree import ElementTree

 import jsonlines
-from bs4 import BeautifulSoup

 from utils import *

@@ -22,8 +21,7 @@ def parse_posts(inputdir, outputdir):
                 titles.write(title)
             body = element.get('Body')
             if body:
-                soup = BeautifulSoup(body, "lxml")
-                text = soup.get_text()
+                text = html2text(body)
                 if element.get('PostTypeId') == "1":
                     questions.write(text)
                 else:
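
BeautifulSoup handling moves out of parsexml.py into the shared html2text() helper (added to utils.py below), which additionally strips <code> elements so inline code does not leak into the Markov training text. For illustration:

from bs4 import BeautifulSoup

# Same logic as the new utils.html2text(): drop <code> tags, keep the prose.
def html2text(body):
    soup = BeautifulSoup(body, "lxml")
    for code in soup.find_all("code"):
        code.decompose()
    return soup.get_text()

print(html2text("<p>Run <code>pip install jsonlines</code> before parsing.</p>"))
# -> "Run  before parsing."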


@@ -1,38 +1,41 @@
 import jsonlines
 import markovify
+import os
 from nltk.tokenize.moses import MosesDetokenizer, MosesTokenizer

-from markov import POSifiedText
+from markov import MarkovText, MarkovUserName
 from utils import *

-detokenizer = MosesDetokenizer()

-BASEDIR, mode = get_settings(2)
-if mode not in ["Questions", "Answers", "Titles"]:
-    print("error")
-    exit()

-chainfile = BASEDIR + '/{type}.chain.json'.format(type=mode)
-try:
+def get_markov(mode):
+    if mode == "Usernames":
+        return MarkovUserName
+    else:
+        return MarkovText
+
+
+def get_state_size(mode):
+    return 1 if mode == "Titles" else 3 if mode == "Usernames" else 2
+
+
+def load_chain(chainfile, mode):
+    markov = get_markov(mode)
     with open(chainfile, 'r') as myfile:
         data = myfile.read()
-    chain = POSifiedText.from_json(data)
-    # raise FileNotFoundError
     print("using existing file\n")
+    return markov.from_json(data)

-except FileNotFoundError:
-    tokenizer = MosesTokenizer()
+
+def generate_chain(basedir, mode):
     combined_cains = None
     chainlist = []
+    markov = get_markov(mode)
     i = 0
-    with jsonlines.open(BASEDIR + "/{type}.jsonl".format(type=mode), mode="r") as content:
+    with jsonlines.open(basedir + "/{type}.jsonl".format(type=mode), mode="r") as content:
         for text in content:
             text = text.strip()
-            # tokens = tokenizer.tokenize(text=text.replace("\n", " THISISANEWLINE "))
             try:
-                chain = POSifiedText(text, (1 if mode == "Titles" else 2), retain_original=False)
-                # chain = markovify.Chain([tokens], (1 if mode == "Titles" else 2))
+                chain = markov(text, get_state_size(mode), retain_original=False)
             except KeyError:
                 continue
             chainlist.append(chain)
@@ -50,16 +53,30 @@ except FileNotFoundError:
             chain = markovify.combine([combined_cains, subtotal_chain])
     with open(chainfile, 'w') as outfile:
         outfile.write(chain.to_json())
+    print_ram()
+    return chain

-for _ in range(10):
-    # walk = []
-    # for text in chain.gen():
-    #     if len(walk) > 100:
-    #         break
-    #     walk.append(text)
-    # result = detokenizer.detokenize(walk, return_str=True)
-    # print(result.replace("THISISANEWLINE ", "\n"))
-    print(chain.make_sentence())
-    print("-----------------------------------")
-print_ram()
+
+if __name__ == "__main__":
+    basedir, mode = get_settings(2)
+    if mode not in ["Questions", "Answers", "Titles", "Usernames"]:
+        print("error")
+        exit()
+    chainfile = basedir + '/{type}.chain.json'.format(type=mode)
+    if os.path.exists(chainfile):
+        chain = load_chain(chainfile, mode)
+    else:
+        chain = generate_chain(basedir, mode)
+    for _ in range(10):
+        # walk = []
+        # for text in chain.gen():
+        #     if len(walk) > 100:
+        #         break
+        #     walk.append(text)
+        # result = detokenizer.detokenize(walk, return_str=True)
+        # print(result.replace("THISISANEWLINE ", "\n"))
+        print(chain.make_sentence())
+        print("-----------------------------------")
+        print_ram()
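
The generator refactor does three things: model class and state size become explicit functions (1 for titles, 3 characters for usernames, 2 otherwise), the try/except FileNotFoundError flow is replaced by an os.path.exists() check, and the expensive combine step stays cached as JSON so later runs reload the chain via from_json(). A standalone sketch of that caching pattern; the helper name and file path here are illustrative:

import os
import markovify

def cached_model(chainfile, corpus, state_size=2):  # hypothetical helper
    if os.path.exists(chainfile):
        with open(chainfile) as f:
            return markovify.Text.from_json(f.read())  # cheap reload
    model = markovify.Text(corpus, state_size=state_size)  # expensive build
    with open(chainfile, "w") as f:
        f.write(model.to_json())
    return model

model = cached_model("Titles.chain.json", "How do I flash an SD card? What is GPIO?")
print(model.make_sentence())  # likely None on such a tiny corpus; real runs use thousands of texts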

utils.py

@@ -2,6 +2,8 @@ import sys

 import resource
+from bs4 import BeautifulSoup
+

 def print_stats(i, skipped=None):
     print("{number} total entries".format(number=i))

@@ -14,6 +16,13 @@ def print_ram():
     print("used {mb}MB".format(mb=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024))


+def html2text(body):
+    soup = BeautifulSoup(body, "lxml")
+    for code in soup.find_all("code"):
+        code.decompose()
+    return soup.get_text()
+
+
 def get_settings(count):
     if len(sys.argv) != count + 1:
         if count == 1:
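
One unit note on print_ram() above: resource.getrusage() reports ru_maxrss in kilobytes on Linux but in bytes on macOS, so the existing //1024 yields megabytes only on Linux. A portable variant of the same helper would be:

import resource
import sys

def print_ram():
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    divisor = 1024 * 1024 if sys.platform == "darwin" else 1024  # bytes vs. KB
    print("used {mb}MB".format(mb=peak // divisor))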