mirror of
https://github.com/Findus23/se-simulator.git
synced 2024-09-19 15:53:45 +02:00
improve generator
This commit is contained in:
parent
6dd5438973
commit
bb3b691caa
5 changed files with 64 additions and 32 deletions
|
@@ -7,7 +7,7 @@ from parsexml import parse_posts, parse_comments, parse_usernames
|
||||||
from utils import *
|
from utils import *
|
||||||
# os.chdir("/mydir")
|
# os.chdir("/mydir")
|
||||||
for file in glob.glob("downloads/**/*.7z"):
|
for file in glob.glob("downloads/**/*.7z"):
|
||||||
if not "worldbuilding" in file:
|
if not "raspberry" in file:
|
||||||
continue
|
continue
|
||||||
code = os.path.basename(os.path.splitext(file)[0])
|
code = os.path.basename(os.path.splitext(file)[0])
|
||||||
print(code)
|
print(code)
|
||||||
|
|
10
markov.py
10
markov.py
|
@@ -5,9 +5,17 @@ tokenizer = MosesTokenizer()
|
||||||
detokenizer = MosesDetokenizer()
|
detokenizer = MosesDetokenizer()
|
||||||
|
|
||||||
|
|
||||||
class POSifiedText(markovify.Text):
|
class MarkovText(markovify.Text):
|
||||||
def word_split(self, sentence):
|
def word_split(self, sentence):
|
||||||
return tokenizer.tokenize(sentence)
|
return tokenizer.tokenize(sentence)
|
||||||
|
|
||||||
def word_join(self, words):
|
def word_join(self, words):
|
||||||
return detokenizer.detokenize(words, return_str=True)
|
return detokenizer.detokenize(words, return_str=True)
|
||||||
|
|
||||||
|
|
||||||
|
class MarkovUserName(markovify.Text):
|
||||||
|
def word_split(self, word):
|
||||||
|
return list(word)
|
||||||
|
|
||||||
|
def word_join(self, characters):
|
||||||
|
return "".join(characters)
|
||||||
|
|
|
@@ -1,7 +1,6 @@
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
|
|
||||||
import jsonlines
|
import jsonlines
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
from utils import *
|
from utils import *
|
||||||
|
|
||||||
|
@@ -22,8 +21,7 @@ def parse_posts(inputdir, outputdir):
|
||||||
titles.write(title)
|
titles.write(title)
|
||||||
body = element.get('Body')
|
body = element.get('Body')
|
||||||
if body:
|
if body:
|
||||||
soup = BeautifulSoup(body, "lxml")
|
text = html2text(body)
|
||||||
text = soup.get_text()
|
|
||||||
if element.get('PostTypeId') == "1":
|
if element.get('PostTypeId') == "1":
|
||||||
questions.write(text)
|
questions.write(text)
|
||||||
else:
|
else:
|
||||||
|
|
|
@@ -1,38 +1,41 @@
|
||||||
import jsonlines
|
import jsonlines
|
||||||
import markovify
|
import markovify
|
||||||
|
import os
|
||||||
from nltk.tokenize.moses import MosesDetokenizer, MosesTokenizer
|
from nltk.tokenize.moses import MosesDetokenizer, MosesTokenizer
|
||||||
|
|
||||||
from markov import POSifiedText
|
from markov import MarkovText, MarkovUserName
|
||||||
from utils import *
|
from utils import *
|
||||||
|
|
||||||
detokenizer = MosesDetokenizer()
|
|
||||||
|
|
||||||
BASEDIR, mode = get_settings(2)
|
def get_markov(mode):
|
||||||
if mode not in ["Questions", "Answers", "Titles"]:
|
if mode == "Usernames":
|
||||||
print("error")
|
return MarkovUserName
|
||||||
exit()
|
else:
|
||||||
chainfile = BASEDIR + '/{type}.chain.json'.format(type=mode)
|
return MarkovText
|
||||||
|
|
||||||
try:
|
|
||||||
|
def get_state_size(mode):
|
||||||
|
return 1 if mode == "Titles" else 3 if mode == "Usernames" else 2
|
||||||
|
|
||||||
|
|
||||||
|
def load_chain(chainfile, mode):
|
||||||
|
markov = get_markov(mode)
|
||||||
with open(chainfile, 'r') as myfile:
|
with open(chainfile, 'r') as myfile:
|
||||||
data = myfile.read()
|
data = myfile.read()
|
||||||
chain = POSifiedText.from_json(data)
|
|
||||||
# raise FileNotFoundError
|
|
||||||
print("using existing file\n")
|
print("using existing file\n")
|
||||||
|
return markov.from_json(data)
|
||||||
|
|
||||||
except FileNotFoundError:
|
|
||||||
tokenizer = MosesTokenizer()
|
|
||||||
|
|
||||||
|
def generate_chain(basedir, mode):
|
||||||
combined_cains = None
|
combined_cains = None
|
||||||
chainlist = []
|
chainlist = []
|
||||||
|
markov = get_markov(mode)
|
||||||
i = 0
|
i = 0
|
||||||
with jsonlines.open(BASEDIR + "/{type}.jsonl".format(type=mode), mode="r") as content:
|
with jsonlines.open(basedir + "/{type}.jsonl".format(type=mode), mode="r") as content:
|
||||||
for text in content:
|
for text in content:
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
# tokens = tokenizer.tokenize(text=text.replace("\n", " THISISANEWLINE "))
|
|
||||||
try:
|
try:
|
||||||
chain = POSifiedText(text, (1 if mode == "Titles" else 2), retain_original=False)
|
chain = markov(text, get_state_size(mode), retain_original=False)
|
||||||
# chain = markovify.Chain([tokens], (1 if mode == "Titles" else 2))
|
|
||||||
except KeyError:
|
except KeyError:
|
||||||
continue
|
continue
|
||||||
chainlist.append(chain)
|
chainlist.append(chain)
|
||||||
|
@@ -50,8 +53,22 @@ except FileNotFoundError:
|
||||||
chain = markovify.combine([combined_cains, subtotal_chain])
|
chain = markovify.combine([combined_cains, subtotal_chain])
|
||||||
with open(chainfile, 'w') as outfile:
|
with open(chainfile, 'w') as outfile:
|
||||||
outfile.write(chain.to_json())
|
outfile.write(chain.to_json())
|
||||||
|
print_ram()
|
||||||
|
return chain
|
||||||
|
|
||||||
for _ in range(10):
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
basedir, mode = get_settings(2)
|
||||||
|
if mode not in ["Questions", "Answers", "Titles", "Usernames"]:
|
||||||
|
print("error")
|
||||||
|
exit()
|
||||||
|
chainfile = basedir + '/{type}.chain.json'.format(type=mode)
|
||||||
|
if os.path.exists(chainfile):
|
||||||
|
chain = load_chain(chainfile, mode)
|
||||||
|
else:
|
||||||
|
chain = generate_chain(basedir, mode)
|
||||||
|
|
||||||
|
for _ in range(10):
|
||||||
# walk = []
|
# walk = []
|
||||||
# for text in chain.gen():
|
# for text in chain.gen():
|
||||||
# if len(walk) > 100:
|
# if len(walk) > 100:
|
||||||
|
@@ -62,4 +79,4 @@ for _ in range(10):
|
||||||
print(chain.make_sentence())
|
print(chain.make_sentence())
|
||||||
print("-----------------------------------")
|
print("-----------------------------------")
|
||||||
|
|
||||||
print_ram()
|
print_ram()
|
||||||
|
|
9
utils.py
9
utils.py
|
@@ -2,6 +2,8 @@ import sys
|
||||||
|
|
||||||
import resource
|
import resource
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
def print_stats(i, skipped=None):
|
def print_stats(i, skipped=None):
|
||||||
print("{number} total entries".format(number=i))
|
print("{number} total entries".format(number=i))
|
||||||
|
@@ -14,6 +16,13 @@ def print_ram():
|
||||||
print("used {mb}MB".format(mb=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024))
|
print("used {mb}MB".format(mb=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024))
|
||||||
|
|
||||||
|
|
||||||
|
def html2text(body):
|
||||||
|
soup = BeautifulSoup(body, "lxml")
|
||||||
|
for code in soup.find_all("code"):
|
||||||
|
code.decompose()
|
||||||
|
return soup.get_text()
|
||||||
|
|
||||||
|
|
||||||
def get_settings(count):
|
def get_settings(count):
|
||||||
if len(sys.argv) != count + 1:
|
if len(sys.argv) != count + 1:
|
||||||
if count == 1:
|
if count == 1:
|
||||||
|
|
Loading…
Reference in a new issue