mirror of
https://github.com/Findus23/se-simulator.git
synced 2024-09-19 15:53:45 +02:00
improvements for stackoverflow
This commit is contained in:
parent
ad6ce31712
commit
a015f05f9e
3 changed files with 15 additions and 9 deletions
20
parsexml.py
20
parsexml.py
|
@ -1,6 +1,5 @@
|
|||
from xml.etree import ElementTree
|
||||
|
||||
import jsonlines
|
||||
from lxml import etree
|
||||
|
||||
from utils import *
|
||||
|
||||
|
@ -8,14 +7,16 @@ from utils import *
|
|||
def parse_posts(inputdir, outputdir):
|
||||
i = 0
|
||||
skipped = 0
|
||||
iterator = ElementTree.iterparse(inputdir + "/Posts.xml")
|
||||
iterator = etree.iterparse(inputdir + "/Posts.xml", events=("start", "end"))
|
||||
_, root = next(iterator)
|
||||
with jsonlines.open(outputdir + '/Questions.jsonl', mode="w") as questions, \
|
||||
jsonlines.open(outputdir + '/Answers.jsonl', mode="w") as answers, \
|
||||
jsonlines.open(outputdir + "/Titles.jsonl", "w") as titles:
|
||||
for event, element in iterator:
|
||||
title = element.get('Title')
|
||||
# if element.get('Score') and int(element.get('Score')) > 2:
|
||||
# if element.get('Score') and int(element.get('Score')) <= 10:
|
||||
# skipped += 1
|
||||
# element.clear()
|
||||
# continue
|
||||
if title:
|
||||
titles.write(title)
|
||||
|
@ -28,14 +29,16 @@ def parse_posts(inputdir, outputdir):
|
|||
answers.write(text)
|
||||
element.clear()
|
||||
if i % 100 == 0:
|
||||
print(i, end="\r")
|
||||
root.clear()
|
||||
print(i, skipped, i + skipped, end="\r")
|
||||
i += 1
|
||||
print_stats(i, skipped)
|
||||
|
||||
|
||||
def parse_comments(inputdir, outputdir):
|
||||
i = 0
|
||||
iterator = ElementTree.iterparse(inputdir + "/Comments.xml")
|
||||
iterator = etree.iterparse(inputdir + "/Comments.xml", events=("start", "end"))
|
||||
_, root = next(iterator)
|
||||
with jsonlines.open(outputdir + '/Comments.jsonl', mode="w") as comments:
|
||||
for event, element in iterator:
|
||||
text = element.get('Text')
|
||||
|
@ -43,6 +46,7 @@ def parse_comments(inputdir, outputdir):
|
|||
comments.write(text)
|
||||
element.clear()
|
||||
if i % 100 == 0:
|
||||
root.clear()
|
||||
print(i, end="\r")
|
||||
i += 1
|
||||
print_stats(i)
|
||||
|
@ -50,7 +54,8 @@ def parse_comments(inputdir, outputdir):
|
|||
|
||||
def parse_usernames(inputdir, outputdir):
|
||||
i = 0
|
||||
iterator = ElementTree.iterparse(inputdir + "/Users.xml")
|
||||
iterator = etree.iterparse(inputdir + "/Users.xml", events=("start", "end"))
|
||||
_, root = next(iterator)
|
||||
with jsonlines.open(outputdir + '/Usernames.jsonl', mode="w") as usernames:
|
||||
for event, element in iterator:
|
||||
displayname = element.get('DisplayName')
|
||||
|
@ -58,6 +63,7 @@ def parse_usernames(inputdir, outputdir):
|
|||
usernames.write(displayname)
|
||||
element.clear()
|
||||
if i % 100 == 0:
|
||||
root.clear()
|
||||
print(i, end="\r")
|
||||
i += 1
|
||||
print_stats(i)
|
||||
|
|
|
@ -41,7 +41,7 @@ def generate_chain(sourcedir, chainfile, mode):
|
|||
chainlist.append(chain)
|
||||
if i % 100 == 0:
|
||||
print(i, end="\r")
|
||||
if i % 1000 == 0:
|
||||
if i % 10000 == 0:
|
||||
subtotal_chain = markovify.combine(chainlist)
|
||||
if not combined_cains:
|
||||
combined_cains = subtotal_chain
|
||||
|
|
2
utils.py
2
utils.py
|
@ -24,7 +24,7 @@ def html2text(body):
|
|||
soup = BeautifulSoup(body, "lxml")
|
||||
for code in soup.find_all("code"):
|
||||
code.decompose()
|
||||
return soup.get_text()
|
||||
return str(soup.get_text())
|
||||
|
||||
|
||||
def get_files():
|
||||
|
|
Loading…
Reference in a new issue