1
0
Fork 0
mirror of https://github.com/Findus23/se-simulator.git synced 2024-09-19 15:53:45 +02:00
se-simulator/parsexml.py

77 lines
2.4 KiB
Python
Raw Normal View History

2018-03-10 19:28:02 +01:00
import jsonlines
2018-04-27 20:48:32 +02:00
from lxml import etree
2018-03-10 19:28:02 +01:00
2018-03-16 18:48:54 +01:00
from utils import *
2018-03-10 19:28:02 +01:00
2018-03-15 21:38:18 +01:00
2018-03-16 18:48:54 +01:00
def parse_posts(inputdir, outputdir):
    """Stream ``Posts.xml`` and split its contents into JSON Lines files.

    Reads ``<inputdir>/Posts.xml`` incrementally and writes into
    ``outputdir``:

    * ``Titles.jsonl`` -- the ``Title`` attribute of every element that
      has a non-empty one,
    * ``Questions.jsonl`` -- the converted ``Body`` of elements whose
      ``PostTypeId`` attribute equals ``"1"``,
    * ``Answers.jsonl`` -- the converted ``Body`` of all other elements
      that have a non-empty ``Body``.

    Bodies are passed through ``html2text`` before being written
    (``html2text`` is not defined in this view; presumably it comes from
    the ``utils`` star import -- confirm).

    After the input is exhausted, ``print_stats`` is called with the number
    of elements processed and the number skipped.

    Args:
        inputdir: Directory containing ``Posts.xml``.
        outputdir: Directory the ``.jsonl`` files are written to.
    """
    i = 0
    # Never incremented while the score filter below stays disabled.
    skipped = 0
    # "start" events are requested only so that the very first event hands
    # us the root element, which is needed for the periodic root.clear().
    iterator = etree.iterparse(inputdir + "/Posts.xml", events=("start", "end"))
    _, root = next(iterator)
    with jsonlines.open(outputdir + "/Questions.jsonl", mode="w") as questions, \
            jsonlines.open(outputdir + "/Answers.jsonl", mode="w") as answers, \
            jsonlines.open(outputdir + "/Titles.jsonl", mode="w") as titles:
        for event, element in iterator:
            # Handle each element exactly once, when it is complete.
            # Without this guard every element is visited on "start" and
            # again on "end", which doubles the reported count. The root's
            # own closing event is not a record either.
            if event != "end" or element is root:
                continue
            title = element.get("Title")
            # Disabled score filter, kept for reference:
            # if element.get("Score") and int(element.get("Score")) <= 10:
            #     skipped += 1
            #     element.clear()
            #     continue
            if title:
                titles.write(title)
            body = element.get("Body")
            if body:
                text = html2text(body)
                if element.get("PostTypeId") == "1":
                    questions.write(text)
                else:
                    answers.write(text)
            # Release the element's data as soon as it has been written.
            element.clear()
            if i % 100 == 0:
                # Drop the already-processed children still attached to the
                # root so the tree does not grow with the input size.
                root.clear()
                print(i, skipped, i + skipped, end="\r")
            i += 1
    print_stats(i, skipped)
2018-03-16 18:48:54 +01:00
def parse_comments(inputdir, outputdir):
    """Stream ``Comments.xml`` and write every comment text as JSON Lines.

    Reads ``<inputdir>/Comments.xml`` incrementally and writes the ``Text``
    attribute of each element that has a non-empty one to
    ``<outputdir>/Comments.jsonl``. After the input is exhausted,
    ``print_stats`` is called with the number of elements processed.

    Args:
        inputdir: Directory containing ``Comments.xml``.
        outputdir: Directory ``Comments.jsonl`` is written to.
    """
    i = 0
    # "start" events are requested only so that the very first event hands
    # us the root element, which is needed for the periodic root.clear().
    iterator = etree.iterparse(inputdir + "/Comments.xml", events=("start", "end"))
    _, root = next(iterator)
    with jsonlines.open(outputdir + "/Comments.jsonl", mode="w") as comments:
        for event, element in iterator:
            # Handle each element exactly once, when it is complete.
            # Without this guard every element is visited on "start" and
            # again on "end", which doubles the reported count. The root's
            # own closing event is not a record either.
            if event != "end" or element is root:
                continue
            text = element.get("Text")
            if text:
                comments.write(text)
            # Release the element's data as soon as it has been written.
            element.clear()
            if i % 100 == 0:
                # Drop the already-processed children still attached to the
                # root so the tree does not grow with the input size.
                root.clear()
                print(i, end="\r")
            i += 1
    print_stats(i)
2018-03-16 18:48:54 +01:00
def parse_usernames(inputdir, outputdir):
    """Stream ``Users.xml`` and write every display name as JSON Lines.

    Reads ``<inputdir>/Users.xml`` incrementally and writes the
    ``DisplayName`` attribute of each element that has a non-empty one to
    ``<outputdir>/Usernames.jsonl``. After the input is exhausted,
    ``print_stats`` is called with the number of elements processed.

    Args:
        inputdir: Directory containing ``Users.xml``.
        outputdir: Directory ``Usernames.jsonl`` is written to.
    """
    i = 0
    # "start" events are requested only so that the very first event hands
    # us the root element, which is needed for the periodic root.clear().
    iterator = etree.iterparse(inputdir + "/Users.xml", events=("start", "end"))
    _, root = next(iterator)
    with jsonlines.open(outputdir + "/Usernames.jsonl", mode="w") as usernames:
        for event, element in iterator:
            # Handle each element exactly once, when it is complete.
            # Without this guard every element is visited on "start" and
            # again on "end", which doubles the reported count. The root's
            # own closing event is not a record either.
            if event != "end" or element is root:
                continue
            displayname = element.get("DisplayName")
            if displayname:
                usernames.write(displayname)
            # Release the element's data as soon as it has been written.
            element.clear()
            if i % 100 == 0:
                # Drop the already-processed children still attached to the
                # root so the tree does not grow with the input size.
                root.clear()
                print(i, end="\r")
            i += 1
    print_stats(i)
if __name__ == "__main__":
    # Running the module as a script currently only loads the settings;
    # every parser call below is disabled.
    # NOTE(review): the disabled calls pass a single ``settings`` argument,
    # while the parsers in this file take (inputdir, outputdir). The last
    # call also repeats parse_comments -- possibly parse_usernames was
    # intended. Confirm before re-enabling.
    settings = get_settings(1)
    # parse_posts(settings)
    # parse_comments(settings)
    # parse_comments(settings)