1
0
Fork 0
mirror of https://github.com/Findus23/se-simulator.git synced 2024-09-19 15:53:45 +02:00
se-simulator/parsexml.py
2018-03-15 21:38:18 +01:00

80 lines
2.4 KiB
Python

import resource
from xml.etree import ElementTree
import jsonlines
from bs4 import BeautifulSoup
import utils
def print_stats(i, skipped=None):
    """Print a short run summary to stdout.

    Emits the total entry count, the skipped count (only when it is truthy,
    so ``None`` and ``0`` are both silent) and the peak resident set size of
    the current process.

    :param i: number of entries that were processed.
    :param skipped: optional number of entries that were skipped.
    """
    report = ["{number} total entries".format(number=i)]
    if skipped:
        report.append("{number} skipped".format(number=skipped))
    # NOTE(review): the division assumes ru_maxrss is reported in kilobytes
    # (as on Linux) so that the result is megabytes -- confirm on other OSes.
    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    report.append("used {mb}MB".format(mb=peak_rss // 1024))
    for line in report:
        print(line)
def parse_posts(basedir):
    """Split ``Posts.xml`` into title, question and answer JSON-lines files.

    Streams ``<basedir>/Posts.xml`` and, for every element whose end tag is
    seen (the document root included):

    * writes its ``Title`` attribute, when non-empty, to ``Titles.jsonl``;
    * extracts the text of its ``Body`` attribute with BeautifulSoup (lxml
      parser) and writes it to ``Questions.jsonl`` when ``PostTypeId`` is
      ``"1"``, otherwise to ``Answers.jsonl``.

    The running count is printed every 100 elements and a summary is printed
    through ``print_stats`` once the file has been consumed.

    :param basedir: directory that contains ``Posts.xml``; the three output
        files are created in the same directory.
    """
    i = 0
    # Stays 0 for as long as the score filter below remains disabled.
    skipped = 0
    # "start" events are requested only so the root element can be captured
    # up front; all real work still happens on "end" events.
    iterator = ElementTree.iterparse(
        basedir + "/Posts.xml", events=("start", "end")
    )
    with jsonlines.open(basedir + '/Questions.jsonl', mode="w") as questions, \
            jsonlines.open(basedir + '/Answers.jsonl', mode="w") as answers, \
            jsonlines.open(basedir + "/Titles.jsonl", mode="w") as titles:
        _, root = next(iterator)
        for event, element in iterator:
            if event != "end":
                continue
            title = element.get('Title')
            # if element.get('Score') and int(element.get('Score')) > 2:
            #     skipped += 1
            #     continue
            if title:
                titles.write(title)
            body = element.get('Body')
            if body:
                soup = BeautifulSoup(body, "lxml")
                text = soup.get_text()
                if element.get('PostTypeId') == "1":
                    questions.write(text)
                else:
                    answers.write(text)
            element.clear()
            # clear() only empties the element; it is still referenced by the
            # root. Detach the processed children so memory use does not grow
            # with the number of rows. Root attributes are left untouched.
            del root[:]
            if i % 100 == 0:
                print(i)
            i += 1
    print_stats(i, skipped)
def parse_comments(basedir):
    """Extract comment texts from ``Comments.xml`` into ``Comments.jsonl``.

    Streams ``<basedir>/Comments.xml`` and writes the ``Text`` attribute of
    every element whose end tag is seen (skipping elements where it is
    missing or empty) as one JSON line each. The running count is printed
    every 100 elements and a summary is printed through ``print_stats``.

    :param basedir: directory that contains ``Comments.xml``; the output
        file is created in the same directory.
    """
    i = 0
    # "start" events are requested only so the root element can be captured
    # up front; all real work still happens on "end" events.
    iterator = ElementTree.iterparse(
        basedir + "/Comments.xml", events=("start", "end")
    )
    with jsonlines.open(basedir + '/Comments.jsonl', mode="w") as comments:
        _, root = next(iterator)
        for event, element in iterator:
            if event != "end":
                continue
            text = element.get('Text')
            if text:
                comments.write(text)
            element.clear()
            # clear() only empties the element; it is still referenced by the
            # root. Detach the processed children so memory use does not grow
            # with the number of rows. Root attributes are left untouched.
            del root[:]
            if i % 100 == 0:
                print(i)
            i += 1
    print_stats(i)
def parse_usernames(basedir):
    """Extract display names from ``Users.xml`` into ``Usernames.jsonl``.

    Streams ``<basedir>/Users.xml`` and writes the ``DisplayName`` attribute
    of every element whose end tag is seen (skipping elements where it is
    missing or empty) as one JSON line each. The running count is printed
    every 100 elements and a summary is printed through ``print_stats``.

    :param basedir: directory that contains ``Users.xml``; the output file
        is created in the same directory.
    """
    i = 0
    # "start" events are requested only so the root element can be captured
    # up front; all real work still happens on "end" events.
    iterator = ElementTree.iterparse(
        basedir + "/Users.xml", events=("start", "end")
    )
    with jsonlines.open(basedir + '/Usernames.jsonl', mode="w") as usernames:
        _, root = next(iterator)
        for event, element in iterator:
            if event != "end":
                continue
            displayname = element.get('DisplayName')
            if displayname:
                usernames.write(displayname)
            element.clear()
            # clear() only empties the element; it is still referenced by the
            # root. Detach the processed children so memory use does not grow
            # with the number of rows. Root attributes are left untouched.
            del root[:]
            if i % 100 == 0:
                print(i)
            i += 1
    print_stats(i)
if __name__ == "__main__":
    # The value returned by utils.get_settings(1) is handed to every parser
    # as its `basedir` argument.
    # NOTE(review): presumably a directory path taken from the command line --
    # confirm against utils.get_settings.
    settings = utils.get_settings(1)
    parse_posts(settings)
    parse_comments(settings)
    # Previously parse_comments ran a second time here, so Usernames.jsonl
    # was never generated.
    parse_usernames(settings)