2018-03-10 19:28:02 +01:00
|
|
|
from xml.etree import ElementTree
|
|
|
|
|
|
|
|
import jsonlines
|
|
|
|
|
2018-03-16 18:48:54 +01:00
|
|
|
from utils import *
|
2018-03-10 19:28:02 +01:00
|
|
|
|
2018-03-15 21:38:18 +01:00
|
|
|
|
2018-03-16 18:48:54 +01:00
|
|
|
def parse_posts(inputdir, outputdir):
    """Split a Posts.xml dump into title, question and answer files.

    Streams ``<inputdir>/Posts.xml`` and writes three JSON Lines files into
    ``outputdir``:

    * ``Titles.jsonl``    - the ``Title`` attribute of every element that
      has a non-empty one.
    * ``Questions.jsonl`` - the ``Body`` attribute, passed through
      ``html2text``, of elements whose ``PostTypeId`` is ``"1"``.
    * ``Answers.jsonl``   - the ``Body`` attribute, passed through
      ``html2text``, of every other element that has a non-empty body
      (no further check on ``PostTypeId`` is made).

    When parsing is finished the number of processed elements and the
    number of skipped ones are handed to ``print_stats``.

    Args:
        inputdir: Directory containing ``Posts.xml``.
        outputdir: Directory that receives the ``.jsonl`` files.
    """
    i = 0
    # No filtering is applied at the moment (the score-based filter is
    # disabled), so this counter stays at 0 and is only reported.
    skipped = 0
    # "start" events are requested solely to get hold of the root element
    # before any of its children has been completed.
    iterator = ElementTree.iterparse(
        inputdir + "/Posts.xml", events=("start", "end")
    )
    root = None
    with jsonlines.open(outputdir + '/Questions.jsonl', mode="w") as questions, \
            jsonlines.open(outputdir + '/Answers.jsonl', mode="w") as answers, \
            jsonlines.open(outputdir + "/Titles.jsonl", "w") as titles:
        for event, element in iterator:
            if event == "start":
                if root is None:
                    root = element
                continue

            title = element.get('Title')
            if title:
                titles.write(title)

            body = element.get('Body')
            if body:
                text = html2text(body)
                if element.get('PostTypeId') == "1":
                    questions.write(text)
                else:
                    answers.write(text)

            element.clear()
            # clear() empties the element, but the root would still keep a
            # reference to every emptied child and grow with the input size.
            # Dropping the root's children keeps memory flat; slice deletion
            # (unlike root.clear()) leaves the root's own attributes intact.
            if element is not root:
                del root[:]

            if i % 100 == 0:
                print(i, end="\r")
            i += 1
    print_stats(i, skipped)
|
|
|
|
|
|
|
|
|
2018-03-16 18:48:54 +01:00
|
|
|
def parse_comments(inputdir, outputdir):
    """Extract the text of every comment from a Comments.xml dump.

    Streams ``<inputdir>/Comments.xml`` and writes the ``Text`` attribute of
    each element that has a non-empty one to ``<outputdir>/Comments.jsonl``.
    When parsing is finished the number of processed elements is handed to
    ``print_stats``.

    Args:
        inputdir: Directory containing ``Comments.xml``.
        outputdir: Directory that receives ``Comments.jsonl``.
    """
    i = 0
    # "start" events are requested solely to get hold of the root element
    # before any of its children has been completed.
    iterator = ElementTree.iterparse(
        inputdir + "/Comments.xml", events=("start", "end")
    )
    root = None
    with jsonlines.open(outputdir + '/Comments.jsonl', mode="w") as comments:
        for event, element in iterator:
            if event == "start":
                if root is None:
                    root = element
                continue

            text = element.get('Text')
            if text:
                comments.write(text)

            element.clear()
            # clear() empties the element, but the root would still keep a
            # reference to every emptied child and grow with the input size.
            # Slice deletion (unlike root.clear()) leaves the root's own
            # attributes intact.
            if element is not root:
                del root[:]

            if i % 100 == 0:
                print(i, end="\r")
            i += 1
    print_stats(i)
|
|
|
|
|
|
|
|
|
2018-03-16 18:48:54 +01:00
|
|
|
def parse_usernames(inputdir, outputdir):
    """Extract the display name of every user from a Users.xml dump.

    Streams ``<inputdir>/Users.xml`` and writes the ``DisplayName`` attribute
    of each element that has a non-empty one to
    ``<outputdir>/Usernames.jsonl``. When parsing is finished the number of
    processed elements is handed to ``print_stats``.

    Args:
        inputdir: Directory containing ``Users.xml``.
        outputdir: Directory that receives ``Usernames.jsonl``.
    """
    i = 0
    # "start" events are requested solely to get hold of the root element
    # before any of its children has been completed.
    iterator = ElementTree.iterparse(
        inputdir + "/Users.xml", events=("start", "end")
    )
    root = None
    with jsonlines.open(outputdir + '/Usernames.jsonl', mode="w") as usernames:
        for event, element in iterator:
            if event == "start":
                if root is None:
                    root = element
                continue

            displayname = element.get('DisplayName')
            if displayname:
                usernames.write(displayname)

            element.clear()
            # clear() empties the element, but the root would still keep a
            # reference to every emptied child and grow with the input size.
            # Slice deletion (unlike root.clear()) leaves the root's own
            # attributes intact.
            if element is not root:
                del root[:]

            if i % 100 == 0:
                print(i, end="\r")
            i += 1
    print_stats(i)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: only the settings are loaded here; none of the
    # parse_* functions is invoked at the moment.
    # NOTE(review): the calls that used to be commented out here passed
    # `settings` as a single argument, whereas the parse_* functions take
    # (inputdir, outputdir) - confirm how the settings map onto those two
    # arguments before wiring the calls back in.
    settings = get_settings(1)
|