import jsonlines from lxml import etree from utils import * def parse_posts(inputdir, outputdir): i = 0 skipped = 0 iterator = etree.iterparse(inputdir + "/Posts.xml", events=("start", "end")) _, root = next(iterator) with jsonlines.open(outputdir + "/Questions.jsonl", mode="w") as questions, \ jsonlines.open(outputdir + "/Answers.jsonl", mode="w") as answers, \ jsonlines.open(outputdir + "/Titles.jsonl", "w") as titles: for event, element in iterator: title = element.get("Title") # if element.get("Score") and int(element.get("Score")) <= 10: # skipped += 1 # element.clear() # continue if title: titles.write(title) body = element.get("Body") if body: text = html2text(body) if element.get("PostTypeId") == "1": questions.write(text) else: answers.write(text) element.clear() if i % 100 == 0: root.clear() print(i, skipped, i + skipped, end="\r") i += 1 print_stats(i, skipped) def parse_comments(inputdir, outputdir): i = 0 iterator = etree.iterparse(inputdir + "/Comments.xml", events=("start", "end")) _, root = next(iterator) with jsonlines.open(outputdir + "/Comments.jsonl", mode="w") as comments: for event, element in iterator: text = element.get("Text") if text: comments.write(text) element.clear() if i % 100 == 0: root.clear() print(i, end="\r") i += 1 print_stats(i) def parse_usernames(inputdir, outputdir): i = 0 iterator = etree.iterparse(inputdir + "/Users.xml", events=("start", "end")) _, root = next(iterator) with jsonlines.open(outputdir + "/Usernames.jsonl", mode="w") as usernames: for event, element in iterator: displayname = element.get("DisplayName") if displayname: usernames.write(displayname) element.clear() if i % 100 == 0: root.clear() print(i, end="\r") i += 1 print_stats(i) if __name__ == "__main__": settings = get_settings(1) # parse_posts(settings) # parse_comments(settings) # parse_comments(settings)