52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from typing import List
|
|
from xml.dom import minidom
|
|
|
|
# Fixed reference instant (midnight, 1 May 2018) against which post ages are
# measured. It is a constant, not the current wall-clock time, and it is
# naive (carries no timezone information).
now = datetime(year=2018, month=5, day=1)
|
|
import numpy as np
|
|
|
|
|
|
@dataclass
class Post:
    """Numeric features describing a single post.

    Every field is stored exactly as passed to the constructor. ``age`` and
    ``as_list`` are derived from the fields each time they are accessed.
    """

    score: int
    creation_date: datetime
    view_count: int
    body_length: int
    answer_count: int
    comment_count: int
    favourite_count: int

    @property
    def age(self) -> int:
        """Return the rounded number of seconds from ``creation_date`` to ``now``.

        ``now`` is the module-level reference datetime, so the result is
        negative when ``creation_date`` lies after it. Both values must be
        of the same kind (naive or timezone-aware) for the subtraction to
        succeed.
        """
        return round((now - self.creation_date).total_seconds())

    @property
    def as_list(self) -> List[int]:
        """Return the features as a flat list of integers.

        Order: score, age, view_count, body_length, answer_count,
        comment_count, favourite_count.
        """
        return [
            self.score,
            self.age,
            self.view_count,
            self.body_length,
            self.answer_count,
            self.comment_count,
            self.favourite_count,
        ]
|
|
|
|
|
|
# Build one numeric feature row per question found in a Stack Exchange
# Posts.xml dump and write the resulting table to data.txt.
inputfile = "/media/datenneu/se-simulator/raw/astronomy.stackexchange.com/Posts.xml"
xmldoc = minidom.parse(inputfile)
itemlist = xmldoc.getElementsByTagName("row")
rawdata = []
for s in itemlist:
    # Keep only rows whose PostTypeId is "1" (questions in the Stack
    # Exchange dump schema); every other row is skipped.
    if s.attributes["PostTypeId"].value != "1":
        continue

    # Get rid of decimal seconds. rpartition() gives the same result as
    # joining split(".")[:-1] whenever a "." is present, but a timestamp
    # without a fractional part is kept as-is instead of collapsing to ""
    # (which made strptime raise ValueError).
    creation_raw = s.attributes["CreationDate"].value
    head, dot, _fraction = creation_raw.rpartition(".")
    basictime = head if dot else creation_raw

    # FavoriteCount is optional on a row; a missing attribute counts as 0.
    favourite_count = int(s.attributes["FavoriteCount"].value) if "FavoriteCount" in s.attributes else 0

    post = Post(
        score=int(s.attributes["Score"].value),
        creation_date=datetime.strptime(basictime, "%Y-%m-%dT%H:%M:%S"),
        view_count=int(s.attributes["ViewCount"].value),
        body_length=len(s.attributes["Body"].value),
        answer_count=int(s.attributes["AnswerCount"].value),
        comment_count=int(s.attributes["CommentCount"].value),
        favourite_count=favourite_count,
    )
    # Each row follows the order produced by Post.as_list: score, age,
    # view_count, body_length, answer_count, comment_count, favourite_count.
    rawdata.append(post.as_list)

data = np.array(rawdata)
print(data.dtype)
# Integer formatting: every collected feature is an int.
np.savetxt("data.txt", data, fmt="%d")
|