1
0
Fork 0

stack_analysis

This commit is contained in:
Lukas Winkler 2019-02-18 16:26:19 +01:00
parent 449d293f85
commit ee53e8cc7c
3 changed files with 5846 additions and 0 deletions

5727
pcatests/data.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,67 @@
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Widen NumPy's printed output so that matrix rows are not wrapped until
# 1000 characters, and show 4 items at each edge when an array is summarised.
np.set_printoptions(linewidth=1000, edgeitems=4)
def print_heading(text, width=80, fill="-"):
    """Print *text* as a section banner centred in a line of fill characters.

    The text is surrounded by one space on each side and then centred with
    ``str.center``; if the padded text is already at least *width* characters
    long it is printed without any fill.

    Args:
        text: Heading to display (formatted via an f-string, so any object
            is accepted).
        width: Total banner width in characters. Defaults to 80.
        fill: Single character used to pad both sides. Defaults to "-".
    """
    banner = f" {text} "
    print(banner.center(width, fill))
# --- input ------------------------------------------------------------------
# One row per sample. The columns are presumably in the order given by
# `labels` -- verify against the script that writes data.txt.
data = np.loadtxt("data.txt")
labels = [
    "score",
    "age",
    "view_count",
    "body_length",
    "answer_count",
    "comment_count",
    "favourite_count",
]

print_heading("raw data")
print(data)

# --- standardise ------------------------------------------------------------
# Scale the columns with StandardScaler before the PCA. (To run the PCA on
# the unscaled values instead, feed `data` directly in place of `x`.)
print_heading("scaled")
scaler = StandardScaler()
x = scaler.fit(data).transform(data)
print(x)

# --- fit the PCA ------------------------------------------------------------
n = 7  # number of principal components to keep
pca = PCA(n_components=n).fit(x)

print_heading("components")
components = pca.components_  # principal axes (eigenvectors of the covariance matrix)
print(components.shape)
print(components)

print_heading("explained_variance")
print(pca.explained_variance_)  # the n largest eigenvalues of the covariance matrix
print(pca.explained_variance_ratio_, "(as ratio)")

print_heading("covariance")
cov = pca.get_covariance()  # the covariance matrix itself, reused for the plot below
print(cov.shape)
print(cov)

# --- project and reconstruct ------------------------------------------------
print_heading("transformed")
x_new = pca.transform(x)
print(x_new.shape)
print(x_new)

# Map the projected samples back: first undo the PCA, then undo the scaling.
print_heading("inverse transformed and undone scale")
x_unprojected = pca.inverse_transform(x_new)
x_simple = scaler.inverse_transform(x_unprojected)
print(x_simple.shape)
print(x_simple)
print(pca.explained_variance_)

# --- plots ------------------------------------------------------------------
# Column 0 against column 4 (score vs. answer_count if the columns follow
# `labels`), original samples first and reconstructed samples on top.
plt.scatter(data[:, 0], data[:, 4], s=1)
plt.scatter(x_simple[:, 0], x_simple[:, 4], s=1)
plt.show()

# Cumulative share of the variance explained by the leading components.
plt.plot(pca.explained_variance_ratio_.cumsum())
plt.show()

# Heat map of the covariance matrix estimated by the PCA, labelled per feature.
plt.matshow(cov)
tick_positions = range(len(labels))
plt.xticks(tick_positions, labels, rotation=90)
plt.yticks(tick_positions, labels)
plt.colorbar()
plt.show()

52
pcatests/stack_import.py Normal file
View file

@ -0,0 +1,52 @@
from dataclasses import dataclass
from datetime import datetime
from typing import List
from xml.dom import minidom
# Fixed reference instant (a naive datetime); Post.age is measured as the
# number of seconds between a post's creation_date and this value.
now = datetime(2018, 5, 1, 0, 0, 0, 0)
import numpy as np
@dataclass
class Post:
    """Numeric features of a single post.

    The stored fields are plain counts plus the creation timestamp; ``age``
    and ``as_list`` are derived views used to build a flat numeric row.
    """

    score: int
    creation_date: datetime  # subtracted from module-level `now` in `age`
    view_count: int
    body_length: int
    answer_count: int
    comment_count: int
    favourite_count: int

    @property
    def age(self) -> int:
        """Return the whole seconds elapsed from ``creation_date`` to ``now``.

        ``now`` is looked up in the module's globals at call time; the
        difference is rounded to the nearest integer second.
        """
        elapsed = now - self.creation_date
        return round(elapsed.total_seconds())

    @property
    def as_list(self) -> List[int]:
        """Return the features as a flat list.

        Order: score, age, view_count, body_length, answer_count,
        comment_count, favourite_count. Note that ``age`` takes the place
        of ``creation_date``.
        """
        return [
            self.score,
            self.age,
            self.view_count,
            self.body_length,
            self.answer_count,
            self.comment_count,
            self.favourite_count,
        ]
# Posts.xml file to read; every <row> element in it describes one post.
inputfile = "/media/datenneu/se-simulator/raw/astronomy.stackexchange.com/Posts.xml"
xmldoc = minidom.parse(inputfile)
itemlist = xmldoc.getElementsByTagName("row")

rawdata = []
for row in itemlist:
    attrs = row.attributes
    # Keep only rows whose PostTypeId is "1" (presumably questions --
    # confirm against the dump's schema); everything else is skipped.
    if attrs["PostTypeId"].value != "1":
        continue
    # Drop the fractional seconds so the timestamp matches the strptime
    # format below. Taking everything before the first "." also leaves a
    # timestamp without a fractional part intact instead of emptying it.
    basictime = attrs["CreationDate"].value.split(".", 1)[0]
    # FavoriteCount is not present on every row; treat a missing one as 0.
    favourite_count = int(attrs["FavoriteCount"].value) if "FavoriteCount" in attrs else 0
    post = Post(
        score=int(attrs["Score"].value),
        creation_date=datetime.strptime(basictime, "%Y-%m-%dT%H:%M:%S"),
        view_count=int(attrs["ViewCount"].value),
        body_length=len(attrs["Body"].value),
        answer_count=int(attrs["AnswerCount"].value),
        comment_count=int(attrs["CommentCount"].value),
        favourite_count=favourite_count,
    )
    rawdata.append(post.as_list)

# One row per kept post, columns in the order produced by Post.as_list.
data = np.array(rawdata)
print(data.dtype)
# Write the rows as integers, one post per line.
np.savetxt("data.txt", data, fmt="%d")