stack_analysis
This commit is contained in:
parent
449d293f85
commit
ee53e8cc7c
3 changed files with 5846 additions and 0 deletions
5727
pcatests/data.txt
Normal file
5727
pcatests/data.txt
Normal file
File diff suppressed because it is too large
Load diff
67
pcatests/stack_analysis.py
Normal file
67
pcatests/stack_analysis.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
# Let NumPy print wide rows (up to 1000 characters per line) and show 4 items
# at each edge when an array is summarized, so printed matrices stay readable.
np.set_printoptions(linewidth=1000, edgeitems=4)
|
||||
|
||||
|
||||
def print_heading(text):
    """Print *text* as a section banner.

    The text is surrounded by one space on each side and centred in an
    80-character line padded with ``-``. Text too long to fit is printed
    without any padding.
    """
    banner = " {} ".format(text)
    print(banner.center(80, "-"))
|
||||
|
||||
|
||||
# Load the whitespace-separated feature matrix written to data.txt.
# `labels` names the columns and is used below as tick labels.
data = np.loadtxt("data.txt")
labels = [
    "score",
    "age",
    "view_count",
    "body_length",
    "answer_count",
    "comment_count",
    "favourite_count",
]

# ---- raw data ----------------------------------------------------------
print_heading("raw data")
print(data)

# ---- scaled ------------------------------------------------------------
print_heading("scaled")
# Standardize every column before the PCA; the fitted scaler is kept so the
# reconstruction further down can be mapped back to the original units.
scaler = StandardScaler().fit(data)
x = scaler.transform(data)
# (Use `x = data` instead to run the PCA on the unscaled values.)
print(x)

# Number of principal components to keep.
n = 7
pca = PCA(n_components=n).fit(x)

# ---- components --------------------------------------------------------
print_heading("components")
print(pca.components_.shape)  # eigenvectors of covariance matrix
print(pca.components_)

# ---- explained variance ------------------------------------------------
print_heading("explained_variance")
print(pca.explained_variance_)  # n largest eigenvalues of covariance matrix
print(pca.explained_variance_ratio_, "(as ratio)")

# ---- covariance --------------------------------------------------------
print_heading("covariance")
# Covariance matrix as estimated by the fitted PCA model; it is printed here
# and plotted at the end of the script.
cov = pca.get_covariance()
print(cov.shape)
print(cov)

# ---- transformed -------------------------------------------------------
print_heading("transformed")
x_new = pca.transform(x)
print(x_new.shape)
print(x_new)

# ---- inverse transformed and undone scale ------------------------------
print_heading("inverse transformed and undone scale")
# Go back from component space to scaled feature space, then undo the scaling.
x_reconstructed_scaled = pca.inverse_transform(x_new)
x_simple = scaler.inverse_transform(x_reconstructed_scaled)
print(x_simple.shape)
print(x_simple)

print(pca.explained_variance_)

# Column 0 against column 4 ("score" vs. "answer_count" in `labels`):
# original data first, reconstruction drawn on top.
plt.scatter(data[:, 0], data[:, 4], s=1)
plt.scatter(x_simple[:, 0], x_simple[:, 4], s=1)
plt.show()

# Cumulative share of variance explained by the first k components.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_ratio)
plt.show()

# Plot the covariance matrix of the scaled data as a labelled heat map.
tick_positions = range(len(labels))
plt.matshow(cov)
plt.xticks(tick_positions, labels, rotation=90)
plt.yticks(tick_positions, labels)
plt.colorbar()
plt.show()
|
52
pcatests/stack_import.py
Normal file
52
pcatests/stack_import.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
from xml.dom import minidom
|
||||
|
||||
# Fixed reference instant (naive datetime) from which Post.age is measured.
# NOTE(review): presumably chosen to match the date of the data dump — confirm.
now = datetime(2018, 5, 1, 0, 0, 0, 0)
|
||||
import numpy as np
|
||||
|
||||
|
||||
@dataclass
class Post:
    """Numeric features of one post.

    The fields are stored as given; ``age`` and ``as_list`` are derived
    from them on access.
    """

    score: int
    creation_date: datetime
    view_count: int
    body_length: int
    answer_count: int
    comment_count: int
    favourite_count: int

    @property
    def age(self) -> int:
        """Seconds from ``creation_date`` to the module-level ``now``, rounded."""
        elapsed = now - self.creation_date
        return round(elapsed.total_seconds())

    @property
    def as_list(self) -> List:
        """Feature values as a flat list.

        Order: score, age, view_count, body_length, answer_count,
        comment_count, favourite_count.
        """
        feature_names = (
            "score",
            "age",
            "view_count",
            "body_length",
            "answer_count",
            "comment_count",
            "favourite_count",
        )
        return [getattr(self, name) for name in feature_names]
|
||||
|
||||
|
||||
# Location of the Posts.xml dump to convert (machine-specific absolute path).
inputfile = "/media/datenneu/se-simulator/raw/astronomy.stackexchange.com/Posts.xml"
xmldoc = minidom.parse(inputfile)
itemlist = xmldoc.getElementsByTagName("row")

# One feature list (see Post.as_list) per kept <row> element.
rawdata = []
for row in itemlist:
    # Only rows with PostTypeId "1" are exported; every other post type is skipped.
    if row.attributes["PostTypeId"].value != "1":
        continue

    # Strip the fractional seconds so the value matches the strptime format
    # below. partition() returns the whole value when there is no "." at all,
    # so a timestamp without decimals is parsed instead of becoming "".
    basictime = row.attributes["CreationDate"].value.partition(".")[0]

    # FavoriteCount is not present on every row; a missing attribute counts as 0.
    if "FavoriteCount" in row.attributes:
        favourite_count = int(row.attributes["FavoriteCount"].value)
    else:
        favourite_count = 0

    post = Post(
        score=int(row.attributes["Score"].value),
        creation_date=datetime.strptime(basictime, "%Y-%m-%dT%H:%M:%S"),
        view_count=int(row.attributes["ViewCount"].value),
        # Length of the Body attribute in characters (markup included).
        body_length=len(row.attributes["Body"].value),
        answer_count=int(row.attributes["AnswerCount"].value),
        comment_count=int(row.attributes["CommentCount"].value),
        favourite_count=favourite_count,
    )
    rawdata.append(post.as_list)

# Write the feature matrix as integer text, one post per line, to data.txt
# in the current working directory.
data = np.array(rawdata)
print(data.dtype)
np.savetxt("data.txt", data, fmt="%d")
|
Reference in a new issue