Mirror of https://github.com/Findus23/halo_comparison.git

initial version

Lukas Winkler 2022-05-04 13:42:57 +02:00
commit ef8046ff86
5 changed files with 255 additions and 0 deletions

.gitignore vendored Normal file (+3)

@@ -0,0 +1,3 @@
.idea/
*.png
*.csv

main.py Normal file (+139)

@@ -0,0 +1,139 @@
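# Match FOF halos between a low-resolution reference run and a higher-resolution
# comparison run: for each reference halo, rescale its particle IDs with IDScaler,
# pick the comparison halo that shares the most particles, and save the matched
# sizes and masses to sizes.csv.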
from pathlib import Path

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from pandas import DataFrame

from readfiles import read_file, read_halo_file
from remap_particle_IDs import IDScaler

REFERENCE_RESOLUTION = 128
COMPARISON_RESOLUTION = 512
PLOT = True
SINGLE = False

reference_dir = Path(f"/home/lukas/monofonic_tests/shannon_{REFERENCE_RESOLUTION}_100")
comparison_dir = Path(f"/home/lukas/monofonic_tests/shannon_{COMPARISON_RESOLUTION}_100/")
# comparison_dir = Path(f"/home/lukas/monofonic_tests/DB8_{COMPARISON_RESOLUTION}_100/")

ref_masses = []
comp_masses = []
ref_sizes = []
comp_sizes = []

print("reading reference file")
df_ref = read_file(reference_dir)
df_ref_halo = read_halo_file(reference_dir)

print("reading comparison file")
df_comp = read_file(comparison_dir)
df_comp_halo = read_halo_file(comparison_dir)

bytes_used = df_ref.memory_usage(index=True).sum()
print(f"Memory: {bytes_used / 1024 / 1024:.2f} MB")
print(df_ref.dtypes)

for index, original_halo in df_ref_halo[:5].iterrows():
    print(index)
    print(len(df_ref))
    particles_in_ref_halo = df_ref.loc[df_ref["FOFGroupIDs"] == index]
    ref_halo = df_ref_halo.loc[index]
    halo_particle_ids = set(particles_in_ref_halo.index.to_list())

    if REFERENCE_RESOLUTION < COMPARISON_RESOLUTION:
        print("upscaling IDs")
        upscaled_ids = set()
        prev_len = len(halo_particle_ids)
        print(prev_len)
        scaler = IDScaler(REFERENCE_RESOLUTION, COMPARISON_RESOLUTION)
        # i = 0
        for id in halo_particle_ids:
            # i += 1
            # if i % 1000 == 0:
            #     print(i)
            upscaled_ids.update(set(scaler.upscale(id)))
        halo_particle_ids = upscaled_ids
        after_len = len(upscaled_ids)
        print(after_len)
        print(after_len / prev_len)
        print("done")

    if COMPARISON_RESOLUTION < REFERENCE_RESOLUTION:
        print("downscaling IDs")
        prev_count = len(halo_particle_ids)
        print(prev_count)
        downscaled_ids = set()
        scaler = IDScaler(COMPARISON_RESOLUTION, REFERENCE_RESOLUTION)
        for id in halo_particle_ids:
            downscaled_ids.add(scaler.downscale(id))
        halo_particle_ids = downscaled_ids
        print("done")
        after_count = len(halo_particle_ids)
        print(after_count)
        print(prev_count / after_count)

    particles = df_comp.loc[list(halo_particle_ids)]
    # print(particles)
    halos_in_particles = set(particles["FOFGroupIDs"])
    # 2147483647 (2**31 - 1) presumably marks particles that belong to no FOF group
    halos_in_particles.discard(2147483647)
    # print(halos_in_particles)

    if PLOT:
        fig: Figure = plt.figure()
        ax: Axes = fig.gca()
        ax.scatter(particles["X"], particles["Y"], s=1, alpha=.3, label="Halo")
        # ax.scatter(particles_in_ref_halo["X"], particles_in_ref_halo["Y"], s=1, alpha=.3, label="RefHalo")
        # plt.legend()
        # plt.show()

    best_halo = None
    best_halo_match = 0

    for halo in halos_in_particles:
        # print("----------", halo, "----------")
        print(halo)
        halo_data = df_comp_halo.loc[halo]
        particles_in_comp_halo: DataFrame = df_comp.loc[df_comp["FOFGroupIDs"] == halo]
        halo_size = len(particles_in_comp_halo)
        df = particles_in_comp_halo.join(particles, how="inner", rsuffix="ref")
        shared_size = len(df)
        match = shared_size / halo_size
        # print(match, halo_size, shared_size)
        # print(df)
        if PLOT:
            ax.scatter(df["X"], df["Y"], s=1, alpha=.3, label=f"shared {halo}")
            # ax.scatter(particles_in_comp_halo["X"], particles_in_comp_halo["Y"], s=2, alpha=.3, label=f"shared {halo}")
        if shared_size > best_halo_match:
            best_halo_match = shared_size
            best_halo = halo
    # print("-------")
    # print(best_halo)

    comp_halo = df_comp_halo.loc[best_halo]
    print(ref_halo)
    print(comp_halo)
    ref_sizes.append(ref_halo["Sizes"])
    ref_masses.append(ref_halo["Masses"])
    comp_sizes.append(comp_halo["Sizes"])
    comp_masses.append(comp_halo["Masses"])
    # exit()

    if PLOT:
        ax.legend()
        ax.set_title(f"{reference_dir.name} vs. {comparison_dir.name} (Halo {index})")
        fig.savefig("out.png", dpi=300)
        plt.show()
    if SINGLE:
        break

df = DataFrame(np.array([ref_sizes, comp_sizes, ref_masses, comp_masses]).T,
               columns=["ref_sizes", "comp_sizes", "ref_masses", "comp_masses"])
print(df)
df.to_csv("sizes.csv", index=False)

readfiles.py Normal file (+41)

@@ -0,0 +1,41 @@
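# Load the simulation snapshot (output_0004.hdf5) and the FOF group catalogue
# (fof_output_0004.hdf5) into pandas DataFrames; read_file() pickles the particle
# table to a "cache" file next to the snapshot so later runs skip the HDF5 parsing.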
from pathlib import Path

import h5py
import pandas as pd
from pandas import DataFrame


def read_file(path: Path) -> pd.DataFrame:
    cache_file = path / "cache"
    if not cache_file.exists():
        file = path / "output_0004.hdf5"
        reference_file = h5py.File(file)
        df = pd.DataFrame(reference_file["PartType1"]["Coordinates"], columns=["X", "Y", "Z"])
        df2 = pd.DataFrame(reference_file["PartType1"]["FOFGroupIDs"], columns=["FOFGroupIDs"]).astype("category")
        df = df.merge(df2, "outer", left_index=True, right_index=True)
        del df2
        df3 = pd.DataFrame(reference_file["PartType1"]["ParticleIDs"], columns=["ParticleIDs"])
        df = df.merge(df3, "outer", left_index=True, right_index=True)
        del df3
        df.set_index("ParticleIDs", inplace=True)
        print("saving cache")
        df.to_pickle(str(cache_file))
        return df
    print("from cache")
    df = pd.read_pickle(str(cache_file))
    return df


def read_halo_file(path: Path) -> DataFrame:
    file = path / "fof_output_0004.hdf5"
    reference_file = h5py.File(file)
    df1 = pd.DataFrame(reference_file["Groups"]["Centres"], columns=["X", "Y", "Z"])
    df2 = pd.DataFrame(reference_file["Groups"]["GroupIDs"], columns=["GroupIDs"])
    df3 = pd.DataFrame(reference_file["Groups"]["Masses"], columns=["Masses"])
    df4 = pd.DataFrame(reference_file["Groups"]["Sizes"], columns=["Sizes"])
    df = df1.merge(df2, "outer", left_index=True, right_index=True)
    df = df.merge(df3, "outer", left_index=True, right_index=True)
    df = df.merge(df4, "outer", left_index=True, right_index=True)
    df.set_index("GroupIDs", inplace=True)
    return df

remap_particle_IDs.py Normal file (+53)

@@ -0,0 +1,53 @@
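# Translate particle IDs between two resolutions, assuming an ID encodes the initial
# lattice position as id = (i * Nres + j) * Nres + k: upscale() yields the N^3 fine-grid
# IDs covering one coarse cell, and downscale() maps a fine ID back to its coarse cell.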
from itertools import product

import numpy as np


class IDScaler:
    def __init__(self, Nres_min: int, Nres_max: int):
        assert Nres_max % Nres_min == 0
        self.Nres_min = Nres_min
        self.Nres_max = Nres_max
        self.N = Nres_max // Nres_min
        self.shifts = []
        for shift in product(range(self.N), repeat=3):
            self.shifts.append(np.array(shift))

    @staticmethod
    def original_position(Nres: int, particle_ID: int):
        particle_k = particle_ID % Nres
        particle_j = ((particle_ID - particle_k) // Nres) % Nres
        particle_i = ((particle_ID - particle_k) // Nres - particle_j) // Nres
        return np.array([particle_i, particle_j, particle_k])

    def upscale(self, particle_ID: int):
        orig = self.original_position(self.Nres_min, particle_ID)
        mult = orig * self.N
        for shift in self.shifts:
            variant = mult + shift
            yield ((variant[0] * self.Nres_max) + variant[1]) * self.Nres_max + variant[2]

    def downscale(self, particle_ID: int):
        orig = self.original_position(self.Nres_max, particle_ID)
        mult = np.floor_divide(orig, self.N)
        return ((mult[0] * self.Nres_min) + mult[1]) * self.Nres_min + mult[2]


if __name__ == "__main__":
    test_particle = np.array([0, 0, 127])
    # maximum_test = np.array([127, 127, 127])  # this works, Nres - 1 is the maximum for (i, j, k)
    Nres_1 = 128
    Nres_2 = 256
    test_particle_id = ((test_particle[0] * Nres_1) + test_particle[1]) * Nres_1 + test_particle[2]
    print(test_particle_id)
    scaler = IDScaler(Nres_1, Nres_2)
    particle_ID_1_converted = scaler.upscale(test_particle_id)
    for id in particle_ID_1_converted:
        reverse = scaler.downscale(id)
        print(id, reverse)
        assert reverse == test_particle_id
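For illustration, a minimal round trip under the same ID convention (a sketch; the values follow from the formula above, not from the repository):

from remap_particle_IDs import IDScaler

scaler = IDScaler(128, 256)             # coarse 128^3 grid -> fine 256^3 grid
fine_ids = sorted(scaler.upscale(127))  # coarse ID 127 is lattice position (0, 0, 127)
# fine_ids == [254, 255, 510, 511, 65790, 65791, 66046, 66047]: the 2x2x2 block of fine cells
assert all(scaler.downscale(fine_id) == 127 for fine_id in fine_ids)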

sizes.py Normal file (+19)

@@ -0,0 +1,19 @@
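# Plot reference vs. comparison halo masses from sizes.csv (produced by main.py)
# on log-log axes, limited to the first 50 matched halos.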
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.axes import Axes
from matplotlib.figure import Figure

df = pd.read_csv("sizes.csv")
print(df)
df = df.iloc[:50]

fig: Figure = plt.figure()
ax: Axes = fig.gca()
# ax.scatter(df["ref_sizes"], df["comp_sizes"], s=1, alpha=.3)
ax.scatter(df["ref_masses"], df["comp_masses"], s=3)
ax.set_xscale("log")
ax.set_yscale("log")
plt.show()