from pathlib import Path
from typing import Dict, Tuple, Optional

import h5py
import numpy as np
import pandas as pd
from h5py import Dataset
from matplotlib import pyplot as plt
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from pandas import DataFrame

from halo_mass_profile import V
from paths import base_dir
from readfiles import read_file
from utils import print_progress

HaloParticleMapping = Dict[int, set[int]]


def all_children(df: DataFrame, id: int):
    """
    Recursively yield the IDs of all leaf (sub)halos below the given halo
    (including the halo itself if it has no subhalos).
    """
    subhalos: pd.DataFrame = df.loc[df.parent_halo_id == id]

    if len(subhalos) == 0:
        yield id

    for sh, _ in subhalos.iterrows():
        yield from all_children(df, sh)


def particles_in_halo(
offsets: Dict[int, int], particle_ids: np.ndarray
) -> HaloParticleMapping:
"""
get mapping from halo ID to particle ID set by using the offset and a lookup in the particle array
"""
halo_particle_ids: Dict[int, set[int]] = {}
pointer = 0
for halo_id, offset in offsets.items():
if halo_id % 100 == 0:
print_progress(halo_id, len(offsets))
if halo_id != len(offsets):
end = offsets[halo_id + 1]
        else:  # the last halo extends to the end of the particle array
            end = len(particle_ids)
ids = particle_ids[pointer:end]
halo_particle_ids[halo_id] = set(ids)
pointer = end
return halo_particle_ids
# if recursively:
#     # add particles of subhalos to parent halos
#     # maybe not such a good idea
#     for halo_id in range(1, len(df) + 1):
#         sub_n_children = list(all_children(df, halo_id))  # IDs of children, subchildren, ...
#         if not sub_n_children:
#             continue
#         for child_id in sub_n_children:
#             child_particles = halo_particle_ids[child_id]
#             halo_particle_ids[halo_id].update(child_particles)
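
# A minimal usage sketch for particles_in_halo; the offsets and particle IDs
# below are made up for illustration and not taken from a real catalog:
#
#     offsets = {1: 0, 2: 4, 3: 7}  # halo ID -> index of its first particle
#     particle_ids = np.array([9, 3, 5, 1, 8, 2, 7, 4, 6])
#     mapping = particles_in_halo(offsets, particle_ids)
#     assert mapping[1] == {9, 3, 5, 1}
#     assert mapping[2] == {8, 2, 7}
#     assert mapping[3] == {4, 6}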
def cached_particles_in_halo(file: Path, *args, **kwargs) -> HaloParticleMapping:
"""
Save mapping from halo ID to set of particle IDs into HDF5 file.
Every halo is a dataset with its ID as the name and the list of particles as value.
Unfortunatly this is magnitudes slower than doing the calculation itself as HDF5 is not
intended for 100K small datasets making this whole function pointless.
"""
if file.exists():
print("loading from cache")
with h5py.File(file) as data_file:
            halo_particle_ids: HaloParticleMapping = {}
            for i, (name, dataset) in enumerate(data_file.items()):
                halo_particle_ids[int(name)] = set(dataset)
                print_progress(i, len(data_file))
else:
halo_particle_ids = particles_in_halo(*args, **kwargs)
print("saving to cache")
with h5py.File(file, "w") as data_file:
for key, valueset in halo_particle_ids.items():
data_file.create_dataset(
str(key),
data=list(valueset),
compression="gzip",
compression_opts=5,
)
return halo_particle_ids
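
# Usage sketch for the cache wrapper: all extra arguments are forwarded to
# particles_in_halo, and the cache file name is the one suggested in the
# commented-out code in read_velo_halo_particles below:
#
#     mapping = cached_particles_in_halo(
#         directory / "vrbound.cache.hdf5", halo_offsets, particle_ids
#     )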


def read_velo_halos(directory: Path, veloname="vroutput"):
    """
    Return a dataframe containing all scalar properties of the halos
    (see https://velociraptor-stf.readthedocs.io/en/latest/output.html).
    """
    # split output (e.g. from an MPI run) comes in numbered files; use the first one
    if (directory / f"{veloname}.catalog_groups.0").exists():
        suffix = ".0"
else:
suffix = ""
group_catalog = h5py.File(directory / f"{veloname}.catalog_groups{suffix}")
    group_properties = h5py.File(directory / f"{veloname}.properties{suffix}")

    scalar_properties = {}
    for k, v in group_properties.items():
        if not isinstance(v, Dataset):
            # skip groups
            continue
        if len(v.shape) != 1:
            # skip multi-dimensional datasets (but print them for reference)
            print(v)
            continue
if len(v) == 1:
# skip global properties like Total_num_of_groups
continue
scalar_properties[k] = v
scalar_properties["X"] = scalar_properties["Xc"]
scalar_properties["Y"] = scalar_properties["Yc"]
scalar_properties["Z"] = scalar_properties["Zc"]
data = {
"group_size": group_catalog["Group_Size"],
"offset": group_catalog["Offset"],
"offset_unbound": group_catalog["Offset_unbound"],
"parent_halo_id": group_catalog["Parent_halo_ID"],
}
df = pd.DataFrame(
{**data, **scalar_properties}
) # create dataframe from two merged dicts
df.index += 1 # Halo IDs start at 1
return df
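
# Usage sketch for read_velo_halos; the directory follows the naming used in
# main() below, and Mvir/Rvir are standard VELOCIraptor property columns:
#
#     halos = read_velo_halos(base_dir / "shannon_512_100")
#     print(halos.loc[1][["Mvir", "Rvir", "X", "Y", "Z"]])  # halo IDs start at 1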


def read_velo_halo_particles(
    directory: Path, skip_halo_particle_ids=False, skip_unbound=True
) -> Tuple[DataFrame, Optional[HaloParticleMapping], Optional[HaloParticleMapping]]:
"""
This reads the output files of VELOCIraptor
and returns the halo data from read_velo_halos
and two dictionaries mapping the halo IDs to sets of particle IDs
"""
2022-07-08 13:23:56 +02:00
    if (directory / "vroutput.catalog_particles.0").exists():
suffix = ".0"
else:
suffix = ""
2022-06-01 11:31:42 +02:00
df = read_velo_halos(directory)
2022-07-08 13:23:56 +02:00
particle_catalog = h5py.File(directory / f"vroutput.catalog_particles{suffix}")
    particle_ids = np.asarray(particle_catalog["Particle_IDs"])

    particle_catalog_unbound = h5py.File(
        directory / f"vroutput.catalog_particles.unbound{suffix}"
)
    particle_ids_unbound = particle_catalog_unbound["Particle_IDs"][:]

    if skip_halo_particle_ids:
return df, None, None
print("look up bound particle IDs")
# particle_cache_file = directory / "vrbound.cache.hdf5"
# ub_particle_cache_file = directory / "vrunbound.cache.hdf5"
halo_offsets = dict(df["offset"])
    halo_particle_ids = particles_in_halo(halo_offsets, particle_ids)

    if skip_unbound:
halo_particle_unbound_ids = {}
else:
print("look up unbound particle IDs")
halo_unbound_offsets = dict(df["offset_unbound"])
halo_particle_unbound_ids = particles_in_halo(
halo_unbound_offsets, particle_ids_unbound
)
return df, halo_particle_ids, halo_particle_unbound_ids
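
# Usage sketch (directory as in main() below); with skip_unbound=False both
# particle mappings are filled in:
#
#     df, bound, unbound = read_velo_halo_particles(
#         base_dir / "shannon_512_100", skip_unbound=False
#     )
#     print(len(bound[1]))  # number of bound particles in halo 1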


def read_velo_profiles(directory: Path):
    """
    Read the VELOCIraptor radial profiles, returning the bin edges together
    with the mass and density profiles of all halos.
    """
    if (directory / "vroutput.profiles.0").exists():
        suffix = ".0"
    else:
        suffix = ""
    profiles_file = h5py.File(directory / f"vroutput.profiles{suffix}")
    bin_edges = profiles_file["Radial_bin_edges"][1:]  # skip the innermost bin edge
bin_volumes = V(bin_edges)
mass_profiles = profiles_file["Mass_profile"][:]
density_profiles = mass_profiles / bin_volumes
return bin_edges, mass_profiles, density_profiles
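
# Usage sketch, assuming Mass_profile holds one row per halo so that a single
# halo's density profile is one row of the result:
#
#     bin_edges, mass_profiles, density_profiles = read_velo_profiles(directory)
#     plt.loglog(bin_edges, density_profiles[0])
#     plt.show()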


def main():
waveform = "shannon"
Nres = 512
    directory = base_dir / f"{waveform}_{Nres}_100"

    # skip_unbound=False is needed here, as the unbound mapping is used below
    df_halo, halo_particle_ids, halo_particle_unbound_ids = read_velo_halo_particles(
        directory, skip_unbound=False
    )
particles, meta = read_file(directory)
HALO = 1000
while True:
fig: Figure = plt.figure()
        ax: Axes = fig.gca()

        bound_particles = particles.loc[list(halo_particle_ids[HALO])]
unbound_particles = particles.loc[list(halo_particle_unbound_ids[HALO])]
        ax.scatter(bound_particles.X, bound_particles.Y, label="bound", s=1)
        ax.scatter(unbound_particles.X, unbound_particles.Y, label="unbound", s=1)
        ax.legend()
        plt.show()
        HALO += 1


if __name__ == "__main__":
main()