Merged ohahn/monofonic into master

2024-09-18 15:53:45 +02:00 · 2024-03-12 16:01:45 -05:00 · 2024-03-12 16:01:45 -05:00 · 5dc3d39604
commit 5dc3d39604
parent 091c9dee75 c56136a8c3
33 changed files with 3065 additions and 160 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -183,19 +183,42 @@ list (APPEND SOURCES
 endif()
 endif()

+########################################################################################################################
+# DOXYGEN
+# This will generate Doxygen documentation when you run make doc_doxygen in your build directory.
+option(BUILD_DOCUMENTATION "Build doxygen documentation" OFF)
+if(BUILD_DOCUMENTATION)
+    find_package(Doxygen)
+    if(DOXYGEN_FOUND)
+        set(DOXYGEN_IN ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in)
+        set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile)
+
+        configure_file(${DOXYGEN_IN} ${DOXYGEN_OUT} @ONLY)
+
+        add_custom_target( doc_doxygen ALL
+            COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+            COMMENT "Generating API documentation with Doxygen"
+            VERBATIM )
+    endif(DOXYGEN_FOUND)
+endif(BUILD_DOCUMENTATION)
+
+########################################################################################################################
 # project configuration header
 configure_file(
  ${PROJECT_SOURCE_DIR}/include/cmake_config.hh.in
  ${PROJECT_SOURCE_DIR}/include/cmake_config.hh
 )

+########################################################################################################################
+# executable and linking
 add_executable(${PRGNAME} ${SOURCES} ${PLUGINS})
 # target_setup_class(${PRGNAME})

 set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 14)

-
-# mpi flags
+########################################################################################################################
+# mpi flags and precision set up
 if(MPI_CXX_FOUND)
  if(CODE_PRECISION STREQUAL "FLOAT")
    if(FFTW3_SINGLE_MPI_FOUND)
--- a/Doxyfile.in
+++ b/Doxyfile.in
--- a/example.conf
+++ b/example.conf
@ -20,7 +20,11 @@ DoFixing        = no      # do mode fixing à la Angulo&Pontzen (https://arxiv.o
 DoInversion     = no       # invert phases (for paired simulations)

 ParticleLoad    = sc       # particle load, can be 'sc' (1x), 'bcc' (2x) or 'fcc' (4x) 
-                           # (increases number of particles by given factor!), or 'glass'
+                           # (increases number of particles by given factor!), 
+                           # or 'glass' or 'masked'
+
+## if `ParticleLoad = masked' then you can specify here how masking should take place
+# ParticleMaskType = 3     # bit mask for particle mask (0=center,1=center+edges,2=center+faces,3=center+edges+faces)

 ## if `ParticleLoad = glass' then specify here where to load the glass distribution from
 # GlassFileName   = glass128.hdf5
--- a/external/class.cmake
+++ b/external/class.cmake
@ -3,7 +3,7 @@ include(FetchContent)
 FetchContent_Declare(
    class
    GIT_REPOSITORY https://github.com/ohahn/class_public.git
-    GIT_TAG master
+    GIT_TAG monofonic_v1
    GIT_SHALLOW YES
    GIT_PROGRESS TRUE
    USES_TERMINAL_DOWNLOAD TRUE   # <---- this is needed only for Ninja
--- a/external/panphasia_ho/uniform_rand_threefry4x64.c
+++ b/external/panphasia_ho/uniform_rand_threefry4x64.c
@ -100,8 +100,8 @@ void threefry4x64_test_(int verbose)
    if ((rand.v[0] != result.v[0]) || (rand.v[1] != result.v[1]) || (rand.v[2] != result.v[2]) || (rand.v[3] != result.v[3]))
    {
      printf("Serious error occured !!!!!!!!!!  Random generator is not working correctly \n");
-      printf("Random generated: %lu %lu %lu %lu\n", rand.v[0], rand.v[1], rand.v[2], rand.v[3]);
-      printf("Random expected:  %lu %lu %lu %lu\n", result.v[0], result.v[1], result.v[2], result.v[3]);
+      printf("Random generated: %llu %llu %llu %llu\n", rand.v[0], rand.v[1], rand.v[2], rand.v[3]);
+      printf("Random expected:  %llu %llu %llu %llu\n", result.v[0], result.v[1], result.v[2], result.v[3]);
      //abort();
    }
    else
@ -121,8 +121,8 @@ void threefry4x64_test_(int verbose)
    if ((rand.v[0] != result.v[0]) || (rand.v[1] != result.v[1]) || (rand.v[2] != result.v[2]) || (rand.v[3] != result.v[3]))
    {
      printf("Serious error occured !!!!!!!!!!  Random generator is not working correctly \n");
-      printf("Random generated: %lu %lu %lu %lu\n", rand.v[0], rand.v[1], rand.v[2], rand.v[3]);
-      printf("Random expected:  %lu %lu %lu %lu\n", result.v[0], result.v[1], result.v[2], result.v[3]);
+      printf("Random generated: %llu %llu %llu %llu\n", rand.v[0], rand.v[1], rand.v[2], rand.v[3]);
+      printf("Random expected:  %llu %llu %llu %llu\n", result.v[0], result.v[1], result.v[2], result.v[3]);
      //abort();
    }
    else
@ -143,8 +143,8 @@ void threefry4x64_test_(int verbose)
    if ((rand.v[0] != result.v[0]) || (rand.v[1] != result.v[1]) || (rand.v[2] != result.v[2]) || (rand.v[3] != result.v[3]))
    {
      printf("Serious error occured !!!!!!!!!!  Random generator is not working correctly \n");
-      printf("Random generated: %lu %lu %lu %lu\n", rand.v[0], rand.v[1], rand.v[2], rand.v[3]);
-      printf("Random expected:  %lu %lu %lu %lu\n", result.v[0], result.v[1], result.v[2], result.v[3]);
+      printf("Random generated: %llu %llu %llu %llu\n", rand.v[0], rand.v[1], rand.v[2], rand.v[3]);
+      printf("Random expected:  %llu %llu %llu %llu\n", result.v[0], result.v[1], result.v[2], result.v[3]);
      //abort();
    }
    else
@ -176,7 +176,7 @@ void set_panphasia_key_(int verbose)
  verbose = 0; //ARJ

  if (verbose)
-    printf("Setting the threefry4x64 key to\n(%0lu %0lu %0lu %0lu)\n\n",
+    printf("Setting the threefry4x64 key to\n(%0llu %0llu %0llu %0llu)\n\n",
           panphasia_key.v[0], panphasia_key.v[1], panphasia_key.v[2], panphasia_key.v[3]);
  panphasia_key_initialised = 999;

@ -237,10 +237,10 @@ void check_panphasia_key_(int verbose)
  if (panphasia_check_key.v[0] != panphasia_key.v[0] || panphasia_check_key.v[1] != panphasia_key.v[1] || panphasia_check_key.v[2] != panphasia_key.v[2] || panphasia_check_key.v[2] != panphasia_key.v[2])
  {
    printf("A serious error has happened - the threefry4x64 key has become corrupted!\n");
-    printf("Should be:  (%0lu %0lu %0lu %0lu)\n", panphasia_check_key.v[0],
+    printf("Should be:  (%0llu %0llu %0llu %0llu)\n", panphasia_check_key.v[0],
           panphasia_check_key.v[1], panphasia_check_key.v[2], panphasia_check_key.v[3]);

-    printf("But now is: (%0lu %0lu %0lu %0lu)\n", panphasia_key.v[0],
+    printf("But now is: (%0llu %0llu %0llu %0llu)\n", panphasia_key.v[0],
           panphasia_key.v[1], panphasia_key.v[2], panphasia_key.v[3]);
    printf("The fact that it has changed suggests the key has been overwritten in memory.\n");
    abort();
--- a/include/HDF_IO.hh
+++ b/include/HDF_IO.hh
@ -226,7 +226,7 @@ template<typename T >
 inline void HDFReadSelect( const std::string Filename, const std::string ObjName, const std::vector<unsigned>& ii, std::vector<T> &Data ){

  hid_t HDF_Type, HDF_FileID, HDF_DatasetID, HDF_DataspaceID, HDF_MemspaceID;
-  hsize_t HDF_StorageSize;
+//  hsize_t HDF_StorageSize;

  HDF_Type = GetDataType<T>();

--- a/include/convolution.hh
+++ b/include/convolution.hh
@ -22,6 +22,9 @@
 #include <general.hh>
 #include <grid_fft.hh>

+/// @brief base class for convolutions of two or three fields
+/// @tparam data_t 
+/// @tparam derived_t 
 template <typename data_t, typename derived_t>
 class BaseConvolver
 {
@ -30,23 +33,44 @@ protected:
    std::array<real_t, 3> length_;

 public:
+
+    /// @brief Construct a new Base Convolver object
+    /// @param N linear grid size
+    /// @param L physical box size
    BaseConvolver(const std::array<size_t, 3> &N, const std::array<real_t, 3> &L)
        : np_(N), length_(L) {}

+
+    /// @brief Construct a new Base Convolver object [deleted copy constructor]  
    BaseConvolver( const BaseConvolver& ) = delete;
    
+    /// @brief destructor (virtual)
    virtual ~BaseConvolver() {}

-    // implements convolution of two Fourier-space fields
+    /// @brief implements convolution of two Fourier-space fields
+    /// @tparam kfunc1 field 1
+    /// @tparam kfunc2 field 2
+    /// @tparam opp output operator
    template <typename kfunc1, typename kfunc2, typename opp>
    void convolve2(kfunc1 kf1, kfunc2 kf2, opp op) {}

-    // implements convolution of three Fourier-space fields
+    /// @brief implements convolution of three Fourier-space fields
+    /// @tparam kfunc1 field 1
+    /// @tparam kfunc2 field 2
+    /// @tparam kfunc3 field 3
+    /// @tparam opp output operator
    template <typename kfunc1, typename kfunc2, typename kfunc3, typename opp>
    void convolve3(kfunc1 kf1, kfunc2 kf2, kfunc3 kf3, opp op) {}

 public:

+    /// @brief convolve two gradient fields in Fourier space a_{,i} * b_{,j}
+    /// @tparam opp output operator type
+    /// @param inl left input field a
+    /// @param d1l direction of first gradient (,i)
+    /// @param inr right input field b
+    /// @param d1r direction of second gradient (,j)
+    /// @param output_op output operator
    template <typename opp>
    void convolve_Gradients(Grid_FFT<data_t> &inl, const std::array<int, 1> &d1l,
                            Grid_FFT<data_t> &inr, const std::array<int, 1> &d1r,
@ -55,19 +79,29 @@ public:
        // transform to FS in case fields are not
        inl.FourierTransformForward();
        inr.FourierTransformForward();
-        // perform convolution of Hessians
+        // perform convolution of two gradients
        static_cast<derived_t &>(*this).convolve2(
+            // first gradient
            [&inl,&d1l](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inl.gradient(d1l[0],{i,j,k});
                return grad1*inl.kelem(i, j, k);
            },
+            // second gradient
            [&inr,&d1r](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inr.gradient(d1r[0],{i,j,k});
                return grad1*inr.kelem(i, j, k);
            },
+            // -> output operator
            output_op);
    }

+    /// @brief convolve a gradient and a Hessian field in Fourier space a_{,i} * b_{,jk}
+    /// @tparam opp output operator type
+    /// @param inl left input field a
+    /// @param d1l direction of gradient (,i)
+    /// @param inr right input field b
+    /// @param d2r directions of Hessian (,jk)
+    /// @param output_op output operator
    template <typename opp>
    void convolve_Gradient_and_Hessian(Grid_FFT<data_t> &inl, const std::array<int, 1> &d1l,
                                       Grid_FFT<data_t> &inr, const std::array<int, 2> &d2r,
@ -76,19 +110,29 @@ public:
        // transform to FS in case fields are not
        inl.FourierTransformForward();
        inr.FourierTransformForward();
-        // perform convolution of Hessians
+        // perform convolution of gradient and Hessian
        static_cast<derived_t &>(*this).convolve2(
+            // gradient
            [&](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto kk = inl.template get_k<real_t>(i, j, k);
                return ccomplex_t(0.0, -kk[d1l[0]]) * inl.kelem(i, j, k);
            },
+            // Hessian
            [&](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto kk = inr.template get_k<real_t>(i, j, k);
                return -kk[d2r[0]] * kk[d2r[1]] * inr.kelem(i, j, k);
            },
+            // -> output operator
            output_op);
    }

+    /// @brief convolve two Hessian fields in Fourier space a_{,ij} * b_{,kl}
+    /// @tparam opp output operator type
+    /// @param inl left input field a
+    /// @param d2l directions of first Hessian (,ij)
+    /// @param inr right input field b
+    /// @param d2r directions of second Hessian (,kl)
+    /// @param output_op output operator
    template <typename opp>
    void convolve_Hessians(Grid_FFT<data_t> &inl, const std::array<int, 2> &d2l,
                           Grid_FFT<data_t> &inr, const std::array<int, 2> &d2r,
@ -99,19 +143,31 @@ public:
        inr.FourierTransformForward();
        // perform convolution of Hessians
        static_cast<derived_t &>(*this).convolve2(
+            // first Hessian
            [&inl,&d2l](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inl.gradient(d2l[0],{i,j,k});
                auto grad2 = inl.gradient(d2l[1],{i,j,k});
                return grad1*grad2*inl.kelem(i, j, k);
            },
+            // second Hessian
            [&inr,&d2r](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inr.gradient(d2r[0],{i,j,k});
                auto grad2 = inr.gradient(d2r[1],{i,j,k});
                return grad1*grad2*inr.kelem(i, j, k);
            },
+            // -> output operator
            output_op);
    }

+    /// @brief convolve three Hessian fields in Fourier space a_{,ij} * b_{,kl} * c_{,mn}
+    /// @tparam opp output operator
+    /// @param inl first input field a
+    /// @param d2l directions of first Hessian (,ij)
+    /// @param inm second input field b
+    /// @param d2m directions of second Hessian (,kl)
+    /// @param inr third input field c
+    /// @param d2r directions of third Hessian (,mn)
+    /// @param output_op output operator
    template <typename opp>
    void convolve_Hessians(Grid_FFT<data_t> &inl, const std::array<int, 2> &d2l,
                           Grid_FFT<data_t> &inm, const std::array<int, 2> &d2m,
@ -124,24 +180,36 @@ public:
        inr.FourierTransformForward();
        // perform convolution of Hessians
        static_cast<derived_t &>(*this).convolve3(
+            // first Hessian
            [&inl, &d2l](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inl.gradient(d2l[0],{i,j,k});
                auto grad2 = inl.gradient(d2l[1],{i,j,k});
                return grad1*grad2*inl.kelem(i, j, k);
            },
+            // second Hessian
            [&inm, &d2m](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inm.gradient(d2m[0],{i,j,k});
                auto grad2 = inm.gradient(d2m[1],{i,j,k});
                return grad1*grad2*inm.kelem(i, j, k);
            },
+            // third Hessian
            [&inr, &d2r](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inr.gradient(d2r[0],{i,j,k});
                auto grad2 = inr.gradient(d2r[1],{i,j,k});
                return grad1*grad2*inr.kelem(i, j, k);
            },
+            // -> output operator
            output_op);
    }

+    /// @brief convolve Hessian field with sum of two Hessian fields in Fourier space a_{,ij} * (b_{,kl} + c_{,mn})
+    /// @tparam opp output operator type
+    /// @param inl left input field a
+    /// @param d2l directions of first Hessian (,ij)
+    /// @param inr right input field b
+    /// @param d2r1 directions of second Hessian (,kl)
+    /// @param d2r2 directions of third Hessian (,mn)
+    /// @param output_op output operator
    template <typename opp>
    void convolve_SumOfHessians(Grid_FFT<data_t> &inl, const std::array<int, 2> &d2l,
                                Grid_FFT<data_t> &inr, const std::array<int, 2> &d2r1, const std::array<int, 2> &d2r2,
@ -152,11 +220,13 @@ public:
        inr.FourierTransformForward();
        // perform convolution of Hessians
        static_cast<derived_t &>(*this).convolve2(
+            // first Hessian
            [&inl, &d2l](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inl.gradient(d2l[0],{i,j,k});
                auto grad2 = inl.gradient(d2l[1],{i,j,k});
                return grad1*grad2*inl.kelem(i, j, k);
            },
+            // second two Hessian and sum
            [&inr, &d2r1, &d2r2](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad11 = inr.gradient(d2r1[0],{i,j,k});
                auto grad12 = inr.gradient(d2r1[1],{i,j,k});
@ -164,9 +234,18 @@ public:
                auto grad22 = inr.gradient(d2r2[1],{i,j,k});
                return (grad11*grad12+grad21*grad22)*inr.kelem(i, j, k);
            },
+            // -> output operator
            output_op);
    }

+    /// @brief convolve Hessian field with difference of two Hessian fields in Fourier space a_{,ij} * (b_{,kl} - c_{,mn})
+    /// @tparam opp output operator type
+    /// @param inl left input field a
+    /// @param d2l directions of first Hessian (,ij)
+    /// @param inr right input field b
+    /// @param d2r1 directions of second Hessian (,kl)
+    /// @param d2r2 directions of third Hessian (,mn)
+    /// @param output_op output operator
    template <typename opp>
    void convolve_DifferenceOfHessians(Grid_FFT<data_t> &inl, const std::array<int, 2> &d2l,
                                       Grid_FFT<data_t> &inr, const std::array<int, 2> &d2r1, const std::array<int, 2> &d2r2,
@ -177,11 +256,13 @@ public:
        inr.FourierTransformForward();
        // perform convolution of Hessians
        static_cast<derived_t &>(*this).convolve2(
+            // first Hessian
            [&inl, &d2l](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad1 = inl.gradient(d2l[0],{i,j,k});
                auto grad2 = inl.gradient(d2l[1],{i,j,k});
                return grad1*grad2*inl.kelem(i, j, k);
            },
+            // second two Hessian and difference
            [&inr, &d2r1, &d2r2](size_t i, size_t j, size_t k) -> ccomplex_t {
                auto grad11 = inr.gradient(d2r1[0],{i,j,k});
                auto grad12 = inr.gradient(d2r1[1],{i,j,k});
@ -189,21 +270,29 @@ public:
                auto grad22 = inr.gradient(d2r2[1],{i,j,k});
                return (grad11*grad12-grad21*grad22)*inr.kelem(i, j, k);
            },
+            // -> output operator
            output_op);
    }
 };

-//! naive convolution class, disrespecting aliasing
+//! low-level implementation of convolutions -- naive convolution class, ignoring aliasing (no padding)
 template <typename data_t>
 class NaiveConvolver : public BaseConvolver<data_t, NaiveConvolver<data_t>>
 {
 protected:
+    /// @brief buffer for Fourier transformed fields
    Grid_FFT<data_t> *fbuf1_, *fbuf2_;

+    /// @brief number of points in each direction
    using BaseConvolver<data_t, NaiveConvolver<data_t>>::np_;
+
+    /// @brief length of each direction
    using BaseConvolver<data_t, NaiveConvolver<data_t>>::length_;

 public:
+    /// @brief constructor
+    /// @param N number of points in each direction
+    /// @param L length of each direction
    NaiveConvolver(const std::array<size_t, 3> &N, const std::array<real_t, 3> &L)
        : BaseConvolver<data_t, NaiveConvolver<data_t>>(N, L)
    {
@ -211,12 +300,14 @@ public:
        fbuf2_ = new Grid_FFT<data_t>(N, length_, true, kspace_id);
    }

+    /// @brief destructor
    ~NaiveConvolver()
    {
        delete fbuf1_;
        delete fbuf2_;
    }

+    /// @brief convolution of two fields
    template <typename kfunc1, typename kfunc2, typename opp>
    void convolve2(kfunc1 kf1, kfunc2 kf2, opp output_op)
    {
@ -249,6 +340,7 @@ public:

    }

+    /// @brief convolution of three fields
    template <typename kfunc1, typename kfunc2, typename kfunc3, typename opp>
    void convolve3(kfunc1 kf1, kfunc2 kf2, kfunc3 kf3, opp output_op)
    {
@ -292,7 +384,13 @@ public:
        }
    }

+//--------------------------------------------------------------------------------------------------------
 private:
+
+    /// @brief copy data into a grid
+    /// @tparam kfunc abstract function type generating data
+    /// @param kf abstract function generating data
+    /// @param g grid to copy data into
    template <typename kfunc>
    void copy_in(kfunc kf, Grid_FFT<data_t> &g)
    {
@ -310,23 +408,28 @@ private:
    }
 };

-//! convolution class, respecting Orszag's 3/2 rule
+//! convolution class, respecting Orszag's 3/2 rule (padding in Fourier space to avoid aliasing)
 template <typename data_t>
 class OrszagConvolver : public BaseConvolver<data_t, OrszagConvolver<data_t>>
 {
 private:
-    Grid_FFT<data_t> *f1p_, *f2p_;
-    Grid_FFT<data_t> *fbuf_;
+    /// @brief buffer for Fourier transformed fields
+    Grid_FFT<data_t> *f1p_, *f2p_, *fbuf_;

    using BaseConvolver<data_t, OrszagConvolver<data_t>>::np_;
    using BaseConvolver<data_t, OrszagConvolver<data_t>>::length_;

-    ccomplex_t *crecvbuf_;
-    real_t *recvbuf_;
-    size_t maxslicesz_;
-    std::vector<ptrdiff_t> offsets_, offsetsp_;
-    std::vector<size_t> sizes_, sizesp_;
+    ccomplex_t *crecvbuf_; //!< receive buffer for MPI (complex)
+    real_t *recvbuf_;     //!< receive buffer for MPI (real)
+    size_t maxslicesz_;  //!< maximum size of a slice
+    std::vector<ptrdiff_t> offsets_, offsetsp_; //!< offsets for MPI
+    std::vector<size_t> sizes_, sizesp_;       //!< sizes for MPI

+    /// @brief get task index for a given index
+    /// @param index index
+    /// @param offsets offsets
+    /// @param sizes sizes
+    /// @param ntasks number of tasks
    int get_task(ptrdiff_t index, const std::vector<ptrdiff_t> &offsets, const std::vector<size_t> &sizes, const int ntasks)
    {
        int itask = 0;
@ -336,6 +439,10 @@ private:
    }

 public:
+
+    /// @brief constructor
+    /// @param N grid size
+    /// @param L grid length
    OrszagConvolver(const std::array<size_t, 3> &N, const std::array<real_t, 3> &L)
        : BaseConvolver<data_t, OrszagConvolver<data_t>>({3 * N[0] / 2, 3 * N[1] / 2, 3 * N[2] / 2}, L)
    {
@ -370,6 +477,7 @@ public:
 #endif
    }

+    /// @brief destructor
    ~OrszagConvolver()
    {
        delete f1p_;
@ -380,6 +488,10 @@ public:
 #endif
    }

+    /// @brief convolve two fields
+    /// @tparam kfunc1 abstract function type generating data for the first field
+    /// @tparam kfunc2 abstract function type generating data for the second field
+    /// @tparam opp abstract function type for the output operation
    template <typename kfunc1, typename kfunc2, typename opp>
    void convolve2(kfunc1 kf1, kfunc2 kf2, opp output_op)
    {
@ -405,6 +517,11 @@ public:
        unpad(*f2p_, output_op);
    }

+    /// @brief convolve three fields
+    /// @tparam kfunc1 abstract function type generating data for the first field
+    /// @tparam kfunc2 abstract function type generating data for the second field
+    /// @tparam kfunc3 abstract function type generating data for the third field
+    /// @tparam opp abstract function type for the output operation
    template <typename kfunc1, typename kfunc2, typename kfunc3, typename opp>
    void convolve3(kfunc1 kf1, kfunc2 kf2, kfunc3 kf3, opp output_op)
    {
@ -428,6 +545,10 @@ public:

 private:

+    /// @brief unpad the result of a convolution and copy it to a grid
+    /// @tparam kdep_functor abstract function type generating data for the result
+    /// @param kfunc abstract function generating data for the result
+    /// @param fp grid to copy the result to
    template <typename kdep_functor>
    void pad_insert( kdep_functor kfunc, Grid_FFT<data_t> &fp)
    {
@ -472,6 +593,10 @@ private:
 #endif //defined(USE_MPI)
    }

+    /// @brief unpad the result of a convolution and write it to an output operator
+    /// @tparam operator_t abstract function type for the output operation
+    /// @param fp grid to copy the result from
+    /// @param output_op abstract function to write the result to
    template <typename operator_t>
    void unpad( Grid_FFT<data_t> &fp, operator_t output_op)
    {
--- a/include/cosmology_calculator.hh
+++ b/include/cosmology_calculator.hh
@ -152,8 +152,10 @@ private:

 public:
    
+    //! default constructor [deleted]
    calculator() = delete;
    
+    //! copy constructor [deleted]
    calculator(const calculator& c) = delete;

    //! constructor for a cosmology calculator object
@ -179,7 +181,7 @@ public:
        music::ilog << "Linear growth factors: D+_target = " << Dplus_target_ << ", D+_start = " << Dplus_start_ << std::endl;

        // set up transfer functions and compute normalisation
-        transfer_function_ = std::move(select_TransferFunction_plugin(cf, cosmo_param_));
+        transfer_function_ = select_TransferFunction_plugin(cf, cosmo_param_);
        transfer_function_->intialise();
        if( !transfer_function_->tf_isnormalised_ ){
            cosmo_param_.set("pnorm", this->compute_pnorm_from_sigma8() );
@ -192,16 +194,24 @@ public:

        music::ilog << std::setw(32) << std::left << "TF supports distinct CDM+baryons"
                    << " : " << (transfer_function_->tf_is_distinct() ? "yes" : "no") << std::endl;
+        music::ilog << std::setw(32) << std::left << "TF minimum wave number"
+                    << " : " << transfer_function_->get_kmin() << " h/Mpc" << std::endl;
        music::ilog << std::setw(32) << std::left << "TF maximum wave number"
                    << " : " << transfer_function_->get_kmax() << " h/Mpc" << std::endl;
+        if( std::sqrt(3.0)* 2.0*M_PI / cf.get_value<double>("setup","BoxLength") * cf.get_value<double>("setup","GridRes")/2  >= transfer_function_->get_kmax() ){
+            music::elog << "Simulation nyquist mode kny = " << std::sqrt(3.0)* 2.0*M_PI / cf.get_value<double>("setup","BoxLength") * cf.get_value<double>("setup","GridRes")/2 << " h/Mpc is beyond valid range of transfer function!" << std::endl;
+        }

        m_n_s_ = cosmo_param_["n_s"];
        m_sqrtpnorm_ = cosmo_param_["sqrtpnorm"];
    }

+    //! destructor
    ~calculator() { }

-    //! Write out a correctly scaled power spectrum at time a
+    /// @brief Write out a correctly scaled power spectrum at time a
+    /// @param a scale factor
+    /// @param fname file name
    void write_powerspectrum(real_t a, std::string fname) const
    {
        // const real_t Dplus0 = this->get_growth_factor(a);
@ -248,7 +258,8 @@ public:
        music::ilog << "Wrote power spectrum at a=" << a << " to file \'" << fname << "\'" << std::endl;
    }

-    //! Write out a correctly scaled power spectrum at starting time
+    /// @brief Write out a correctly scaled transfer function at time a
+    /// @param[in] fname filename to write to
    void write_transfer( std::string fname ) const
    {
        // const real_t Dplus0 = this->get_growth_factor(a);
@ -294,12 +305,16 @@ public:
        music::ilog << "Wrote input transfer functions at a=" << astart_ << " to file \'" << fname << "\'" << std::endl;
    }

+    /// @brief return the cosmological parameter object
+    /// @return cosmological parameter object
    const cosmology::parameters &get_parameters(void) const noexcept
    {
        return cosmo_param_;
    }

-    //! return the value of the Hubble function H(a) = dloga/dt 
+    /// @brief return the value of the Hubble function H(a) = dloga/dt 
+    /// @param[in] a scale factor
+    /// @return H(a)
    inline double H_of_a(double a) const noexcept
    {
        double HH2 = 0.0;
@ -310,13 +325,17 @@ public:
        return cosmo_param_["H0"] * std::sqrt(HH2);
    }

-    //! Computes the linear theory growth factor D+, normalised to D+(a=1)=1
+    /// @brief Computes the linear theory growth factor D+, normalised to D+(a=1)=1
+    /// @param[in] a scale factor
+    /// @return D+(a)
    real_t get_growth_factor(real_t a) const noexcept
    {
        return D_of_a_(a) / Dnow_;
    }

-    //! Computes the inverse of get_growth_factor
+    /// @brief Computes the inverse of get_growth_factor, i.e. a(D+)
+    /// @param[in] Dplus growth factor
+    /// @return a(D+)
    real_t get_a( real_t Dplus ) const noexcept
    {
        return a_of_D_( Dplus * Dnow_ );
@ -449,4 +468,4 @@ public:
 //     return pow(6.0 * mass / M_PI * std::sqrt(rho) * std::pow(G, 1.5), 1.0 / 3.0);
 // }

-} // namespace cosmology
+} // namespace cosmology
--- a/include/cosmology_parameters.hh
+++ b/include/cosmology_parameters.hh
@ -29,6 +29,7 @@ namespace cosmology
    class parameters
    {
    public:
+        //! type for default parameter maps
        using defaultmmap_t = std::map<std::string,std::map<std::string,real_t>>;

    private:
--- a/include/general.hh
+++ b/include/general.hh
@ -20,6 +20,7 @@

 #include <complex>
 #include <map>
+#include <memory>

 #if defined(USE_MPI)
 #include <mpi.h>
@ -184,6 +185,8 @@ inline void multitask_sync_barrier(void)
 #endif
 }

+extern size_t global_mem_high_mark, local_mem_high_mark;
+
 namespace CONFIG
 {
 extern int MPI_thread_support;
--- a/include/grid_fft.hh
+++ b/include/grid_fft.hh
@ -25,26 +25,27 @@
 #include <bounding_box.hh>
 #include <typeinfo>

-enum space_t
-{
-    kspace_id,
-    rspace_id
-};
-
+/// @brief enum to indicate whether a grid is currently in real or k-space
+enum space_t { kspace_id, rspace_id };

 #ifdef USE_MPI
-template <typename data_t_, bool bdistributed=true>
+#define GRID_FFT_DISTRIBUTED true
 #else
-template <typename data_t_, bool bdistributed=false>
+#define GRID_FFT_DISTRIBUTED false
 #endif
+
+/// @brief class for FFTable grids
+/// @tparam data_t_ data type
+/// @tparam bdistributed flag to indicate whether this grid is distributed in memory
+template <typename data_t_, bool bdistributed=GRID_FFT_DISTRIBUTED>
 class Grid_FFT
 {
 public:
-    using data_t = data_t_;
-    static constexpr bool is_distributed_trait{bdistributed};
+    using data_t = data_t_; ///< data type
+    static constexpr bool is_distributed_trait{bdistributed}; ///< flag to indicate whether this grid is distributed in memory

 protected:
-    using grid_fft_t = Grid_FFT<data_t,bdistributed>;
+    using grid_fft_t = Grid_FFT<data_t,bdistributed>; ///< type of this grid
    
 public:
    std::array<size_t, 3> n_, nhalf_;
@ -68,7 +69,11 @@ public:
    ptrdiff_t local_0_start_, local_1_start_;
    ptrdiff_t local_0_size_, local_1_size_;

-    //! constructor for FTable grid object
+    /// @brief constructor for FTable grid object
+    /// @param N number of grid points in each dimension
+    /// @param L physical size of the grid in each dimension
+    /// @param allocate flag to indicate whether to allocate memory for the grid
+    /// @param initialspace flag to indicate whether the grid is initially in real or k-space
    Grid_FFT(const std::array<size_t, 3> &N, const std::array<real_t, 3> &L, bool allocate = true, space_t initialspace = rspace_id)
        : n_(N), length_(L), space_(initialspace), data_(nullptr), cdata_(nullptr), plan_(nullptr), iplan_(nullptr), ballocated_( false )
    {
@ -77,11 +82,16 @@ public:
        }
    }

-    // avoid implicit copying of data
+    /// @brief copy constructor [deleted] -- to avoid implicit copying of data
    Grid_FFT(const grid_fft_t &g) = delete;

+    /// @brief assignment operator [deleted] -- to avoid implicit copying of data
+    grid_fft_t &operator=(const grid_fft_t &g) = delete;
+
+    /// @brief destructor
    ~Grid_FFT() { reset(); }

+    /// @brief reset grid object (free memory, etc.)
    void reset()
    {
        if (data_ != nullptr)  { FFTW_API(free)(data_); data_ = nullptr; }
@ -90,13 +100,18 @@ public:
        ballocated_ = false;
    }

+    /// @brief return the grid object for a given refinement level [dummy implementation for backward compatibility with MUSIC1]
    const grid_fft_t *get_grid(size_t ilevel) const { return this; }

-    //! return if grid object is 
+    /// @brief return if grid object is distributed in memory
+    /// @return true if grid object is distributed in memory
    bool is_distributed( void ) const noexcept { return bdistributed; }

+    /// @brief allocate memory for grid object
    void allocate();

+    /// @brief return if grid object is allocated
+    /// @return true if grid object is allocated
    bool is_allocated( void ) const noexcept { return ballocated_; }

    //! return the number of data_t elements that we store in the container
--- a/include/grid_ghosts.hh
+++ b/include/grid_ghosts.hh
@ -0,0 +1,228 @@
+// This file is part of monofonIC (MUSIC2)
+// A software package to generate ICs for cosmological simulations
+// Copyright (C) 2022 by Oliver Hahn
+// 
+// monofonIC is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// 
+// monofonIC is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#pragma once
+
+#include <array>
+#include <vector>
+#include <numeric>
+
+#include <general.hh>
+#include <math/vec3.hh>
+
+/// @brief implements a wrapper class for grids with ghost zones for MPI communication (slab decomposition)
+/// @tparam numghosts number of ghost zones on each side
+/// @tparam haveleft flag whether to have left ghost zone
+/// @tparam haveright flag whether to have right ghost zone
+/// @tparam grid_t grid type to wrap
+template <int numghosts, bool haveleft, bool haveright, typename grid_t>
+struct grid_with_ghosts
+{
+  using data_t = typename grid_t::data_t;
+  using vec3 = std::array<real_t, 3>;
+
+  static constexpr bool is_distributed_trait = grid_t::is_distributed_trait;
+  static constexpr int num_ghosts = numghosts;
+  static constexpr bool have_left = haveleft, have_right = haveright;
+
+  std::vector<data_t> boundary_left_, boundary_right_;
+  std::vector<int> local0starts_;
+  const grid_t &gridref;
+  size_t nx_, ny_, nz_, nzp_;
+
+
+  //... determine communication offsets
+  std::vector<ptrdiff_t> offsets_, sizes_;
+
+  /// @brief get task index for a given index
+  /// @param index index
+  /// @return task index
+  int get_task(ptrdiff_t index) const
+  {
+    int itask = 0;
+    while (itask < MPI::get_size() - 1 && offsets_[itask + 1] <= index)
+        ++itask;
+    return itask;
+  }
+
+  /// @brief constructor for grid with ghosts
+  /// @param g grid to wrap
+  explicit grid_with_ghosts(const grid_t &g)
+  : gridref(g), nx_(g.n_[0]), ny_(g.n_[1]), nz_(g.n_[2]), nzp_(g.n_[2]+2)
+  {
+    if (is_distributed_trait)
+    {
+      int ntasks(MPI::get_size());
+
+      offsets_.assign(ntasks+1, 0);
+      sizes_.assign(ntasks, 0);
+      
+      MPI_Allgather(&g.local_0_size_, 1, MPI_LONG_LONG, &sizes_[0], 1,
+                      MPI_LONG_LONG, MPI_COMM_WORLD);
+      MPI_Allgather(&g.local_0_start_, 1, MPI_LONG_LONG, &offsets_[0], 1,
+                      MPI_LONG_LONG, MPI_COMM_WORLD);
+      
+      for( int i=0; i< CONFIG::MPI_task_size; i++ ){
+          if( offsets_[i+1] < offsets_[i] + sizes_[i] ) offsets_[i+1] = offsets_[i] + sizes_[i];
+      }
+
+      update_ghosts_allow_multiple( g );
+    }
+  }
+
+  /// @brief update ghost zones via MPI communication
+  /// @param g grid to wrap
+  void update_ghosts_allow_multiple( const grid_t &g )
+  {
+  #if defined(USE_MPI)
+    //... exchange boundary
+    if( have_left  ) boundary_left_.assign(num_ghosts * ny_ * nzp_, data_t{0.0});
+    if( have_right ) boundary_right_.assign(num_ghosts * ny_ * nzp_, data_t{0.0});
+
+    size_t slicesz = ny_ * nzp_;
+
+    MPI_Status status;
+    std::vector<MPI_Request> req;
+    MPI_Request temp_req;
+
+    if( have_right ){
+      for( int itask=0; itask<CONFIG::MPI_task_size; ++itask ){
+        for( size_t i=0; i<num_ghosts; ++i ){
+          ptrdiff_t iglobal_request = (offsets_[itask] + sizes_[itask] + i) % g.n_[0];
+          if( iglobal_request >= g.local_0_start_ && iglobal_request < g.local_0_start_ + g.local_0_size_ ){
+            size_t ii = iglobal_request - g.local_0_start_;
+            MPI_Isend( &g.relem(ii*slicesz), slicesz, MPI::get_datatype<data_t>(), itask, iglobal_request, MPI_COMM_WORLD, &temp_req);
+            req.push_back(temp_req);
+          }
+        }
+      }
+
+      //--- receive data ------------------------------------------------------------
+      #pragma omp parallel if(CONFIG::MPI_threads_ok)
+      {
+          MPI_Status status;
+
+          #pragma omp for 
+          for( size_t i=0; i<num_ghosts; ++i ){
+            ptrdiff_t iglobal_request = (g.local_0_start_ + g.local_0_size_ + i) % g.n_[0];
+          
+            int recvfrom = get_task(iglobal_request);
+
+            //#pragma omp critical // need critical region here if we do "MPI_THREAD_FUNNELED", 
+            {
+                // receive data slice and check for MPI errors when in debug mode
+                status.MPI_ERROR = MPI_SUCCESS;
+                MPI_Recv(&boundary_right_[i*slicesz], (int)slicesz, MPI::get_datatype<data_t>(), recvfrom, (int)iglobal_request, MPI_COMM_WORLD, &status);
+                assert(status.MPI_ERROR == MPI_SUCCESS);
+            }
+          }
+      }
+    }
+
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    if( have_left ){
+      for( int itask=0; itask<CONFIG::MPI_task_size; ++itask ){
+        for( size_t i=0; i<num_ghosts; ++i ){
+          ptrdiff_t iglobal_request = (offsets_[itask] + g.n_[0] - num_ghosts + i) % g.n_[0];
+          if( iglobal_request >= g.local_0_start_ && iglobal_request < g.local_0_start_ + g.local_0_size_ ){
+            size_t ii = iglobal_request - g.local_0_start_;
+            MPI_Isend( &g.relem(ii*slicesz), slicesz, MPI::get_datatype<data_t>(), itask, iglobal_request, MPI_COMM_WORLD, &temp_req);
+            req.push_back(temp_req);
+          }
+        }
+      }
+
+      //--- receive data ------------------------------------------------------------
+      #pragma omp parallel if(CONFIG::MPI_threads_ok)
+      {
+          MPI_Status status;
+
+          #pragma omp for 
+          for( size_t i=0; i<num_ghosts; ++i ){
+            ptrdiff_t iglobal_request = (g.local_0_start_ + g.n_[0] - num_ghosts + i) % g.n_[0];
+          
+            int recvfrom = get_task(iglobal_request);
+
+            //#pragma omp critical // need critical region here if we do "MPI_THREAD_FUNNELED", 
+            {
+                // receive data slice and check for MPI errors when in debug mode
+                status.MPI_ERROR = MPI_SUCCESS;
+                MPI_Recv(&boundary_left_[i*slicesz], (int)slicesz, MPI::get_datatype<data_t>(), recvfrom, (int)iglobal_request, MPI_COMM_WORLD, &status);
+                assert(status.MPI_ERROR == MPI_SUCCESS);
+            }
+          }
+      }
+    }
+
+    MPI_Barrier( MPI_COMM_WORLD );
+    
+    for (size_t i = 0; i < req.size(); ++i)
+    {
+        // need to set status as wait does not necessarily modify it
+        // c.f. http://www.open-mpi.org/community/lists/devel/2007/04/1402.php
+        status.MPI_ERROR = MPI_SUCCESS;
+        // std::cout << "task " << CONFIG::MPI_task_rank << " : checking request No" << i << std::endl;
+        int flag(1);
+        MPI_Test(&req[i], &flag, &status);
+        if( !flag ){
+            std::cout << "task " << CONFIG::MPI_task_rank << " : request No" << i << " unsuccessful" << std::endl;
+        }
+
+        MPI_Wait(&req[i], &status);
+        // std::cout << "---> ok!" << std::endl;
+        assert(status.MPI_ERROR == MPI_SUCCESS);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    
+    #endif
+  }
+
+  /// @brief return the element at position (i,j,k) in the grid
+  /// @param i index in x direction
+  /// @param j index in y direction
+  /// @param k index in z direction
+  /// @return grid element at position (i,j,k)
+  data_t relem(const ptrdiff_t& i, const ptrdiff_t& j, const ptrdiff_t&k ) const noexcept
+  {
+    return this->relem({i,j,k});
+  }
+
+  /// @brief return the element at position (i,j,k) in the grid
+  /// @param pos position in the grid (array of size 3: {i,j,k})
+  /// @return grid element at position (i,j,k)
+  data_t relem(const std::array<ptrdiff_t, 3> &pos) const noexcept
+  {
+    const ptrdiff_t ix = pos[0];
+    const ptrdiff_t iy = (pos[1]+gridref.n_[1])%gridref.n_[1];
+    const ptrdiff_t iz = (pos[2]+gridref.n_[2])%gridref.n_[2];
+
+    if( is_distributed_trait ){
+      const ptrdiff_t localix = ix;
+      if( localix < 0 ){
+        return boundary_left_[((localix+num_ghosts)*ny_+iy)*nzp_+iz];
+      }else if( localix >= gridref.local_0_size_ ){
+        return boundary_right_[((localix-gridref.local_0_size_)*ny_+iy)*nzp_+iz];
+      }else{
+        return gridref.relem(localix, iy, iz);
+      }
+    }
+
+    return gridref.relem((ix+gridref.n_[0])%gridref.n_[0], iy, iz);
+  }
+};
+
--- a/include/grid_interpolate.hh
+++ b/include/grid_interpolate.hh
@ -175,13 +175,14 @@ struct grid_interpolate

      MPI_Alltoall(&sendcounts[0], 1, MPI_INT, &recvcounts[0], 1, MPI_INT, MPI_COMM_WORLD);

-      size_t tot_receive = recvcounts[0], tot_send = sendcounts[0];
+      size_t tot_receive = recvcounts[0];
+//      size_t tot_send = sendcounts[0];
      for (int i = 1; i < MPI::get_size(); ++i)
      {
        sendoffsets[i] = sendcounts[i - 1] + sendoffsets[i - 1];
        recvoffsets[i] = recvcounts[i - 1] + recvoffsets[i - 1];
        tot_receive += recvcounts[i];
-        tot_send += sendcounts[i];
+//        tot_send += sendcounts[i];
      }

      std::vector<vec3> recvbuf(tot_receive/3,{0.,0.,0.});
--- a/include/math/interpolate.hh
+++ b/include/math/interpolate.hh
@ -22,16 +22,22 @@
 #include <gsl/gsl_spline.h>
 #include <gsl/gsl_errno.h>

+
+/// @brief 1D interpolation class
+/// @tparam logx static flag to indicate logarithmic interpolation in x
+/// @tparam logy static flag to indicate logarithmic interpolation in y
+/// @tparam periodic static flag to indicate periodic interpolation in x
 template <bool logx, bool logy, bool periodic>
 class interpolated_function_1d
 {

 private:
-  bool isinit_;
-  std::vector<double> data_x_, data_y_;
-  gsl_interp_accel *gsl_ia_;
-  gsl_spline *gsl_sp_;
+  bool isinit_; ///< flag to indicate whether the interpolation has been initialized
+  std::vector<double> data_x_, data_y_; ///< data vectors
+  gsl_interp_accel *gsl_ia_; ///< GSL interpolation accelerator
+  gsl_spline *gsl_sp_; ///< GSL spline object

+  /// @brief deallocate GSL objects
  void deallocate()
  {
    gsl_spline_free(gsl_sp_);
@ -39,10 +45,16 @@ private:
  }

 public:
+
+  /// @brief default copy constructor (deleted)
  interpolated_function_1d(const interpolated_function_1d &) = delete;

+  /// @brief empty constructor (without data)
  interpolated_function_1d() : isinit_(false){}

+  /// @brief constructor with data
+  /// @param data_x x data vector
+  /// @param data_y y data vector
  interpolated_function_1d(const std::vector<double> &data_x, const std::vector<double> &data_y)
  : isinit_(false)
  {
@ -50,11 +62,15 @@ public:
    this->set_data( data_x, data_y );
  }

+  /// @brief destructor
  ~interpolated_function_1d()
  {
    if (isinit_) this->deallocate();
  }

+  /// @brief set data
+  /// @param data_x x data vector
+  /// @param data_y y data vector
  void set_data(const std::vector<double> &data_x, const std::vector<double> &data_y)
  {
    data_x_ = data_x;
@ -75,6 +91,9 @@ public:
    isinit_ = true;
  }

+  /// @brief evaluate the interpolation
+  /// @param x x value
+  /// @return y value
  double operator()(double x) const noexcept
  {
    assert( isinit_ && !(logx&&x<=0.0) );
--- a/include/math/mat3.hh
+++ b/include/math/mat3.hh
@ -22,18 +22,20 @@

 #include <math/vec3.hh>

-//! class for 3x3 matrix calculations
+/// @brief class for 3x3 matrix calculations
+/// @tparam T type of matrix elements
 template<typename T>
 class mat3_t{
 protected:
-    std::array<T,9> data_;
-    std::array<double,9> data_double_;
-    gsl_matrix_view m_;
-    gsl_vector *eval_;
-    gsl_matrix *evec_;
-	gsl_eigen_symmv_workspace * wsp_;
-    bool bdid_alloc_gsl_;
+    std::array<T,9> data_; //< data array
+    std::array<double,9> data_double_; //< data array for GSL operations
+    gsl_matrix_view m_; //< GSL matrix view
+    gsl_vector *eval_; //< GSL eigenvalue vector
+    gsl_matrix *evec_; //< GSL eigenvector matrix
+	gsl_eigen_symmv_workspace * wsp_; //< GSL workspace
+    bool bdid_alloc_gsl_; //< flag to indicate whether GSL memory has been allocated
 						
+    /// @brief initialize GSL memory
    void init_gsl(){
        // allocate memory for GSL operations if we haven't done so yet
        if( !bdid_alloc_gsl_ )
@ -54,6 +56,7 @@ protected:
        }
    }

+    /// @brief free GSL memory
    void free_gsl(){
        // free memory for GSL operations if it was allocated
        if( bdid_alloc_gsl_ )
@ -66,54 +69,76 @@ protected:

 public:

+    /// @brief default constructor
    mat3_t()
    : bdid_alloc_gsl_(false) 
    {}

-    //! copy constructor
+    /// @brief copy constructor
+    /// @param m matrix to copy
    mat3_t( const mat3_t<T> &m)
    : data_(m.data_), bdid_alloc_gsl_(false) 
    {}
    
-    //! move constructor
+    /// @brief move constructor
+    /// @param m matrix to move
    mat3_t( mat3_t<T> &&m)
    : data_(std::move(m.data_)), bdid_alloc_gsl_(false) 
    {}

-    //! construct mat3_t from initializer list
+    /// @brief construct mat3_t from initializer list
+    /// @param e initializer list
    template<typename ...E>
    mat3_t(E&&...e) 
    : data_{{std::forward<E>(e)...}}, bdid_alloc_gsl_(false)
    {}

+    /// @brief assignment operator
+    /// @param m matrix to copy
+    /// @return reference to this
    mat3_t<T>& operator=(const mat3_t<T>& m) noexcept{
        data_ = m.data_;
        return *this;
    }

+    /// @brief move assignment operator
+    /// @param m matrix to move
+    /// @return reference to this
    mat3_t<T>& operator=(const mat3_t<T>&& m) noexcept{
        data_ = std::move(m.data_);
        return *this;
    }

-    //! destructor
+    /// @brief destructor
    ~mat3_t(){
        this->free_gsl();
    }
    
-    //! bracket index access to vector components
+    /// @brief bracket index access to flattened matrix components
+    /// @param i index
+    /// @return reference to i-th component
    T &operator[](size_t i) noexcept { return data_[i];}
    
-    //! const bracket index access to vector components
+    /// @brief const bracket index access to flattened matrix components
+    /// @param i index
+    /// @return const reference to i-th component
    const T &operator[](size_t i) const noexcept { return data_[i]; }

-    //! matrix 2d index access
+    /// @brief matrix 2d index access
+    /// @param i row index
+    /// @param j column index
+    /// @return reference to (i,j)-th component
    T &operator()(size_t i, size_t j) noexcept { return data_[3*i+j]; }

-    //! const matrix 2d index access
+    /// @brief const matrix 2d index access
+    /// @param i row index
+    /// @param j column index
+    /// @return const reference to (i,j)-th component
    const T &operator()(size_t i, size_t j) const noexcept { return data_[3*i+j]; }

-    //! in-place addition
+    /// @brief in-place addition
+    /// @param rhs matrix to add
+    /// @return reference to this
    mat3_t<T>& operator+=( const mat3_t<T>& rhs ) noexcept{
        for (size_t i = 0; i < 9; ++i) {
           (*this)[i] += rhs[i];
@ -121,7 +146,9 @@ public:
        return *this;
    }

-    //! in-place subtraction
+    /// @brief in-place subtraction
+    /// @param rhs matrix to subtract
+    /// @return reference to this
    mat3_t<T>& operator-=( const mat3_t<T>& rhs ) noexcept{
        for (size_t i = 0; i < 9; ++i) {
           (*this)[i] -= rhs[i];
@ -129,10 +156,16 @@ public:
        return *this;
    }

+    /// @brief zeroing of matrix
    void zero() noexcept{
        for (size_t i = 0; i < 9; ++i) data_[i]=0;
    }

+    /// @brief compute eigenvalues and eigenvectors
+    /// @param evals eigenvalues
+    /// @param evec1 first eigenvector
+    /// @param evec2 second eigenvector
+    /// @param evec3 third eigenvector
    void eigen( vec3_t<T>& evals, vec3_t<T>& evec1, vec3_t<T>& evec2, vec3_t<T>& evec3_t )
    {
        this->init_gsl();
@ -149,6 +182,11 @@ public:
    }
 };

+/// @brief matrix addition
+/// @tparam T type of matrix components
+/// @param lhs left hand side matrix
+/// @param rhs right hand side matrix
+/// @return matrix result = lhs + rhs
 template<typename T>
 constexpr const mat3_t<T> operator+(const mat3_t<T> &lhs, const mat3_t<T> &rhs) noexcept
 {
@ -159,7 +197,11 @@ constexpr const mat3_t<T> operator+(const mat3_t<T> &lhs, const mat3_t<T> &rhs)
    return result;
 }

-// matrix - vector multiplication
+/// @brief matrix - vector multiplication
+/// @tparam T type of matrix and vector components
+/// @param A matrix
+/// @param v vector
+/// @return vector result = A*v
 template<typename T>
 inline vec3_t<T> operator*( const mat3_t<T> &A, const vec3_t<T> &v ) noexcept
 {
--- a/include/math/vec3.hh
+++ b/include/math/vec3.hh
@ -17,7 +17,8 @@

 #pragma once

-//! implements a simple class of 3-vectors of arbitrary scalar type
+/// @brief implements a simple class of 3-vectors of arbitrary scalar type
+/// @tparam T scalar type
 template< typename T >
 class vec3_t{
 private:
@ -28,29 +29,35 @@ public:
    //! expose access to elements via references
    T &x,&y,&z;

-    //! empty constructor
+    /// @brief empty constructor
    vec3_t()
-    : x(data_[0]),y(data_[1]),z(data_[2]){}
+    : data_{{T(0),T(0),T(0)}},x(data_[0]),y(data_[1]),z(data_[2]){}

-    //! copy constructor
+    /// @brief copy constructor
+    /// @param v vector to copy from
    vec3_t( const vec3_t<T> &v)
    : data_(v.data_), x(data_[0]),y(data_[1]),z(data_[2]){}

-    //! copy constructor for non-const reference, needed to avoid variadic template being called for non-const reference
+    /// @brief copy constructor for non-const reference, needed to avoid variadic template being called for non-const reference
+    /// @param v vector to copy from
    vec3_t( vec3_t<T>& v)
    : data_(v.data_), x(data_[0]),y(data_[1]),z(data_[2]){}

-    //! move constructor
+    /// @brief move constructor
+    /// @param v vector to move from
    vec3_t( vec3_t<T> &&v)
    : data_(std::move(v.data_)), x(data_[0]), y(data_[1]), z(data_[2]){}

    //! construct vec3_t from initializer list
-    template<typename ...E>
-    vec3_t(E&&...e) 
-    : data_{{std::forward<E>(e)...}}, x{data_[0]}, y{data_[1]}, z{data_[2]}
-    {}
-    // vec3_t( T a, T b, T c ) 
-    // : data_{{a,b,c}}, x(data_[0]), y(data_[1]), z(data_[2]){}
+//    template<typename ...E>
+//    vec3_t(E&&...e) 
+//    : data_{{std::forward<E>(e)...}}, x{data_[0]}, y{data_[1]}, z{data_[2]}
+//    {}
+    vec3_t( T a, T b, T c )
+        : data_{{a,b,c}}, x(data_[0]), y(data_[1]), z(data_[2]){}
+  
+    explicit vec3_t( T a)
+        : data_{{a,a,a}}, x(data_[0]), y(data_[1]), z(data_[2]){}
    
    //! bracket index access to vector components
    T &operator[](size_t i) noexcept{ return data_[i];}
--- a/include/memory_stat.hh
+++ b/include/memory_stat.hh
@ -0,0 +1,125 @@
+/*
+ * Author:  David Robert Nadeau
+ * Site:    http://NadeauSoftware.com/
+ * License: Creative Commons Attribution 3.0 Unported License
+ *          http://creativecommons.org/licenses/by/3.0/deed.en_US
+ */
+#pragma once
+
+namespace memory
+{
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <psapi.h>
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+#include <unistd.h>
+#include <sys/resource.h>
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <mach/mach.h>
+
+#elif (defined(_AIX) || defined(__TOS__AIX__)) || (defined(__sun__) || defined(__sun) || defined(sun) && (defined(__SVR4) || defined(__svr4__)))
+#include <fcntl.h>
+#include <procfs.h>
+
+#elif defined(__linux__) || defined(__linux) || defined(linux) || defined(__gnu_linux__)
+#include <stdio.h>
+
+#endif
+
+#else
+#error "Cannot define getPeakRSS( ) or getCurrentRSS( ) for an unknown OS."
+#endif
+
+
+/**
+ * Returns the peak (maximum so far) resident set size (physical
+ * memory use) measured in bytes, or zero if the value cannot be
+ * determined on this OS.
+ */
+inline size_t getPeakRSS( )
+{
+#if defined(_WIN32)
+    /* Windows -------------------------------------------------- */
+    PROCESS_MEMORY_COUNTERS info;
+    GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) );
+    return (size_t)info.PeakWorkingSetSize;
+
+#elif (defined(_AIX) || defined(__TOS__AIX__)) || (defined(__sun__) || defined(__sun) || defined(sun) && (defined(__SVR4) || defined(__svr4__)))
+    /* AIX and Solaris ------------------------------------------ */
+    struct psinfo psinfo;
+    int fd = -1;
+    if ( (fd = open( "/proc/self/psinfo", O_RDONLY )) == -1 )
+        return (size_t)0L;      /* Can't open? */
+    if ( read( fd, &psinfo, sizeof(psinfo) ) != sizeof(psinfo) )
+    {
+        close( fd );
+        return (size_t)0L;      /* Can't read? */
+    }
+    close( fd );
+    return (size_t)(psinfo.pr_rssize * 1024L);
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+    /* BSD, Linux, and OSX -------------------------------------- */
+    struct rusage rusage;
+    getrusage( RUSAGE_SELF, &rusage );
+#if defined(__APPLE__) && defined(__MACH__)
+    return (size_t)rusage.ru_maxrss;
+#else
+    return (size_t)(rusage.ru_maxrss * 1024L);
+#endif
+
+#else
+    /* Unknown OS ----------------------------------------------- */
+    return (size_t)0L;          /* Unsupported. */
+#endif
+}
+
+
+
+
+
+/**
+ * Returns the current resident set size (physical memory use) measured
+ * in bytes, or zero if the value cannot be determined on this OS.
+ */
+inline size_t getCurrentRSS( )
+{
+#if defined(_WIN32)
+    /* Windows -------------------------------------------------- */
+    PROCESS_MEMORY_COUNTERS info;
+    GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) );
+    return (size_t)info.WorkingSetSize;
+
+#elif defined(__APPLE__) && defined(__MACH__)
+    /* OSX ------------------------------------------------------ */
+    struct mach_task_basic_info info;
+    mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+    if ( task_info( mach_task_self( ), MACH_TASK_BASIC_INFO,
+        (task_info_t)&info, &infoCount ) != KERN_SUCCESS )
+        return (size_t)0L;      /* Can't access? */
+    return (size_t)info.resident_size;
+
+#elif defined(__linux__) || defined(__linux) || defined(linux) || defined(__gnu_linux__)
+    /* Linux ---------------------------------------------------- */
+    long rss = 0L;
+    FILE* fp = NULL;
+    if ( (fp = fopen( "/proc/self/statm", "r" )) == NULL )
+        return (size_t)0L;      /* Can't open? */
+    if ( fscanf( fp, "%*s%ld", &rss ) != 1 )
+    {
+        fclose( fp );
+        return (size_t)0L;      /* Can't read? */
+    }
+    fclose( fp );
+    return (size_t)rss * (size_t)sysconf( _SC_PAGESIZE);
+
+#else
+    /* AIX, BSD, Solaris, and Unknown OS ------------------------ */
+    return (size_t)0L;          /* Unsupported. */
+#endif
+}
+
+};
--- a/include/particle_generator.hh
+++ b/include/particle_generator.hh
@ -17,6 +17,7 @@
 #pragma once

 #include <math/vec3.hh>
+#include <grid_ghosts.hh>
 #include <grid_interpolate.hh>

 #if defined(USE_HDF5)
@ -29,13 +30,20 @@ namespace particle

    enum lattice
    {
-        lattice_glass = -1,
+        lattice_masked = -2, // masked lattices are SC lattices with a specifiable mask leaving out particles
+        lattice_glass = -1,  // glass: needs a glass file
        lattice_sc = 0,  // SC : simple cubic
        lattice_bcc = 1, // BCC: body-centered cubic
        lattice_fcc = 2, // FCC: face-centered cubic
        lattice_rsc = 3, // RSC: refined simple cubic
    };

+    // const std::vector<std::vector<bool>> lattice_masks =
+    // {
+    //     // mask from Richings et al. https://arxiv.org/pdf/2005.14495.pdf
+    //     {true,true,true,true,true,true,true,false},
+    // };
+
    const std::vector<std::vector<vec3_t<real_t>>> lattice_shifts =
        {
            // first shift must always be zero! (otherwise set_positions and set_velocities break)
@ -149,20 +157,75 @@ namespace particle

        private:
        particle::container particles_;
+        size_t global_num_particles_;
+
+        static constexpr int masksize_ = 2;
+        std::array<int, masksize_*masksize_*masksize_> particle_type_mask_;
+
+        inline int get_mask_value( const vec3_t<size_t>& global_idx_3d ) const
+        {
+            int sig = ((global_idx_3d[0]%masksize_)*masksize_
+                       +global_idx_3d[1]%masksize_)*masksize_
+                       +global_idx_3d[2]%masksize_;
+            return particle_type_mask_[sig];
+        }
+
+        template< typename ggrid_t >
+        inline real_t get_mean_mask_value( const ggrid_t& gg_field, const vec3_t<size_t>& global_idx_3d, size_t i, size_t j, size_t k, int lattice_index ) const
+        {
+            ptrdiff_t ox = (ptrdiff_t)i-(ptrdiff_t)(global_idx_3d[0]%masksize_);
+            ptrdiff_t oy = (ptrdiff_t)j-(ptrdiff_t)(global_idx_3d[1]%masksize_);
+            ptrdiff_t oz = (ptrdiff_t)k-(ptrdiff_t)(global_idx_3d[2]%masksize_);
+            size_t count_full{0}, count_masked{0};
+            real_t mean_full{0.0}, mean_masked{0.0};
+            for( size_t i=0; i<masksize_; ++i ){
+                for( size_t j=0; j<masksize_; ++j ){
+                    for( size_t k=0; k<masksize_; ++k ){
+                        mean_full += gg_field.relem( ox+i, oy+j, oz+k );
+                        ++count_full;
+                        if( particle_type_mask_[4*i+2*j+k] == lattice_index ){
+                            mean_masked += gg_field.relem( ox+i, oy+j, oz+k );
+                            ++count_masked;
+                        }
+                    }
+                }
+            }
+            mean_full /= count_full;
+            mean_masked /= count_masked;
+            return mean_masked - mean_full;
+        }

        public:
-        lattice_generator(lattice lattice_type, const bool b64reals, const bool b64ids, const bool bwithmasses, size_t IDoffset, const field_t &field, config_file &cf)
-        {
-            if (lattice_type != lattice_glass)
-            {
-                music::wlog << "Glass ICs will currently be incorrect due to disabled ghost zone updates! ";
+        /**
+         * @brief Construct a new lattice generator object
+         * 
+         * @param lattice_type Type of the lattice (currently: 0=SC, 1=BCC, 2=FCC, 3=double SC, -1=glass, -2=masked SC)
+         * @param lattice_index Index of the 'atom type', i.e. whether CDM, baryons, ... (currently only supports 0 or 1)
+         * @param b64reals Boolean whether 64bit floating point shall be used to store positions/velocities/masses
+         * @param b64ids  Boolean whether 64bit integers should be used for IDS
+         * @param bwithmasses Boolean whether distinct masses for each particle shall be stored
+         * @param IDoffset The global ID offset to be applied to this generation of particles
+         * @param field Reference to the field from which particles shall be generated (used only to set dimensions)
+         * @param cf Reference to the config_file object
+         */

+        lattice_generator(lattice lattice_type, int lattice_index, const bool b64reals, const bool b64ids, const bool bwithmasses, size_t IDoffset, const field_t &field, config_file &cf)
+        : global_num_particles_(0)
+        {
+            // initialise the particle mask with zeros (only used if lattice_type==lattice_masked)
+            for( auto& m : particle_type_mask_) m = 0;
+
+            if (lattice_type >= 0) // These are the Bravais lattices
+            {
                // number of modes present in the field
                const size_t num_p_in_load = field.local_size();
                // unless SC lattice is used, particle number is a multiple of the number of modes (=num_p_in_load):
                const size_t overload = 1ull << std::max<int>(0, lattice_type); // 1 for sc, 2 for bcc, 4 for fcc, 8 for rsc
                // allocate memory for all local particles
                particles_.allocate(overload * num_p_in_load, b64reals, b64ids, bwithmasses);
+                // set the global number of particles for this lattice_type and lattice_index
+                global_num_particles_ = field.global_size() * overload;
+
                // set particle IDs to the Lagrangian coordinate (1D encoded) with additionally the field shift encoded as well

                IDoffset = IDoffset * overload * field.global_size();
@ -188,8 +251,99 @@ namespace particle
                    }
                }
            }
-            else
+            else if( lattice_type == lattice_masked )
            {
+                if( field.global_size()%8 != 0 ){
+                    music::elog << "For masked lattice type, linear field resolution must be a multiple of two." << std::endl;
+                    abort();
+                }
+
+                if( lattice_index > 1 || lattice_index < 0 ){
+                    music::elog << "For masked lattice type, lattice index must be 0 or 1." << std::endl;
+                    abort();
+                }
+
+                // set the particle mask 
+                if( cf.get_value_safe("setup","DoBaryons",false) )
+                {
+                    unsigned mask_type = cf.get_value_safe("setup","ParticleMaskType",3);
+                    // mask everywehere 0, except the last element
+                    for( auto& m : particle_type_mask_) m = -1;
+                    particle_type_mask_[0] = 0; // CDM at corner of unit cube
+                    if( mask_type & 1<<0 ) {
+                        // edge centers
+                        particle_type_mask_[1] = 0; // CDM
+                        particle_type_mask_[2] = 0; // CDM
+                        particle_type_mask_[4] = 0; // CDM
+                    }
+                    if( mask_type & 1<<1 ){
+                        // face centers
+                        particle_type_mask_[3] = 0; // CDM
+                        particle_type_mask_[5] = 0; // CDM
+                        particle_type_mask_[6] = 0; // CDM
+                    }
+                    particle_type_mask_[7] = 1; // baryon at cell center (aka opposite corner)
+                }else{
+                    // mask everywhere 0, all particle locations are occupied by CDM
+                    for( auto& m : particle_type_mask_) m = 0;
+                }
+
+                // count number of particles taking into account masking
+                size_t ipcount = 0;
+                for (size_t i = 0; i < field.rsize(0); ++i)
+                {
+                    for (size_t j = 0; j < field.rsize(1); ++j)
+                    {
+                        for (size_t k = 0; k < field.rsize(2); ++k)
+                        {
+                            if( this->get_mask_value(field.get_cell_idx_3d(i,j,k)) != lattice_index ) continue;
+                            ++ipcount;
+                        }
+                    }
+                }
+
+                // set global number of particles
+#if defined(USE_MPI)
+                MPI_Allreduce( &ipcount, &global_num_particles_, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD );
+#else
+                global_num_particles_ = ipcount;
+#endif
+
+                // number of modes present in the field
+                const size_t num_p_in_load = ipcount;
+
+                // allocate memory for all local particles
+                particles_.allocate(num_p_in_load, b64reals, b64ids, bwithmasses);
+
+                // set particle IDs to the Lagrangian coordinate (1D encoded) with additionally the field shift encoded as well
+                IDoffset = IDoffset * field.global_size();
+
+                for (size_t i = 0, ipcount = 0; i < field.rsize(0); ++i)
+                {
+                    for (size_t j = 0; j < field.rsize(1); ++j)
+                    {
+                        for (size_t k = 0; k < field.rsize(2); ++k)
+                        {
+                            if( this->get_mask_value(field.get_cell_idx_3d(i,j,k)) != lattice_index ) continue;
+
+                            if (b64ids)
+                            {
+                                particles_.set_id64(ipcount, IDoffset + field.get_cell_idx_1d(i, j, k));
+                            }
+                            else
+                            {
+                                particles_.set_id32(ipcount, IDoffset + field.get_cell_idx_1d(i, j, k));
+                            }
+                            ++ipcount;
+                        }
+                    }
+                }
+
+            }
+            else if( lattice_type == lattice_glass )
+            {
+                music::wlog << "Glass ICs will currently be incorrect due to disabled ghost zone updates! ";
+
                glass_ptr_ = std::make_unique<glass>( cf, field );
                particles_.allocate(glass_ptr_->size(), b64reals, b64ids, false);

@ -206,14 +360,24 @@ namespace particle
                    }
                }
            }
+
+            music::ilog << "Created Particles [" << lattice_index << "] : " << global_num_particles_ << std::endl;
+
        }

        // invalidates field, phase shifted to unspecified position after return
-        void set_masses(const lattice lattice_type, bool is_second_lattice, const real_t munit, const bool b64reals, field_t &field, config_file &cf)
+        void set_masses(const lattice lattice_type, int lattice_index, const real_t munit, const bool b64reals, field_t &field, config_file &cf)
        {
-            // works only for Bravais types
-            if (lattice_type >= 0)
+            // works only for Bravais types and masked type
+            assert( lattice_type>=0 || lattice_type==lattice_masked );
+
+            if (lattice_type >= 0) // Bravais lattices
            {
+                if( lattice_index > 1 || lattice_index < 0 ){
+                    music::elog << "For Bravais lattice type, lattice index must be 0 or 1." << std::endl;
+                    abort();
+                }
+
                const size_t overload = 1ull << std::max<int>(0, lattice_type); // 1 for sc, 2 for bcc, 4 for fcc, 8 for rsc
                const size_t num_p_in_load = field.local_size();
                const real_t pmeanmass = munit / real_t(field.global_size()* overload);
@ -225,7 +389,7 @@ namespace particle
                for (int ishift = 0; ishift < (1 << lattice_type); ++ishift)
                {
                    // if we are dealing with the secondary lattice, apply a global shift
-                    if (ishift == 0 && is_second_lattice)
+                    if (ishift == 0 && lattice_index > 0)
                    {
                        field.shift_field(second_lattice_shift[lattice_type]);
                    }
@ -261,7 +425,69 @@ namespace particle
                music::ilog << "Particle Mass :  mean/munit = " << mean_pm/munit  << " ; fractional RMS = " << std_pm / mean_pm * 100.0 << "%" << std::endl;
                if(std_pm / mean_pm > 0.1 ) music::wlog << "Particle mass perturbation larger than 10%, consider decreasing \n\t  the starting redshift or disabling baryon decaying modes." << std::endl;
                if(bmass_negative) music::elog << "Negative particle mass produced! Decrease the starting \n\t  redshift or disable baryon decaying modes!" << std::endl;
+            } else if( lattice_type == lattice_masked ) {
+                if( field.global_size()%8 != 0 ){
+                    music::elog << "For masked lattice type, linear field resolution must be a multiple of two." << std::endl;
+                    abort();
+                }

+                if( lattice_index > 1 || lattice_index < 0 ){
+                    music::elog << "For masked lattice type, lattice index must be 0 or 1." << std::endl;
+                    abort();
+                }
+
+                grid_with_ghosts<1, true, true, field_t> gg_field(field);
+
+                const real_t pmeanmass = munit / global_num_particles_;
+
+                bool bmass_negative = false;
+                real_t mean_pm = 0.0;//field.mean() * pmeanmass;
+                real_t std_pm  = 0.0; //field.std()  * pmeanmass;
+
+                // read out values from phase shifted field and set assoc. particle's value
+                for (size_t i = 0, ipcount = 0; i < field.size(0); ++i)
+                {
+                    for (size_t j = 0; j < field.size(1); ++j)
+                    {
+                        for (size_t k = 0; k < field.size(2); ++k)
+                        {
+                            const auto idx3 = field.get_cell_idx_3d(i,j,k);
+
+                            if( this->get_mask_value(idx3) != lattice_index ) continue;
+
+                            const auto mean_mask = this->get_mean_mask_value( gg_field, idx3, i, j, k, lattice_index );
+
+                            // get
+                            const auto pmass = pmeanmass * (field.relem(i, j, k) - mean_mask);
+
+                            // check for negative mass
+                            bmass_negative |= pmass<0.0;
+
+                            // set
+                            if (b64reals) particles_.set_mass64(ipcount++, pmass);
+                            else particles_.set_mass32(ipcount++, pmass);
+
+                            // statistics
+                            mean_pm += pmass;
+                            std_pm += pmass*pmass;
+                        }
+                    }
+                }
+                #if defined(USE_MPI)
+                {
+                    double local_mean_pm = mean_pm, local_std_pm = std_pm;
+                    MPI_Allreduce( &local_mean_pm, &mean_pm, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+                    MPI_Allreduce( &local_std_pm, &std_pm, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+                }
+                #endif
+                mean_pm /= global_num_particles_;
+                std_pm /= global_num_particles_;
+                std_pm -= mean_pm*mean_pm;
+                
+                // diagnostics
+                music::ilog << "Particle Mass :  mean/munit = " << mean_pm/munit  << " ; fractional RMS = " << std_pm / mean_pm * 100.0 << "%" << std::endl;
+                if(std_pm / mean_pm > 0.1 ) music::wlog << "Particle mass perturbation larger than 10%, consider decreasing \n\t  the starting redshift or disabling baryon decaying modes." << std::endl;
+                if(bmass_negative) music::elog << "Negative particle mass produced! Decrease the starting \n\t  redshift or disable baryon decaying modes!" << std::endl;
            }else{
                // should not happen
                music::elog << "Cannot have individual particle masses for glasses!" << std::endl;
@ -270,15 +496,21 @@ namespace particle
        }

        // invalidates field, phase shifted to unspecified position after return
-        void set_positions(const lattice lattice_type, bool is_second_lattice, int idim, real_t lunit, const bool b64reals, field_t &field, config_file &cf)
+        void set_positions(const lattice lattice_type, int lattice_index, int idim, real_t lunit, const bool b64reals, field_t &field, config_file &cf)
        {
-            if (lattice_type >= 0)
+            
+            if (lattice_type >= 0) // Bravais lattice
            {
+                if( lattice_index > 1 || lattice_index < 0 ){
+                    music::elog << "For Bravais lattice type, lattice index must be 0 or 1." << std::endl;
+                    abort();
+                }
+
                const size_t num_p_in_load = field.local_size();
                for (int ishift = 0; ishift < (1 << lattice_type); ++ishift)
                {
                    // if we are dealing with the secondary lattice, apply a global shift
-                    if (ishift == 0 && is_second_lattice)
+                    if (ishift == 0 && lattice_index==1)
                    {
                        field.shift_field(second_lattice_shift[lattice_type]);
                    }
@ -296,7 +528,7 @@ namespace particle
                        {
                            for (size_t k = 0; k < field.size(2); ++k)
                            {
-                                auto pos = field.template get_unit_r_shifted<real_t>(i, j, k, lattice_shifts[lattice_type][ishift] + (is_second_lattice ? second_lattice_shift[lattice_type] : vec3_t<real_t>{real_t(0.), real_t(0.), real_t(0.)}));
+                                auto pos = field.template get_unit_r_shifted<real_t>(i, j, k, lattice_shifts[lattice_type][ishift] + (lattice_index==1 ? second_lattice_shift[lattice_type] : vec3_t<real_t>{real_t(0.), real_t(0.), real_t(0.)}));
                                if (b64reals)
                                {
                                    particles_.set_pos64(ipcount++, idim, pos[idim] * lunit + field.relem(i, j, k));
@ -310,6 +542,38 @@ namespace particle
                    }
                }
            }
+            else if( lattice_type == lattice_masked ) 
+            {
+                if( field.global_size()%8 != 0 ){
+                    music::elog << "For masked lattice type, linear field resolution must be a multiple of two." << std::endl;
+                    abort();
+                }
+
+                if( lattice_index > 1 || lattice_index < 0 ){
+                    music::elog << "For masked lattice type, lattice index must be 0 or 1." << std::endl;
+                    abort();
+                }
+
+                for (size_t i = 0, ipcount = 0; i < field.size(0); ++i)
+                {
+                    for (size_t j = 0; j < field.size(1); ++j)
+                    {
+                        for (size_t k = 0; k < field.size(2); ++k)
+                        {
+                            if( this->get_mask_value(field.get_cell_idx_3d(i,j,k)) != lattice_index ) continue;
+
+                            // get position (in box units) of the current cell of 3d array 'field'
+                            auto pos = field.template get_unit_r<real_t>(i, j, k);
+                            // add the displacement to get the particle position
+                            if (b64reals){
+                                particles_.set_pos64(ipcount++, idim, pos[idim] * lunit + field.relem(i, j, k));
+                            }else{
+                                particles_.set_pos32(ipcount++, idim, pos[idim] * lunit + field.relem(i, j, k));
+                            }
+                        }
+                    }
+                }
+            }
            else
            {
                glass_ptr_->update_ghosts( field );
@ -330,15 +594,20 @@ namespace particle
            }
        }

-        void set_velocities(lattice lattice_type, bool is_second_lattice, int idim, const bool b64reals, field_t &field, config_file &cf)
+        void set_velocities(lattice lattice_type, int lattice_index, int idim, const bool b64reals, field_t &field, config_file &cf)
        {
-            if (lattice_type >= 0)
+            if (lattice_type >= 0) // Bravais lattice
            {
+                if( lattice_index > 1 || lattice_index < 0 ){
+                    music::elog << "For Bravais lattice type, lattice index must be 0 or 1." << std::endl;
+                    abort();
+                }
+
                const size_t num_p_in_load = field.local_size();
                for (int ishift = 0; ishift < (1 << lattice_type); ++ishift)
                {
                    // if we are dealing with the secondary lattice, apply a global shift
-                    if (ishift == 0 && is_second_lattice)
+                    if (ishift == 0 && lattice_index==1)
                    {
                        field.shift_field(second_lattice_shift[lattice_type]);
                    }
@ -368,6 +637,35 @@ namespace particle
                    }
                }
            }
+            else if( lattice_type == lattice_masked ) 
+            {
+                if( field.global_size()%8 != 0 ){
+                    music::elog << "For masked lattice type, linear field resolution must be a multiple of two." << std::endl;
+                    abort();
+                }
+
+                if( lattice_index > 1 || lattice_index < 0 ){
+                    music::elog << "For masked lattice type, lattice index must be 0 or 1." << std::endl;
+                    abort();
+                }
+
+                for (size_t i = 0, ipcount = 0; i < field.size(0); ++i)
+                {
+                    for (size_t j = 0; j < field.size(1); ++j)
+                    {
+                        for (size_t k = 0; k < field.size(2); ++k)
+                        {
+                            if( this->get_mask_value(field.get_cell_idx_3d(i,j,k)) != lattice_index ) continue;
+
+                            if (b64reals){
+                                particles_.set_vel64(ipcount++, idim, field.relem(i, j, k));
+                            }else{
+                                particles_.set_vel32(ipcount++, idim, field.relem(i, j, k));
+                            }
+                        }
+                    }
+                }
+            }
            else
            {
                glass_ptr_->update_ghosts( field );
--- a/include/particle_plt.hh
+++ b/include/particle_plt.hh
@ -421,7 +421,7 @@ private:
                    
                    if( !is_in(i,j,k,mat_invrecip)  ){
                        auto average_lv = [&]( const auto& t1, const auto& t2, const auto& t3, vec3_t<real_t>& v, vec3_t<real_t>& l ) {
-                            v = real_t(0.0); l = real_t(0.0);
+                            v = vec3_t<real_t>(0.0); l = vec3_t<real_t>(0.0);
                            int count(0);
                            
                            auto add_lv = [&]( auto it ) -> void {
@ -586,4 +586,4 @@ public:

 };

-}
+}
--- a/src/grid_fft.cc
+++ b/src/grid_fft.cc
@ -19,6 +19,24 @@
 #include <grid_fft.hh>
 #include <thread>

+#include "memory_stat.hh"
+
+void memory_report(void) 
+{
+    //... report memory usage
+    size_t curr_mem_high_mark = 0;
+    local_mem_high_mark = memory::getCurrentRSS();
+#if defined(USE_MPI)
+    MPI_Allreduce(&local_mem_high_mark, &curr_mem_high_mark, 1, MPI_UNSIGNED_LONG_LONG, MPI_MAX, MPI_COMM_WORLD);
+#else
+    curr_mem_high_mark = local_mem_high_mark;
+#endif
+    if( curr_mem_high_mark > 1.1*global_mem_high_mark ){
+        music::ilog << std::setw(57) << std::setfill(' ') << std::right << "mem high: " << std::setw(8) << curr_mem_high_mark/(1ull<<20) << " MBytes / task" << std::endl;
+        global_mem_high_mark = curr_mem_high_mark;
+    }
+}
+
 template <typename data_t, bool bdistributed>
 void Grid_FFT<data_t, bdistributed>::allocate(void)
 {
@ -175,6 +193,7 @@ void Grid_FFT<data_t, bdistributed>::allocate(void)
 #endif //// of #ifdef #else USE_MPI ////////////////////////////////////////////////////////////////////////////////////
    }
    ballocated_ = true;
+    memory_report();
 }

 template <typename data_t, bool bdistributed>
--- a/src/ic_generator.cc
+++ b/src/ic_generator.cc
@ -62,9 +62,9 @@ std::unique_ptr<cosmology::calculator>  the_cosmo_calc;
 */
 int initialise( config_file& the_config )
 {
-    the_random_number_generator = std::move(select_RNG_plugin(the_config));
+    the_random_number_generator = select_RNG_plugin(the_config);
    the_cosmo_calc              = std::make_unique<cosmology::calculator>(the_config);
-    the_output_plugin           = std::move(select_output_plugin(the_config, the_cosmo_calc));
+    the_output_plugin           = select_output_plugin(the_config, the_cosmo_calc);
    
    return 0;
 }
@ -108,6 +108,20 @@ int run( config_file& the_config )
    //! order of the LPT approximation 
    const int LPTorder = the_config.get_value_safe<double>("setup","LPTorder",100);

+    #if defined(USE_CONVOLVER_ORSZAG)
+        //! check if grid size is even for Orszag convolver
+        if( (ngrid%2 != 0) && (LPTorder>1) ){
+            music::elog << "ERROR: Orszag convolver for LPTorder>1 requires even grid size!" << std::endl;
+            throw std::runtime_error("Orszag convolver for LPTorder>1 requires even grid size!");
+            return 0;
+        }
+    #else
+        //! warn if Orszag convolver is not used
+        if( LPTorder>1 ){
+            music::wlog << "WARNING: LPTorder>1 requires USE_CONVOLVER_ORSZAG to be enabled to avoid aliased results!" << std::endl;
+        }
+    #endif
+
    //--------------------------------------------------------------------------------------------------------
    //! initialice particles on a bcc or fcc lattice instead of a standard sc lattice (doubles and quadruples the number of particles) 
    std::string lattice_str = the_config.get_value_safe<std::string>("setup","ParticleLoad","sc");
@ -116,19 +130,26 @@ int run( config_file& the_config )
        : ((lattice_str=="fcc")? particle::lattice_fcc 
        : ((lattice_str=="rsc")? particle::lattice_rsc 
        : ((lattice_str=="glass")? particle::lattice_glass
-        : particle::lattice_sc))));
+        : ((lattice_str=="masked")? particle::lattice_masked
+        : particle::lattice_sc)))));
+
+    music::ilog << "Using " << lattice_str << " lattice for particle load." << std::endl;

    //--------------------------------------------------------------------------------------------------------
    //! apply fixing of the complex mode amplitude following Angulo & Pontzen (2016) [https://arxiv.org/abs/1603.05253]
    const bool bDoFixing    = the_config.get_value_safe<bool>("setup", "DoFixing", false);
+    music::ilog << "Fixing of complex mode amplitudes is " << (bDoFixing?"enabled":"disabled") << std::endl;
+
    const bool bDoInversion = the_config.get_value_safe<bool>("setup", "DoInversion", false);
-    
+    music::ilog << "Inversion of the phase field is " << (bDoInversion?"enabled":"disabled") << std::endl;

    //--------------------------------------------------------------------------------------------------------
    //! do baryon ICs?
    const bool bDoBaryons = the_config.get_value_safe<bool>("setup", "DoBaryons", false );
+    music::ilog << "Baryon ICs are " << (bDoBaryons?"enabled":"disabled") << std::endl;
    //! enable also back-scaled decaying relative velocity mode? only first order!
    const bool bDoLinearBCcorr = the_config.get_value_safe<bool>("setup", "DoBaryonVrel", false);
+    music::ilog << "Baryon linear relative velocity mode is " << (bDoLinearBCcorr?"enabled":"disabled") << std::endl;
    // compute mass fractions 
    std::map< cosmo_species, double > Omega;
    if( bDoBaryons ){
@ -149,12 +170,12 @@ int run( config_file& the_config )
    //--------------------------------------------------------------------------------------------------------
    //! add beyond box tidal field modes following Schmidt et al. (2018) [https://arxiv.org/abs/1803.03274]
    bool bAddExternalTides = the_config.contains_key("cosmology", "LSS_aniso_lx") 
-                           & the_config.contains_key("cosmology", "LSS_aniso_ly") 
-                           & the_config.contains_key("cosmology", "LSS_aniso_lz");
+                           && the_config.contains_key("cosmology", "LSS_aniso_ly") 
+                           && the_config.contains_key("cosmology", "LSS_aniso_lz");

    if( bAddExternalTides && !(  the_config.contains_key("cosmology", "LSS_aniso_lx") 
-                               | the_config.contains_key("cosmology", "LSS_aniso_ly") 
-                               | the_config.contains_key("cosmology", "LSS_aniso_lz") ))
+                               || the_config.contains_key("cosmology", "LSS_aniso_ly") 
+                               || the_config.contains_key("cosmology", "LSS_aniso_lz") ))
    {
        music::elog << "Not all dimensions of LSS_aniso_l{x,y,z} specified! Will ignore external tidal field!" << std::endl;
        bAddExternalTides = false;
@ -353,10 +374,10 @@ int run( config_file& the_config )
    // phi = - delta / k^2

    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
-    music::ilog << "Generating LPT fields...." << std::endl;
+    music::ilog << "\n>>> Generating LPT fields.... <<<\n" << std::endl;

    double wtime = get_wtime();
-    music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(1) term" << std::flush;
+    music::ilog << std::setw(79) << std::setfill('.') << std::left << ">> Computing phi(1) term" << std::endl;

    phi.FourierTransformForward(false);
    phi.assign_function_of_grids_kdep([&](auto k, auto wn) {
@ -367,7 +388,7 @@ int run( config_file& the_config )

    phi.zero_DC_mode();

-    music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+    music::ilog << std::setw(70) << std::setfill(' ') << std::right << "took : " << std::setw(8) << get_wtime() - wtime << "s" << std::endl;

    //======================================================================
    //... compute 2LPT displacement potential ....
@ -378,7 +399,7 @@ int run( config_file& the_config )
        phi2.FourierTransformForward(false);
        
        wtime = get_wtime();
-        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(2) term" << std::flush;
+        music::ilog << std::setw(79) << std::setfill('.') << std::left << ">> Computing phi(2) term" << std::endl;
        Conv.convolve_SumOfHessians(phi, {0, 0}, phi, {1, 1}, {2, 2}, op::assign_to(phi2));
        Conv.convolve_Hessians(phi, {1, 1}, phi, {2, 2}, op::add_to(phi2));
        Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 1}, op::subtract_from(phi2));
@ -397,7 +418,7 @@ int run( config_file& the_config )
        }

        phi2.apply_InverseLaplacian();
-        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+        music::ilog << std::setw(70) << std::setfill(' ') << std::right << "took : " << std::setw(8) << get_wtime() - wtime << "s" << std::endl;

        if (bAddExternalTides)
        {
@ -418,19 +439,18 @@ int run( config_file& the_config )
        //... phi3 = phi3a - 10/7 phi3b
        //... 3a term ...
        wtime = get_wtime();
-        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(3a) term" << std::flush;
+        music::ilog << std::setw(79) << std::setfill('.') << std::left << ">> Computing phi(3a) term" << std::endl;
        Conv.convolve_Hessians(phi, {0, 0}, phi, {1, 1}, phi, {2, 2}, op::assign_to(phi3));
        Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 2}, phi, {1, 2}, op::multiply_add_to(phi3,2.0));
        Conv.convolve_Hessians(phi, {1, 2}, phi, {1, 2}, phi, {0, 0}, op::subtract_from(phi3));
        Conv.convolve_Hessians(phi, {0, 2}, phi, {0, 2}, phi, {1, 1}, op::subtract_from(phi3));
        Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 1}, phi, {2, 2}, op::subtract_from(phi3));
        // phi3a.apply_InverseLaplacian();
-        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+        music::ilog << std::setw(70) << std::setfill(' ') << std::right << "took : " << std::setw(8) << get_wtime() - wtime << "s" << std::endl;

        //... 3b term ...
        wtime = get_wtime();
-        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(3b) term" << std::flush;
-        // phi3b.FourierTransformForward(false);
+        music::ilog << std::setw(71) << std::setfill('.') << std::left << ">> Computing phi(3b) term" << std::endl;
        Conv.convolve_SumOfHessians(phi, {0, 0}, phi2, {1, 1}, {2, 2}, op::multiply_add_to(phi3,-5.0/7.0));
        Conv.convolve_SumOfHessians(phi, {1, 1}, phi2, {2, 2}, {0, 0}, op::multiply_add_to(phi3,-5.0/7.0));
        Conv.convolve_SumOfHessians(phi, {2, 2}, phi2, {0, 0}, {1, 1}, op::multiply_add_to(phi3,-5.0/7.0));
@ -438,12 +458,11 @@ int run( config_file& the_config )
        Conv.convolve_Hessians(phi, {0, 2}, phi2, {0, 2}, op::multiply_add_to(phi3,+10.0/7.0));
        Conv.convolve_Hessians(phi, {1, 2}, phi2, {1, 2}, op::multiply_add_to(phi3,+10.0/7.0));
        phi3.apply_InverseLaplacian();
-        //phi3b *= 0.5; // factor 1/2 from definition of phi(3b)!
-        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+        music::ilog << std::setw(70) << std::setfill(' ') << std::right << "took : " << std::setw(8) << get_wtime() - wtime << "s" << std::endl;

        //... transversal term ...
        wtime = get_wtime();
-        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing A(3) term" << std::flush;
+        music::ilog << std::setw(71) << std::setfill('.') << std::left << ">> Computing A(3) term" << std::endl;
        for (int idim = 0; idim < 3; ++idim)
        {
            // cyclic rotations of indices
@ -456,7 +475,7 @@ int run( config_file& the_config )
            Conv.convolve_DifferenceOfHessians(phi2, {idimp, idimpp}, phi, {idimp, idimp}, {idimpp, idimpp}, op::subtract_from(*A3[idim]));
            A3[idim]->apply_InverseLaplacian();
        }
-        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+        music::ilog << std::setw(70) << std::setfill(' ') << std::right << "took : " << std::setw(8) << get_wtime() - wtime << "s" << std::endl;
    }

    ///... scale all potentials with respective growth factors
@ -536,8 +555,11 @@ int run( config_file& the_config )
                size_t IDoffset = (this_species == cosmo_species::baryon)? ((the_output_plugin->has_64bit_ids())? 1 : 1): 0 ;

                // allocate particle structure and generate particle IDs
+                bool secondary_lattice = (this_species == cosmo_species::baryon &&
+                                        the_output_plugin->write_species_as(this_species) == output_type::particles) ? true : false;
+
                particle_lattice_generator_ptr = 
-                std::make_unique<particle::lattice_generator<Grid_FFT<real_t>>>( lattice_type, the_output_plugin->has_64bit_reals(), the_output_plugin->has_64bit_ids(), 
+                std::make_unique<particle::lattice_generator<Grid_FFT<real_t>>>( lattice_type, secondary_lattice, the_output_plugin->has_64bit_reals(), the_output_plugin->has_64bit_ids(), 
                    bDoBaryons, IDoffset, tmp, the_config );
            }

@ -545,7 +567,7 @@ int run( config_file& the_config )
            if( bDoBaryons && (the_output_plugin->write_species_as( this_species ) == output_type::particles
                || the_output_plugin->write_species_as( this_species ) == output_type::field_lagrangian) ) 
            {
-                bool shifted_lattice = (this_species == cosmo_species::baryon &&
+                bool secondary_lattice = (this_species == cosmo_species::baryon &&
                                        the_output_plugin->write_species_as(this_species) == output_type::particles) ? true : false;

                const real_t munit = the_output_plugin->mass_unit();
@ -568,7 +590,7 @@ int run( config_file& the_config )
                });
                
                if( the_output_plugin->write_species_as( this_species ) == output_type::particles ){
-                    particle_lattice_generator_ptr->set_masses( lattice_type, shifted_lattice, 1.0, the_output_plugin->has_64bit_reals(), rho, the_config );
+                    particle_lattice_generator_ptr->set_masses( lattice_type, secondary_lattice, 1.0, the_output_plugin->has_64bit_reals(), rho, the_config );
                }else if( the_output_plugin->write_species_as( this_species ) == output_type::field_lagrangian ){
                    the_output_plugin->write_grid_data( rho, this_species, fluid_component::mass );
                }
--- a/src/main.cc
+++ b/src/main.cc
@ -43,8 +43,11 @@ bool FFTW_threads_ok = false;
 int  num_threads = 1;
 }

+size_t global_mem_high_mark, local_mem_high_mark;

 #include "system_stat.hh"
+#include "memory_stat.hh"
+

 #include <exception>
 #include <stdexcept>
@ -76,6 +79,8 @@ int main( int argc, char** argv )
    music::logger::set_level(music::log_level::debug);
 #endif

+    global_mem_high_mark = local_mem_high_mark = 0;
+
    //------------------------------------------------------------------------------
    // initialise MPI 
    //------------------------------------------------------------------------------
@ -259,13 +264,25 @@ int main( int argc, char** argv )
    ic_generator::reset();
    ///////////////////////////////////////////////////////////////////////

+    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+    size_t peak_mem = memory::getPeakRSS();
+#if defined(USE_MPI)
+    size_t peak_mem_max{0};
+    MPI_Allreduce(&peak_mem, &peak_mem_max, 1, MPI_UNSIGNED_LONG_LONG, MPI_MAX, MPI_COMM_WORLD);
+    peak_mem = peak_mem_max;
+#endif
+
+    if( peak_mem > (1ull<<30) )
+        music::ilog << "Peak memory usage was " << peak_mem /(1ull<<30) << " GBytes / task" << std::endl;
+    else 
+        music::ilog << "Peak memory usage was " << peak_mem /(1ull<<20) << " MBytes / task" << std::endl;
+

 #if defined(USE_MPI)
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
 #endif
-
-    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+        
    music::ilog << "Done. Have a nice day!\n" << std::endl;

    return 0;
--- a/src/output_plugin.cc
+++ b/src/output_plugin.cc
@ -74,7 +74,7 @@ std::unique_ptr<output_plugin> select_output_plugin( config_file& cf, std::uniqu
 		music::ilog << std::setw(32) << std::left << "Output plugin" << " : " << formatname << std::endl;
 	}
 	
-	return std::move(the_output_plugin_creator->create( cf, pcc ));
+	return the_output_plugin_creator->create( cf, pcc );
 }


--- a/src/plugins/output_arepo.cc
+++ b/src/plugins/output_arepo.cc
@ -33,7 +33,7 @@ std::vector<T> from_value(const T a)
 }

 template <typename write_real_t>
-class gadget_hdf5_output_plugin : public output_plugin
+class arepo_output_plugin : public output_plugin
 {
  struct header_t
  {
@ -73,7 +73,7 @@ protected:

 public:
  //! constructor
-  explicit gadget_hdf5_output_plugin(config_file &cf, std::unique_ptr<cosmology::calculator>& pcc)
+  explicit arepo_output_plugin(config_file &cf, std::unique_ptr<cosmology::calculator>& pcc)
      : output_plugin(cf, pcc, "AREPO-HDF5")
  {
    num_files_ = 1;
@ -148,7 +148,7 @@ public:
  }

  // use destructor to write header post factum
-  ~gadget_hdf5_output_plugin()
+  ~arepo_output_plugin()
  {
    HDFCreateGroup(this_fname_, "Header");
    HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_ThisFile", from_6array<unsigned>(header_.npart));
@ -271,9 +271,9 @@ public:
 namespace
 {
 #if !defined(USE_SINGLEPRECISION)
-output_plugin_creator_concrete<gadget_hdf5_output_plugin<double>> creator1("AREPO");
+output_plugin_creator_concrete<arepo_output_plugin<double>> creator880("AREPO");
 #else
-output_plugin_creator_concrete<gadget_hdf5_output_plugin<float>> creator1("AREPO");
+output_plugin_creator_concrete<arepo_output_plugin<float>> creator881("AREPO");
 #endif
 } // namespace

--- a/src/plugins/output_gadget_hdf5.cc
+++ b/src/plugins/output_gadget_hdf5.cc
@ -2,17 +2,17 @@
 // This file is part of monofonIC (MUSIC2)
 // A software package to generate ICs for cosmological simulations
 // Copyright (C) 2020 by Oliver Hahn
-// 
+//
 // monofonIC is free software: you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation, either version 3 of the License, or
 // (at your option) any later version.
-// 
+//
 // monofonIC is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
-// 
+//
 // You should have received a copy of the GNU General Public License
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #ifdef USE_HDF5
@ -38,12 +38,14 @@ class gadget_hdf5_output_plugin : public output_plugin
  struct header_t
  {
    unsigned npart[6];
+    size_t npart64[6];
    double mass[6];
    double time;
    double redshift;
    int flag_sfr;
    int flag_feedback;
    unsigned int npartTotal[6];
+    size_t npartTotal64[6];
    int flag_cooling;
    int num_files;
    double BoxSize;
@ -62,12 +64,13 @@ protected:
  header_t header_;
  real_t lunit_, vunit_, munit_;
  bool blongids_;
+  bool bgadget2_compatibility_;
  std::string this_fname_;

 public:
  //! constructor
  explicit gadget_hdf5_output_plugin(config_file &cf, std::unique_ptr<cosmology::calculator> &pcc)
-      : output_plugin(cf, pcc, "GADGET-HDF5")
+      : output_plugin(cf, pcc, (std::string("GADGET-HDF5-")+typeid(write_real_t).name()).c_str() )
  {
    num_files_ = 1;
 #ifdef USE_MPI
@ -84,11 +87,16 @@ public:
    blongids_ = cf_.get_value_safe<bool>("output", "UseLongids", false);
    num_simultaneous_writers_ = cf_.get_value_safe<int>("output", "NumSimWriters", num_files_);

+    bgadget2_compatibility_ = cf_.get_value_safe<bool>("output", "Gadget2Compatibility", false);
+    music::ilog << std::setw(32) << std::left << "Gadget2Compatibility" << " : " << (bgadget2_compatibility_? "yes" : "no") << std::endl;
+
    for (int i = 0; i < 6; ++i)
    {
      header_.npart[i] = 0;
+      header_.npart64[i] = 0;
      header_.npartTotal[i] = 0;
      header_.npartTotalHighWord[i] = 0;
+      header_.npartTotal64[i] = 0;
      header_.mass[i] = 0.0;
    }

@ -107,31 +115,84 @@ public:
    header_.flag_entropy_instead_u = 0;
    header_.flag_doubleprecision = (typeid(write_real_t) == typeid(double)) ? true : false;

-    this_fname_ = fname_;
+    // split fname into prefix and file suffix (at last dot)
+    std::string::size_type pos = fname_.find_last_of(".");
+    std::string fname_prefix = fname_.substr(0, pos);
+    std::string fname_suffix = fname_.substr(pos + 1);
+
+    // add rank to filename if we have more than one file
+    this_fname_ = fname_prefix;
 #ifdef USE_MPI
    int thisrank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &thisrank);
    if (num_files_ > 1)
      this_fname_ += "." + std::to_string(thisrank);
 #endif
+    this_fname_ += "." + fname_suffix;

    unlink(this_fname_.c_str());
    HDFCreateFile(this_fname_);
+
+    // Write MUSIC configuration header
+    int order = cf_.get_value<int>("setup", "LPTorder");
+    std::string load = cf_.get_value<std::string>("setup", "ParticleLoad");
+    std::string tf = cf_.get_value<std::string>("cosmology", "transfer");
+    std::string cosmo_set = cf_.get_value<std::string>("cosmology", "ParameterSet");
+    std::string rng = cf_.get_value<std::string>("random", "generator");
+    int do_fixing = cf_.get_value<bool>("setup", "DoFixing");
+    int do_invert = cf_.get_value<bool>("setup", "DoInversion");
+    int do_baryons = cf_.get_value<bool>("setup", "DoBaryons");
+    int do_baryonsVrel = cf_.get_value<bool>("setup", "DoBaryonVrel");
+    int L = cf_.get_value<int>("setup", "GridRes");
+
+    HDFCreateGroup(this_fname_, "ICs_parameters");
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Code", std::string("MUSIC2 - monofonIC"));
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Git Revision", std::string(GIT_REV));
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Git Tag", std::string(GIT_TAG));
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Git Branch", std::string(GIT_BRANCH));
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Precision", std::string(CMAKE_PRECISION_STR));
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Convolutions", std::string(CMAKE_CONVOLVER_STR));
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "PLT", std::string(CMAKE_PLT_STR));
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "LPT Order", order);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Particle Load", load);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Transfer Function", tf);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Cosmology Parameter Set", cosmo_set);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Random Generator", rng);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Mode Fixing", do_fixing);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Mode inversion", do_invert);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Baryons", do_baryons);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Baryons Relative Velocity", do_baryonsVrel);
+    HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Grid Resolution", L);
+
+    if (tf == "CLASS") {
+      double ztarget = cf_.get_value<double>("cosmology", "ztarget");
+      HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Target Redshift", ztarget);
+    }
+    if (rng == "PANPHASIA") {
+      std::string desc = cf_.get_value<std::string>("random", "descriptor");
+      HDFWriteGroupAttribute(this_fname_, "ICs_parameters", "Descriptor", desc);
+    }
  }

  // use destructor to write header post factum
  ~gadget_hdf5_output_plugin()
  {
-    if (!std::uncaught_exception()) 
-    {   
+    if (!std::uncaught_exception())
+    {
      HDFCreateGroup(this_fname_, "Header");
-      HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_ThisFile", from_6array<unsigned>(header_.npart));
+      if( bgadget2_compatibility_ ){
+        HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_ThisFile", from_6array<unsigned>(header_.npart));
+        HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total", from_6array<unsigned>(header_.npartTotal));
+        HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total_HighWord", from_6array<unsigned>(header_.npartTotalHighWord));
+      }else{
+        HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_ThisFile", from_6array<size_t>(header_.npart64));
+        HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total", from_6array<size_t>(header_.npartTotal64));
+      }
      HDFWriteGroupAttribute(this_fname_, "Header", "MassTable", from_6array<double>(header_.mass));
      HDFWriteGroupAttribute(this_fname_, "Header", "Time", from_value<double>(header_.time));
      HDFWriteGroupAttribute(this_fname_, "Header", "Redshift", from_value<double>(header_.redshift));
      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Sfr", from_value<int>(header_.flag_sfr));
      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Feedback", from_value<int>(header_.flag_feedback));
-      HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total", from_6array<unsigned>(header_.npartTotal));
      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Cooling", from_value<int>(header_.flag_cooling));
      HDFWriteGroupAttribute(this_fname_, "Header", "NumFilesPerSnapshot", from_value<int>(header_.num_files));
      HDFWriteGroupAttribute(this_fname_, "Header", "BoxSize", from_value<double>(header_.BoxSize));
@ -140,10 +201,17 @@ public:
      HDFWriteGroupAttribute(this_fname_, "Header", "HubbleParam", from_value<double>(header_.HubbleParam));
      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_StellarAge", from_value<int>(header_.flag_stellarage));
      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Metals", from_value<int>(header_.flag_metals));
-      HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total_HighWord", from_6array<unsigned>(header_.npartTotalHighWord));
      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Entropy_ICs", from_value<int>(header_.flag_entropy_instead_u));

      music::ilog << "Wrote Gadget-HDF5 file(s) to " << this_fname_ << std::endl;
+
+      music::ilog << "You can use the following values in param.txt:" << std::endl;
+      music::ilog << "Omega0       " << header_.Omega0 << std::endl;
+      music::ilog << "OmegaLambda  " << header_.OmegaLambda << std::endl;
+      music::ilog << "OmegaBaryon  " << pcc_->cosmo_param_["Omega_b"] << std::endl;
+      music::ilog << "HubbleParam  " << header_.HubbleParam << std::endl;
+      music::ilog << "Hubble       100.0" <<  std::endl;
+      music::ilog << "BoxSize      " << header_.BoxSize <<  std::endl;
    }
  }

@ -189,10 +257,15 @@ public:

    assert(sid != -1);

+    // use 32 bit integers for Gadget-2 compatibility
    header_.npart[sid] = (pc.get_local_num_particles());
    header_.npartTotal[sid] = (uint32_t)(pc.get_global_num_particles());
    header_.npartTotalHighWord[sid] = (uint32_t)((pc.get_global_num_particles()) >> 32);

+    // use 64 bit integers for Gadget >2 compatibility
+    header_.npart64[sid] = pc.get_local_num_particles();
+    header_.npartTotal64[sid] = pc.get_global_num_particles();
+
    if( pc.bhas_individual_masses_ )
      header_.mass[sid] = 0.0;
    else
@ -233,10 +306,10 @@ public:

 namespace
 {
-output_plugin_creator_concrete<gadget_hdf5_output_plugin<float>> creator1("gadget_hdf5");
+output_plugin_creator_concrete<gadget_hdf5_output_plugin<float>> creator991("gadget_hdf5");
 #if !defined(USE_SINGLEPRECISION)
-output_plugin_creator_concrete<gadget_hdf5_output_plugin<double>> creator3("gadget_hdf5_double");
+output_plugin_creator_concrete<gadget_hdf5_output_plugin<double>> creator992("gadget_hdf5_double");
 #endif
 } // namespace

-#endif
+#endif
--- a/src/plugins/output_generic.cc
+++ b/src/plugins/output_generic.cc
@ -123,7 +123,7 @@ void generic_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos

 namespace
 {
-   output_plugin_creator_concrete<generic_output_plugin> creator1("generic"); 
+   output_plugin_creator_concrete<generic_output_plugin> creator001("generic"); 
 } // namespace

 #endif
--- a/src/plugins/output_genericio.cc
+++ b/src/plugins/output_genericio.cc
@ -157,7 +157,7 @@ public:

 namespace
 {
-output_plugin_creator_concrete<genericio_output_plugin> creator1("genericio");
+output_plugin_creator_concrete<genericio_output_plugin> creator101("genericio");
 } // namespace

 #endif // ENABLE_GENERICIO
--- a/src/plugins/output_grafic2.cc
+++ b/src/plugins/output_grafic2.cc
@ -212,7 +212,7 @@ void grafic2_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos
        unlink(file_name.c_str());
    }

-    std::ofstream *pofs;
+    std::ofstream *pofs = nullptr;

    // write header or seek to end of file
    if (CONFIG::MPI_task_rank == 0)
@ -323,5 +323,5 @@ void grafic2_output_plugin::write_ramses_namelist(void) const

 namespace
 {
-  output_plugin_creator_concrete<grafic2_output_plugin> creator1("grafic2");
+  output_plugin_creator_concrete<grafic2_output_plugin> creator201("grafic2");
 } // namespace
--- a/src/plugins/output_swift.cc
+++ b/src/plugins/output_swift.cc
@ -369,7 +369,7 @@ public:

 namespace
 {
-output_plugin_creator_concrete<swift_output_plugin<double>> creator1("SWIFT");
+output_plugin_creator_concrete<swift_output_plugin<double>> creator301("SWIFT");
 } // namespace

 #endif
--- a/src/plugins/random_music.cc
+++ b/src/plugins/random_music.cc
@ -180,7 +180,7 @@ void RNG_music::parse_random_parameters(void)
        //... generate some dummy seed which only depends on the level, negative so we know it's not
        //... an actual seed and thus should not be used as a constraint for coarse levels
        // rngseeds_.push_back( -abs((unsigned)(ltemp-i)%123+(unsigned)(ltemp+827342523521*i)%123456789) );
-        rngseeds_.push_back(-abs((long)(ltemp - i) % 123 + (long)(ltemp + 7342523521 * i) % 123456789));
+        rngseeds_.push_back(-std::abs((long)(ltemp - i) % 123 + (long)(ltemp + 7342523521 * i) % 123456789));
      else
      {
        if (ltemp <= 0)
@ -357,4 +357,4 @@ void RNG_music::compute_random_numbers(void)
 namespace
 {
 RNG_plugin_creator_concrete<RNG_music> creator("MUSIC1");
-}
+}
--- a/src/plugins/transfer_CLASS.cc
+++ b/src/plugins/transfer_CLASS.cc
@ -163,7 +163,7 @@ private:
    music::ilog << "Computing transfer function via ClassEngine..." << std::endl;
    double wtime = get_wtime();

-    the_ClassEngine_ = std::move(std::make_unique<ClassEngine>(pars_, false));
+    the_ClassEngine_ = std::make_unique<ClassEngine>(pars_, false);

    wtime = get_wtime() - wtime;
    music::ilog << "CLASS took " << wtime << " s." << std::endl;
--- a/src/random_plugin.cc
+++ b/src/random_plugin.cc
@ -77,5 +77,5 @@ std::unique_ptr<RNG_plugin> select_RNG_plugin(config_file &cf)
        music::ilog << std::setw(32) << std::left << "Random number generator plugin" << " : " << rngname << std::endl;
    }

-    return std::move(the_RNG_plugin_creator->Create(cf));
+    return the_RNG_plugin_creator->Create(cf);
 }
--- a/src/transfer_function_plugin.cc
+++ b/src/transfer_function_plugin.cc
@ -75,5 +75,5 @@ std::unique_ptr<TransferFunction_plugin> select_TransferFunction_plugin(config_f
        music::ilog << std::setw(32) << std::left << "Transfer function plugin" << " : " << tfname << std::endl;
    }

-    return std::move(the_TransferFunction_plugin_creator->create(cf, cosmo_param));
+    return the_TransferFunction_plugin_creator->create(cf, cosmo_param);
 }