Merge branch 'develop' into 'master'

Develop See merge request ohahn/fastlpt!3
2024-09-19 17:03:45 +02:00 · 2020-05-06 13:03:42 +02:00 · 2020-05-06 13:03:42 +02:00 · 38320d2150
commit 38320d2150
parent a58fbc9778 0937242a1b
61 changed files with 9993 additions and 1922 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,56 +1,14 @@
-build
+.DS_Store
 .vscode
-src/CMakeFiles/3.12.2/CompilerIdC/CMakeCCompilerId.c
-src/CMakeFiles/feature_tests.c
-src/CMakeFiles/feature_tests.cxx
-src/CMakeFiles/progress.marks
-src/CMakeFiles/3.12.2/CMakeCCompiler.cmake
-src/CMakeFiles/3.12.2/CMakeCXXCompiler.cmake
-src/CMakeFiles/3.12.2/CMakeDetermineCompilerABI_C.bin
-src/CMakeFiles/3.12.2/CMakeDetermineCompilerABI_CXX.bin
-src/CMakeFiles/3.12.2/CMakeSystem.cmake
-src/CMakeFiles/fastLPT.dir/build.make
-src/CMakeFiles/FindMPI/test_mpi.cpp
-src/CMakeFiles/FindMPI/test_mpi_C.bin
-src/CMakeFiles/FindMPI/test_mpi_CXX.bin
-src/CMakeFiles/FindOpenMP/OpenMPCheckVersion.c
-src/CMakeFiles/FindOpenMP/OpenMPCheckVersion.cpp
-src/CMakeFiles/FindOpenMP/OpenMPTryFlag.c
-src/CMakeFiles/FindOpenMP/OpenMPTryFlag.cpp
-src/CMakeFiles/FindOpenMP/ompver_C.bin
-src/CMakeFiles/FindOpenMP/ompver_CXX.bin
-src/CMakeFiles/fastLPT.dir/CXX.includecache
-src/CMakeFiles/fastLPT.dir/DependInfo.cmake
-src/CMakeFiles/fastLPT.dir/plugins/transfer_eisenstein.cc.o
-src/CMakeFiles/3.12.2/CompilerIdCXX/a.out
-src/CMakeFiles/fastLPT.dir/cmake_clean.cmake
-src/CMakeFiles/fastLPT.dir/depend.internal
-src/CMakeFiles/fastLPT.dir/depend.make
-src/CMakeFiles/fastLPT.dir/flags.make
-src/CMakeFiles/fastLPT.dir/grid_fft.cc.o
-src/CMakeFiles/fastLPT.dir/link.txt
-src/CMakeFiles/fastLPT.dir/logger.cc.o
-src/CMakeFiles/fastLPT.dir/main.cc.o
-src/CMakeFiles/fastLPT.dir/progress.make
-src/CMakeFiles/fastLPT.dir/random_plugin.cc.o
-src/CMakeFiles/fastLPT.dir/transfer_function_plugin.cc.o
-src/CMakeFiles/fastLPT.dir/plugins/random_music.cc.o
-src/CMakeFiles/fastLPT.dir/plugins/random_music_wnoise_generator.cc.o
-src/CMakeFiles/feature_tests.bin
-src/CMakeFiles/CMakeDirectoryInformation.cmake
-src/CMakeFiles/CMakeOutput.log
-src/CMakeFiles/Makefile.cmake
-src/CMakeFiles/Makefile2
-src/CMakeFiles/TargetDirectories.txt
-src/CMakeFiles/cmake.check_cache
-src/CMakeFiles/3.12.2/CompilerIdC/a.out
-src/CMakeFiles/3.12.2/CompilerIdCXX/CMakeCXXCompilerId.cpp
-src/CMakeFiles/hdf5/cmake_hdf5_test.c
-src/fastLPT.dSYM/Contents/Info.plist
-src/fastLPT.dSYM/Contents/Resources/DWARF/fastLPT
+build
+include/cmake_config.hh
+src/input_powerspec.txt
+CMakeCache.txt
+CMakeFiles/cmake.check_cache
+src/CMakeFiles
 src/cmake_install.cmake
 src/CMakeCache.txt
-src/fastLPT
-src/input_powerspec.txt
 src/Makefile
-.DS_Store
+external/panphasia/rand_base.mod
+external/panphasia/rand_int.mod
+external/panphasia/rand.mod
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,16 +1,42 @@
 cmake_minimum_required(VERSION 3.9)
 set(PRGNAME monofonIC)
-project(monofonIC)

+project(monofonIC C CXX)
+
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native -Wall -fno-omit-frame-pointer -g  -fsanitize=address")
+# Seed the per-configuration compiler flags exactly once per build tree.
+#
+# The Release and RelWithDebInfo values are built from the *current* cache
+# value and written back with FORCE. Doing that on every configure run would
+# append "-march=native -Wall -pedantic" again each time CMake re-runs, so the
+# cached flag strings would grow without bound. The INTERNAL sentinel below
+# makes the seeding happen only on the first configure; afterwards the cached
+# values (including any edits the user made in ccmake/cmake-gui) are kept.
+#
+# The C flags deliberately mirror the C++ flags, and the two sanitizer
+# configurations (DebugSanAdd, DebugSanUndef) extend the Debug flags.
+if(NOT MONOFONIC_BUILD_FLAGS_SEEDED)
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=native -Wall -pedantic" CACHE STRING "Flags used by the compiler during Release builds." FORCE)
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -march=native -fno-omit-frame-pointer -Wall -pedantic" CACHE STRING "Flags used by the compiler during RelWithDebInfo builds." FORCE)
+  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1 -march=native -DDEBUG -fno-omit-frame-pointer -Wall -pedantic" CACHE STRING "Flags used by the compiler during Debug builds." FORCE)
+  set(CMAKE_CXX_FLAGS_DEBUGSANADD "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address " CACHE STRING "Flags used by the compiler during Debug builds with Sanitizer for address." FORCE)
+  set(CMAKE_CXX_FLAGS_DEBUGSANUNDEF "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=undefined" CACHE STRING "Flags used by the compiler during Debug builds with Sanitizer for undefineds." FORCE)
+  set(CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "Flags used by the compiler during Release builds." FORCE)
+  set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" CACHE STRING "Flags used by the compiler during RelWithDebInfo builds." FORCE)
+  set(CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING "Flags used by the compiler during Debug builds." FORCE)
+  set(CMAKE_C_FLAGS_DEBUGSANADD "${CMAKE_CXX_FLAGS_DEBUGSANADD}" CACHE STRING "Flags used by the compiler during Debug builds with Sanitizer for address." FORCE)
+  set(CMAKE_C_FLAGS_DEBUGSANUNDEF "${CMAKE_CXX_FLAGS_DEBUGSANUNDEF}" CACHE STRING "Flags used by the compiler during Debug builds with Sanitizer for undefineds." FORCE)
+  set(MONOFONIC_BUILD_FLAGS_SEEDED TRUE CACHE INTERNAL "Per-configuration compiler flags have been seeded")
+endif()
+
+
+set(default_build_type "Release")
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
+  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "RelWithDebInfo" "DebugSanAdd" "DebugSanUndef")
+endif()
+mark_as_advanced(CMAKE_CXX_FLAGS_DEBUGSANADD CMAKE_CXX_FLAGS_DEBUGSANUNDEF)
+mark_as_advanced(CMAKE_C_FLAGS_DEBUGSANADD CMAKE_C_FLAGS_DEBUGSANUNDEF)
+mark_as_advanced(CMAKE_EXECUTABLE_FORMAT CMAKE_OSX_ARCHITECTURES CMAKE_OSX_DEPLOYMENT_TARGET CMAKE_OSX_SYSROOT)
+
+
+########################################################################################################################
 # include class submodule
 include(${CMAKE_CURRENT_SOURCE_DIR}/external/class.cmake)

-# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -march=native -Wall -fno-omit-frame-pointer -g  -fsanitize=address")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native -Wall -pedantic")
 find_package(PkgConfig REQUIRED)

-set(CMAKE_MODULE_PATH
-        "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}")
+set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}")


 ########################################################################################################################
@ -48,21 +74,70 @@ if(ENABLE_MPI)
  endif(MPI_CXX_FOUND)
 endif(ENABLE_MPI)

+########################################################################################################################
+# floating point precision
+# CODE_PRECISION selects the floating point type used for internal
+# computations and FFTs. It is a free-form STRING cache entry; the STRINGS
+# property only provides a drop-down in ccmake/cmake-gui and does not stop a
+# user from passing an arbitrary value with -DCODE_PRECISION=...
+set(CODE_PRECISION "DOUBLE"
+    CACHE STRING "Floating point type used for internal computations and FFTs")
+set_property(CACHE CODE_PRECISION PROPERTY STRINGS FLOAT DOUBLE LONGDOUBLE)
+# Reject anything other than the supported spellings up front: the
+# precision-dependent FFTW selection in this file compares against exactly
+# these strings, so an unknown value would otherwise be silently ignored there.
+if(NOT "${CODE_PRECISION}" MATCHES "^(FLOAT|DOUBLE|LONGDOUBLE)$")
+  message(FATAL_ERROR
+    "Unsupported CODE_PRECISION '${CODE_PRECISION}': must be one of FLOAT, DOUBLE, LONGDOUBLE")
+endif()

+########################################################################################################################
+# convolver type, right now only orszag or naive
+# CONVOLVER_TYPE is a STRING cache entry (default ORSZAG); the STRINGS
+# property lists the two offered choices for ccmake/cmake-gui.
+set(CONVOLVER_TYPE "ORSZAG" CACHE STRING
+    "Convolution algorithm to be used (Naive=no dealiasing, Orszag=dealiased)")
+set_property(CACHE CONVOLVER_TYPE PROPERTY STRINGS ORSZAG NAIVE)
+
+########################################################################################################################
+# PLT options, right now only on/off
+option(ENABLE_PLT "Enable PLT (particle linear theory) corrections" OFF)
+
+
+########################################################################################################################
 # FFTW
-cmake_policy(SET CMP0074 NEW)
+if(POLICY CMP0074)
+    cmake_policy(SET CMP0074 NEW)
+endif()
 if(ENABLE_MPI)
-  find_package(FFTW3 COMPONENTS SINGLE DOUBLE OPENMP THREADS MPI)
+  find_package(FFTW3 COMPONENTS SINGLE DOUBLE LONGDOUBLE OPENMP THREADS MPI)
 else()
-  find_package(FFTW3 COMPONENTS SINGLE DOUBLE OPENMP THREADS)
+  find_package(FFTW3 COMPONENTS SINGLE DOUBLE LONGDOUBLE OPENMP THREADS)
 endif(ENABLE_MPI)
+mark_as_advanced(FFTW3_SINGLE_MPI_LIBRARY FFTW3_SINGLE_OPENMP_LIBRARY FFTW3_SINGLE_SERIAL_LIBRARY FFTW3_SINGLE_THREADS_LIBRARY)
+mark_as_advanced(FFTW3_DOUBLE_MPI_LIBRARY FFTW3_DOUBLE_OPENMP_LIBRARY FFTW3_DOUBLE_SERIAL_LIBRARY FFTW3_DOUBLE_THREADS_LIBRARY)
+mark_as_advanced(FFTW3_LONGDOUBLE_MPI_LIBRARY FFTW3_LONGDOUBLE_OPENMP_LIBRARY FFTW3_LONGDOUBLE_SERIAL_LIBRARY FFTW3_LONGDOUBLE_THREADS_LIBRARY)
+mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_MPI_INCLUDE_DIR)
+mark_as_advanced(pkgcfg_lib_PC_FFTW_fftw3)

+########################################################################################################################
 # GSL
 find_package(GSL REQUIRED)
+mark_as_advanced(pkgcfg_lib_GSL_gsl pkgcfg_lib_GSL_gslcblas pkgcfg_lib_GSL_m)

+########################################################################################################################
 # HDF5
 find_package(HDF5 REQUIRED)
+mark_as_advanced(HDF5_C_LIBRARY_dl HDF5_C_LIBRARY_hdf5 HDF5_C_LIBRARY_m HDF5_C_LIBRARY_pthread HDF5_C_LIBRARY_z HDF5_C_LIBRARY_sz)

+########################################################################################################################
+# PANPHASIA
+# Optional PANPHASIA support (ON by default). Enabling it turns on the
+# Fortran language and adds compiler-specific flags for 132-column
+# fixed-form lines and "implicit none" to the Fortran flags.
+option(ENABLE_PANPHASIA "Enable PANPHASIA random number generator" ON)
+if(ENABLE_PANPHASIA)
+  enable_language(Fortran)
+  if("${CMAKE_Fortran_COMPILER_ID}" MATCHES "Intel")
+    string(APPEND CMAKE_Fortran_FLAGS " -132 -implicit-none")
+  elseif("${CMAKE_Fortran_COMPILER_ID}" MATCHES "GNU")
+    string(APPEND CMAKE_Fortran_FLAGS " -ffixed-line-length-132 -fimplicit-none")
+  endif()
+endif()
 ########################################################################################################################
 # INCLUDES
 include_directories(${PROJECT_SOURCE_DIR}/include)
@ -81,28 +156,68 @@ file( GLOB PLUGINS
  ${PROJECT_SOURCE_DIR}/src/plugins/*.cc
 )

+if(ENABLE_PANPHASIA)
+list (APPEND SOURCES 
+  ${PROJECT_SOURCE_DIR}/external/panphasia/panphasia_routines.f
+  ${PROJECT_SOURCE_DIR}/external/panphasia/generic_lecuyer.f90
+)
+endif()
+
+# project configuration header
+configure_file(
+  ${PROJECT_SOURCE_DIR}/include/cmake_config.hh.in
+  ${PROJECT_SOURCE_DIR}/include/cmake_config.hh
+)
+
 add_executable(${PRGNAME} ${SOURCES} ${PLUGINS})
 target_setup_class(${PRGNAME})

-set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 17)
+set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 14)
+

 # mpi flags
 if(MPI_CXX_FOUND)
-  if(FFTW3_DOUBLE_MPI_FOUND)
-    target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_MPI_LIBRARY})
-    target_include_directories(${PRGNAME} PRIVATE ${FFTW3_INCLUDE_DIR_PARALLEL})
-    target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_MPI")
-  endif(FFTW3_DOUBLE_MPI_FOUND)
+  if(CODE_PRECISION STREQUAL "FLOAT")
+    if(FFTW3_SINGLE_MPI_FOUND)
+      target_link_libraries(${PRGNAME} ${FFTW3_SINGLE_MPI_LIBRARY})
+      target_include_directories(${PRGNAME} PRIVATE ${FFTW3_INCLUDE_DIR_PARALLEL})
+      target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_MPI")
+    else()
+      message(SEND_ERROR "MPI enabled but FFTW3 library not found with MPI support for single precision!")
+    endif()
+  elseif(CODE_PRECISION STREQUAL "DOUBLE")
+    if(FFTW3_DOUBLE_MPI_FOUND)
+      target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_MPI_LIBRARY})
+      target_include_directories(${PRGNAME} PRIVATE ${FFTW3_INCLUDE_DIR_PARALLEL})
+      target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_MPI")
+    else()
+      message(SEND_ERROR "MPI enabled but FFTW3 library not found with MPI support for double precision!")
+    endif()
+  elseif(CODE_PRECISION STREQUAL "LONGDOUBLE")
+    if(FFTW3_LONGDOUBLE_MPI_FOUND)
+      target_link_libraries(${PRGNAME} ${FFTW3_LONGDOUBLE_MPI_LIBRARY})
+      target_include_directories(${PRGNAME} PRIVATE ${FFTW3_INCLUDE_DIR_PARALLEL})
+      target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_MPI")
+    else()
+      message(SEND_ERROR "MPI enabled but FFTW3 library not found with MPI support for long double precision!")
+    endif()
+  endif()

  target_include_directories(${PRGNAME} PRIVATE ${MPI_CXX_INCLUDE_PATH})
  target_compile_options(${PRGNAME} PRIVATE "-DUSE_MPI")
  target_link_libraries(${PRGNAME} ${MPI_LIBRARIES})
 endif(MPI_CXX_FOUND)

-if(FFTW3_DOUBLE_THREADS_FOUND) 
+if(CODE_PRECISION STREQUAL "FLOAT" AND FFTW3_SINGLE_THREADS_FOUND) 
+  target_link_libraries(${PRGNAME} ${FFTW3_SINGLE_THREADS_LIBRARY})
+  target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS")
+elseif(CODE_PRECISION STREQUAL "DOUBLE" AND FFTW3_DOUBLE_THREADS_FOUND) 
  target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_THREADS_LIBRARY})
  target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS")
-endif(FFTW3_DOUBLE_THREADS_FOUND)
+elseif(CODE_PRECISION STREQUAL "LONGDOUBLE" AND FFTW3_LONGDOUBLE_THREADS_FOUND) 
+  target_link_libraries(${PRGNAME} ${FFTW3_LONGDOUBLE_THREADS_LIBRARY})
+  target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS")
+endif()

 if(HDF5_FOUND)
  # target_link_libraries(${PRGNAME} ${HDF5_C_LIBRARY_DIRS})
@ -111,6 +226,10 @@ if(HDF5_FOUND)
  target_compile_options(${PRGNAME} PRIVATE "-DUSE_HDF5")
 endif(HDF5_FOUND)

+if(ENABLE_PANPHASIA)
+target_compile_options(${PRGNAME} PRIVATE "-DUSE_PANPHASIA")
+endif(ENABLE_PANPHASIA)
+
 target_link_libraries(${PRGNAME} ${FFTW3_LIBRARIES})
 target_include_directories(${PRGNAME} PRIVATE ${FFTW3_INCLUDE_DIRS})

--- a/README.md
+++ b/README.md
@ -5,7 +5,7 @@ High order LPT/QPT tool for single resolution simulations
 ## Build Instructions
 Clone code including submodules (currently only CLASS is used as a submodule):

-    git clone --recurse-submodules https://ohahn@bitbucket.org/ohahn/monofonic.git
+    git clone --recurse-submodules https://<username>@bitbucket.org/ohahn/monofonic.git


 Create build directory, configure, and build:
@ -17,4 +17,30 @@ Create build directory, configure, and build:
    make

 this should create an executable in the build directory. 
-There is an example parameter file 'example.conf' in the main directory
+
+If you run into problems with CMake not being able to find your local FFTW3 or HDF5 installation, it is best to give the path directly as
+
+    FFTW3_ROOT=<path> HDF5_ROOT=<path> ccmake ..
+
+Make sure to delete the files previously generated by CMake before reconfiguring like this.
+
+If you want to build on macOS, then it is strongly recommended to use GNU (or Intel) compilers instead of Apple's Clang. Install them e.g. 
+via homebrew and then configure cmake to use them instead of the macOS default compiler via
+
+    CC=gcc-9 CXX=g++-9 ccmake ..
+    
+This is necessary since Apple's compilers haven't supported OpenMP for years.
+
+## Running
+
+There is an example parameter file 'example.conf' in the main directory; the possible options are explained in it. Pass it to the
+executable as its single argument, e.g. from within the build directory:
+
+     ./monofonIC ../example.conf
+
+If you want to run with MPI, you need to enable MPI support via ccmake. Then you can launch in hybrid MPI+threads mode by 
+specifying the desired number of threads per task in the config file, and the number of tasks to be launched via
+
+     mpirun -np 16 ./monofonIC <path to config file>
+     
+It will then run with 16 tasks times the number of threads per task specified in the config file.
--- a/example.conf
+++ b/example.conf
@ -1,58 +1,71 @@
 [setup]
 # number of grid cells per linear dimension for calculations = particles for sc initial load
-GridRes      = 128
+GridRes         = 128
 # length of the box in Mpc/h
-BoxLength    = 250
+BoxLength       = 125
 # starting redshift
-zstart       = 49.0
+zstart          = 49.0
 # order of the LPT to be used (1,2 or 3)
-LPTorder     = 3
+LPTorder        = 1
 # also do baryon ICs?
-DoBaryons    = no
+DoBaryons       = no
 # do mode fixing à la Angulo&Pontzen
-DoFixing     = no
+DoFixing        = yes
 # particle load, can be 'sc' (1x), 'bcc' (2x) or 'fcc' (4x) (increases number of particles by factor!)
-ParticleLoad = sc
+ParticleLoad    = sc
+# Add a possible constraint field here:
+#ConstraintFieldFile = initial_conditions.h5
+#ConstraintFieldName = ic_white_noise
+
+[cosmology]
+transfer        = CLASS
+ztarget         = 2.5
+# transfer        = eisenstein
+# transfer        = file_CAMB
+# transfer_file   = wmap5_transfer_out_z0.dat
+Omega_m         = 0.302
+Omega_b         = 0.045
+Omega_L         = 0.698
+H0              = 70.3
+sigma_8         = 0.811
+nspec           = 0.961
+
+# anisotropic large scale tidal field
+# LSS_aniso_lx    = +0.1
+# LSS_aniso_ly    = +0.1
+# LSS_aniso_lz    = -0.2
+
+[random]
+generator       = NGENIC
+seed            = 9001

 [testing]
 # enables diagnostic output
 # can be 'none' (default), 'potentials_and_densities', 'velocity_displacement_symmetries', or 'convergence'
-test = convergence
+test            = none

 [execution]
-NumThreads   = 4
+NumThreads      = 8

 [output]
-fname_hdf5   = output_sch.hdf5
-fbase_analysis = output
+fname_hdf5      = output_sch.hdf5
+fbase_analysis  = output

-format       = gadget2
-filename     = ics_gadget.dat
+# format          = gadget2
+# filename        = ics_gadget.dat
+# UseLongids      = false

-#format       = generic
-#filename     = debug.hdf5
-#generic_out_eulerian = yes
+format          = gadget_hdf5
+filename        = ics_gadget.hdf5

-#format	       = grafic2
-#filename       = ics_ramses
-#grafic_use_SPT = yes
+# format          = AREPO
+# filename        = ics_arepo.hdf5

-[random]
-generator    = NGENIC
-seed         = 9001
+# format          = generic
+# filename        = debug.hdf5
+# generic_out_eulerian = yes

-[cosmology]
-#transfer     = CLASS 
-transfer     = eisenstein
-Omega_m      = 0.302
-Omega_b      = 0.045
-Omega_L      = 0.698
-H0           = 70.3
-sigma_8      = 0.811
-nspec        = 0.961
-
-# anisotropic large scale tidal field
-#LSS_aniso_lx = 0.1
-#LSS_aniso_ly = 0.1
-#LSS_aniso_lz = -0.2
+# format	        = grafic2
+# filename        = ics_ramses
+# grafic_use_SPT  = yes

--- a/example_testing.conf
+++ b/example_testing.conf
@ -0,0 +1,33 @@
+[setup]
+GridRes      = 256
+BoxLength    = 6.28318530718
+zstart       = 0.0
+LPTorder     = 1
+SymplecticPT = no
+DoFixing     = no
+
+[execution]
+NumThreads   = 4
+
+[output]
+fname_hdf5   = output.hdf5
+fbase_analysis = output
+#format       = gadget2
+#filename     = ics_gadget.dat
+format	     = generic
+filename     = debug.hdf5
+generic_out_eulerian = yes
+
+[random]
+generator    = NGENIC
+seed         = 9001
+
+[cosmology]
+#transfer     = CLASS 
+transfer     = eisenstein
+Omega_m      = 1.0
+Omega_b      = 0.045
+Omega_L      = 0.0
+H0           = 70.3
+sigma_8      = 0.811
+nspec        = 0.961
--- a/external/class
+++ b/external/class
@ -1 +1 @@
-Subproject commit b34d7f6c2b72eab3a347c28e62298d62ca9dd69b
+Subproject commit 6adecae2f30172a94e003155090791abf509d995
--- a/external/class.cmake
+++ b/external/class.cmake
@ -32,6 +32,7 @@ if(ENABLE_CLASS)
      ${CMAKE_CURRENT_LIST_DIR}/class/build/history.o
      ${CMAKE_CURRENT_LIST_DIR}/class/build/hydrogen.o
      ${CMAKE_CURRENT_LIST_DIR}/class/build/hyperspherical.o
+      ${CMAKE_CURRENT_LIST_DIR}/class/tools/trigonometric_integrals.o
      ${CMAKE_CURRENT_LIST_DIR}/class/build/hyrectools.o
      ${CMAKE_CURRENT_LIST_DIR}/class/build/input.o
      ${CMAKE_CURRENT_LIST_DIR}/class/build/lensing.o
@ -78,6 +79,7 @@ if(ENABLE_CLASS)
      ${CMAKE_CURRENT_LIST_DIR}/class/tools/parser.c
      ${CMAKE_CURRENT_LIST_DIR}/class/tools/quadrature.c
      ${CMAKE_CURRENT_LIST_DIR}/class/tools/hyperspherical.c
+      ${CMAKE_CURRENT_LIST_DIR}/class/tools/trigonometric_integrals.c
      ${CMAKE_CURRENT_LIST_DIR}/class/tools/common.c
      ${CMAKE_CURRENT_LIST_DIR}/class/source/input.c
      ${CMAKE_CURRENT_LIST_DIR}/class/source/background.c
@ -131,9 +133,9 @@ macro(target_setup_class target_name)
  endif(ENABLE_CLASS)
 endmacro(target_setup_class)

-if(ENABLE_CLASS)
-  # test executable
-  add_executable(testTk
-    ${CMAKE_CURRENT_LIST_DIR}/class/cpp/testTk.cc)
-  target_setup_class(testTk)
-endif(ENABLE_CLASS)
+# if(ENABLE_CLASS)
+#   # test executable
+#   add_executable(testTk
+#     ${CMAKE_CURRENT_LIST_DIR}/class/cpp/testTk.cc)
+#   target_setup_class(testTk)
+# endif(ENABLE_CLASS)
--- a/external/fftwpp
+++ b/external/fftwpp
@ -0,0 +1 @@
+Subproject commit ec6b82cc1122ba029a7a7142cf836014e992e68c
--- a/external/panphasia/generic_lecuyer.f90
+++ b/external/panphasia/generic_lecuyer.f90
@ -0,0 +1,683 @@
+!=====================================================================================c
+!        
+! The code below was written by: Stephen Booth
+!                                Edinburgh Parallel Computing Centre
+!                                The University of Edinburgh
+!                                JCMB
+!                                Mayfield Road
+!                                Edinburgh EH9 3JZ
+!                                United Kingdom
+!
+! This file is part of the software made public in
+! Jenkins and Booth 2013  - arXiv:1306.XXXX
+!
+! The software computes the Panphasia Gaussian white noise field
+! realisation described in detail in Jenkins 2013 - arXiv:1306.XXXX
+! 
+!
+!
+! This software is free, subject to a agreeing licence conditions:
+!
+!
+! (i)  you will publish the phase descriptors and reference Jenkins (13) 
+!      for any new simulations that use Panphasia phases. You will pass on this 
+!      condition to others for any software or data you make available publically 
+!      or privately that makes use of Panphasia. 
+!
+! (ii) that you will ensure any publications using results derived from Panphasia 
+!      will be submitted as a final version to arXiv prior to or coincident with
+!      publication in a journal. 
+!
+!
+! (iii) that you report any bugs in this software as soon as confirmed to 
+!       A.R.Jenkins@durham.ac.uk 
+!
+! (iv)  that you understand that this software comes with no warranty and that is 
+!       your responsibility to ensure that it is suitable for the purpose that 
+!       you intend. 
+!
+!=====================================================================================c
+!{{{Rand_base (define kind types) 
+MODULE Rand_base
+! Declares the integer kind parameters (Sint, Dint, Ctype) used by the
+! random number modules that USE this one. It contains no procedures.
+!
+! The kinds may have to be edited to match the target machine.
+! SELECTED_INT_KIND(r) asks for a kind by decimal range (-10^r < n < 10^r);
+! what is really wanted here is a kind selected by a power of 2, which
+! (according to the original author) could perhaps be done with a PURE
+! function in Fortran 95.
+
+!
+! Sint: "short" integer kind. 10 decimal digits would be needed to be
+! guaranteed to hold 2^31; the active declaration requests 9 digits.
+! NOTE(review): presumably relies on the compiler mapping this request to a
+! 32-bit kind -- confirm on the target machine.
+!
+
+   INTEGER, PARAMETER :: Sint = SELECTED_INT_KIND(9)
+!  INTEGER, PARAMETER :: Sint = SELECTED_INT_KIND(10)
+!  INTEGER, PARAMETER :: Sint = 4
+
+!
+! Dint: "double" integer kind. 18-19 decimal digits will hold 2^63,
+! but representing all 19 digit numbers requires 2^65 :-(
+! The active declaration requests 17 digits.
+! NOTE(review): presumably relies on the compiler mapping this request to a
+! 64-bit kind -- confirm on the target machine.
+!
+
+   INTEGER, PARAMETER :: Dint = SELECTED_INT_KIND(17)
+!  INTEGER, PARAMETER :: Dint = SELECTED_INT_KIND(18)
+!  INTEGER, PARAMETER :: Dint = 8
+
+! Ctype: kind for index counters; it must be able to hold Nstore
+  INTEGER, PARAMETER :: Ctype = SELECTED_INT_KIND(3)
+END MODULE Rand_base
+!}}}
+
+!{{{Rand_int (random integers mod 2^31-1) 
+
+MODULE Rand_int
+  USE Rand_base
+  IMPLICIT NONE
+! The general approach of this module is to have
+! two types, Sint and Dint:
+! 
+! Sint should have at least 31 bits
+! Dint should have at least 63 bits
+
+!{{{constants
+
+  INTEGER(KIND=Ctype), PARAMETER :: Nstate=5_Ctype
+  INTEGER(KIND=Ctype), PRIVATE, PARAMETER :: Nbatch=128_Ctype
+  INTEGER(KIND=Ctype), PRIVATE, PARAMETER :: Nstore=Nstate+Nbatch
+
+  INTEGER(KIND=Sint), PRIVATE, PARAMETER  :: M = 2147483647_Sint
+  INTEGER(KIND=Dint), PRIVATE, PARAMETER  :: Mask = 2147483647_Dint
+  INTEGER(KIND=Dint), PRIVATE, PARAMETER  :: A1 = 107374182_Dint
+  INTEGER(KIND=Dint), PRIVATE, PARAMETER  :: A5 = 104480_Dint
+  LOGICAL, PARAMETER :: Can_step_int=.TRUE.
+  LOGICAL, PARAMETER :: Can_reverse_int=.TRUE.
+
+!}}}
+
+!{{{Types
+!
+! This type holds the state of the generator
+!
+!{{{TYPE RAND_state
+
+TYPE RAND_state
+  PRIVATE
+  INTEGER(KIND=Sint) :: state(Nstore) 
+! do we need to re-fill state table this is reset when we initialise state.
+  LOGICAL :: need_fill 
+! position of the next state variable to output
+  INTEGER(KIND=Ctype) :: pos
+END TYPE RAND_state
+
+!}}}
+
+!
+! This type defines the offset type used for stepping.
+!
+!{{{TYPE RAND_offset
+
+TYPE RAND_offset
+  PRIVATE
+  INTEGER(KIND=Sint) :: poly(Nstate)
+END TYPE RAND_offset
+
+!}}}
+
+!}}}
+
+!{{{interface and overloads
+!
+! Allow automatic conversion between integers and offsets
+!
+INTERFACE ASSIGNMENT(=)
+  MODULE PROCEDURE Rand_set_offset
+  MODULE PROCEDURE Rand_load
+  MODULE PROCEDURE Rand_save
+  MODULE PROCEDURE Rand_seed
+END INTERFACE
+INTERFACE OPERATOR(+)
+  MODULE PROCEDURE Rand_add_offset
+END INTERFACE
+INTERFACE OPERATOR(*)
+  MODULE PROCEDURE Rand_mul_offset
+END INTERFACE
+
+!
+! overload + as the boost/stepping operator
+!
+INTERFACE OPERATOR(+)
+  MODULE PROCEDURE Rand_step
+  MODULE PROCEDURE Rand_boost
+END INTERFACE
+!}}}
+
+
+!{{{PUBLIC/PRIVATE 
+  PRIVATE reduce,mod_saxpy,mod_sdot,p_saxpy,p_sdot,poly_mult
+  PRIVATE poly_square, poly_power
+  PRIVATE fill_state, repack_state
+
+  PUBLIC Rand_sint, Rand_sint_vec
+
+  PUBLIC Rand_save, Rand_load
+  PUBLIC Rand_set_offset, Rand_add_offset, Rand_mul_offset
+  PUBLIC Rand_step, Rand_boost, Rand_seed
+!}}}
+
+CONTAINS
+  !{{{Internals
+  !{{{RECURSIVE FUNCTION reduce(A)
+  RECURSIVE FUNCTION reduce(A) RESULT(residue)
+  !
+  ! Take a Dint value A and fold it down to an Sint value MOD M.
+  !
+  ! M and Mask are both 2^31-1, so 2^31 is congruent to 1 (mod M): the bits
+  ! above the low 31 can be shifted down and added back onto the low 31 bits
+  ! without changing the residue. This is repeated until nothing is left
+  ! above the low 31 bits, followed by one conditional subtraction of M.
+  !
+   INTEGER(KIND=Dint), INTENT(IN) :: A
+   INTEGER(KIND=Sint) :: residue
+   INTEGER(KIND=Dint) :: acc, carry
+  
+    acc = A
+    DO
+      carry = ISHFT(acc, -31)
+      IF( carry .LE. 0 ) EXIT
+      acc = IAND(acc,Mask) + carry
+    END DO
+    ! acc now fits in 31 bits; map a value of M (or above) back into range
+    IF( acc .GE. M ) acc = acc - M
+    residue = acc
+  END FUNCTION reduce
+  !}}}
+  !{{{RECURSIVE SUBROUTINE fill_state(x)
+  RECURSIVE SUBROUTINE fill_state(x)
+  !
+  ! Fill entries Nstate+1..Nstore of x%state from the recurrence
+  !   state(i) = ( A5*state(i-5) + A1*state(i-1) ) MOD M
+  ! and clear x%need_fill.
+  !
+  ! The modular reduction is written out in place instead of calling
+  ! reduce(); it performs the same folding steps as that function.
+  !
+  TYPE(RAND_state), INTENT(INOUT) ::  x
+  INTEGER(KIND=Ctype) :: slot
+  INTEGER(KIND=Dint)  :: acc, carry
+  INTRINSIC IAND, ISHFT
+    DO slot=Nstate+1,Nstore
+      ! A1 and A5 are Dint, so the products are evaluated in Dint
+      acc = (x%state(slot-1)*A1) + (x%state(slot-5)*A5)
+      ! fold everything above the low 31 bits back in (2^31 = 1 mod M)
+      DO
+        carry = ISHFT(acc, -31)
+        IF( carry .LE. 0 ) EXIT
+        acc = IAND(acc,Mask) + carry
+      END DO
+      IF( acc .GE. M ) acc = acc - M
+      x%state(slot) = acc
+    END DO
+    x%need_fill = .FALSE.
+  END SUBROUTINE fill_state
+  !}}}
+  !{{{RECURSIVE SUBROUTINE repack_state(x)
+  RECURSIVE SUBROUTINE repack_state(x)
+  !
+  ! Move the Nstate entries that end just before x%pos to the front of
+  ! x%state, reset x%pos to Nstate+1 and flag the table as needing a refill.
+  !
+  TYPE(RAND_state), INTENT(INOUT) ::  x
+  INTEGER(KIND=Ctype) :: shift
+    ! distance between the current window and the front of the table
+    shift = x%pos - (Nstate+1)
+    ! array assignment evaluates the right-hand side before storing, so an
+    ! overlapping source and destination is handled correctly
+    x%state(1:Nstate) = x%state(1+shift:Nstate+shift)
+    x%need_fill = .TRUE.
+    x%pos = Nstate + 1
+  END SUBROUTINE repack_state
+  !}}}
+  !{{{RECURSIVE SUBROUTINE mod_saxpy(y,a,x)
+  RECURSIVE SUBROUTINE mod_saxpy(y,a,x)
+   !
+   ! Element-wise y(k) = ( y(k) + a*x(k) ) MOD M for k = 1..Nstate.
+   ! y is updated in place; nothing is done when a is zero.
+   !
+   INTEGER(KIND=Sint) y(Nstate)
+   INTEGER(KIND=Sint) a
+   INTEGER(KIND=Sint) x(Nstate)
+   INTEGER(KIND=Ctype) k
+   INTEGER(KIND=Dint) factor
+  
+     IF( a .NE. 0_Sint ) THEN
+       ! All operands are widened to Dint before the multiply-add so that
+       ! the intermediate expression does not overflow
+       factor = a
+       DO k=1,Nstate
+         y(k) = reduce(INT(y(k),KIND=Dint) + factor * INT(x(k),KIND=Dint))
+       END DO
+     END IF
+  
+  END SUBROUTINE mod_saxpy
+  !}}}
+  !{{{RECURSIVE SUBROUTINE mod_sdot(res,x,y)
+  RECURSIVE SUBROUTINE mod_sdot(res,x,y)
+  !
+  ! res = ( sum over k of x(k)*y(k) ) MOD M, with the running sum reduced
+  ! after every term. Each term is formed in Dint to avoid overflow.
+  !
+  INTEGER(KIND=Sint), INTENT(OUT) :: res
+  INTEGER(KIND=Sint), INTENT(IN) :: x(Nstate) , y(Nstate)
+  INTEGER(KIND=Sint) :: partial
+  INTEGER(KIND=Ctype) :: k
+  
+    partial = 0
+    DO k=1,Nstate
+      partial = reduce(INT(partial,KIND=Dint) + INT(x(k),KIND=Dint) * INT(y(k),KIND=Dint))
+    END DO
+    res = partial
+  END SUBROUTINE mod_sdot
+  !}}}
+  !{{{RECURSIVE SUBROUTINE p_saxpy(y,a)
+  RECURSIVE SUBROUTINE p_saxpy(y,a)
+   !
+   ! Calculates mod_saxpy(y,a,P), where P has only two non-zero entries:
+   ! A5 in position 1 and A1 in position 5. Only y(1) and y(5) change.
+   !
+   INTEGER(KIND=Sint), INTENT(INOUT) :: y(Nstate)
+   INTEGER(KIND=Sint), INTENT(IN) :: a
+   INTEGER(KIND=Dint) :: factor
+     ! widen to Dint so the products below cannot overflow
+     factor = a
+     y(1) = reduce(INT(y(1),KIND=Dint) + factor*A5)
+     y(5) = reduce(INT(y(5),KIND=Dint) + factor*A1)
+  
+  END SUBROUTINE p_saxpy
+  !}}}
+  !{{{RECURSIVE SUBROUTINE p_sdot(res,x)
+  RECURSIVE SUBROUTINE p_sdot(res,x)
+  !
+  ! res = ( A5*x(1) + A1*x(5) ) MOD M, i.e. the dot product of x with the
+  ! two-term vector that holds A5 in position 1 and A1 in position 5.
+  ! The products are formed in Dint before being reduced.
+  !
+  INTEGER(KIND=Sint), INTENT(OUT) :: res
+  INTEGER(KIND=Sint), INTENT(IN) :: x(Nstate)
+    res = reduce(A5*INT(x(1),KIND=Dint) + A1*INT(x(5),KIND=Dint))
+  END SUBROUTINE p_sdot
+  !}}}
+  !{{{RECURSIVE SUBROUTINE poly_mult(a,b)
+  RECURSIVE SUBROUTINE poly_mult(a,b)
+    !
+    ! a = a*b, treating a and b as Nstate-coefficient polynomials with
+    ! coefficients MOD M. The raw product has 2*Nstate-1 coefficients; the
+    ! upper Nstate-1 of them are folded back into the lower ones with
+    ! p_saxpy, highest coefficient first.
+    !
+    INTEGER(KIND=Sint), INTENT(INOUT) :: a(Nstate)
+    INTEGER(KIND=Sint), INTENT(IN) :: b(Nstate)
+    INTEGER(KIND=Sint) :: work((2*Nstate) - 1)
+    INTEGER(KIND=Ctype) :: lo
+  
+    work = 0_Sint
+  
+    ! schoolbook product: add a(lo)*b shifted by lo-1 places
+    DO lo=1,Nstate
+      CALL mod_saxpy(work(lo:lo+Nstate-1), a(lo), b)
+    END DO
+    ! fold coefficient lo+Nstate into the Nstate coefficients below it
+    DO lo=Nstate-1, 1, -1
+      CALL p_saxpy(work(lo:lo+Nstate-1), work(lo+Nstate))
+    END DO
+    a = work(1:Nstate)
+  END SUBROUTINE poly_mult
+  !}}}
+  !{{{RECURSIVE SUBROUTINE poly_square(a)
+  RECURSIVE SUBROUTINE poly_square(a)
+    !
+    ! a = a*a, using the same multiply-then-fold scheme as poly_mult but
+    ! with a single polynomial as both factors.
+    !
+    INTEGER(KIND=Sint), INTENT(INOUT) :: a(Nstate)
+    INTEGER(KIND=Sint) :: work((2*Nstate) - 1)
+    INTEGER(KIND=Ctype) :: lo
+  
+    work = 0_Sint
+  
+    ! schoolbook product: add a(lo)*a shifted by lo-1 places
+    DO lo=1,Nstate
+      CALL mod_saxpy(work(lo:lo+Nstate-1), a(lo), a)
+    END DO
+    ! fold coefficient lo+Nstate into the Nstate coefficients below it
+    DO lo=Nstate-1, 1, -1
+      CALL p_saxpy(work(lo:lo+Nstate-1), work(lo+Nstate))
+    END DO
+    a = work(1:Nstate)
+  END SUBROUTINE poly_square
+  !}}}
+  !{{{RECURSIVE SUBROUTINE poly_power(poly,n)
+  RECURSIVE SUBROUTINE poly_power(poly,n)
+   !
+   ! Replace poly by poly**n using binary (square-and-multiply)
+   ! exponentiation.
+   !
+   !   n > 0 : poly**n
+   !   n = 0 : the unit polynomial (1 in position 1, 0 elsewhere)
+   !   n < 0 : all coefficients are set to zero
+   !
+   INTEGER(KIND=Sint), INTENT(INOUT) :: poly(Nstate)
+   INTEGER, INTENT(IN) :: n
+   INTEGER :: remaining
+   INTEGER(KIND=Sint) :: base(Nstate), acc(Nstate)
+  
+   IF( n .LT. 0 )THEN
+     poly = 0_Sint
+     RETURN
+   END IF
+  
+   ! acc starts as the unit polynomial; for n = 0 the loop body never runs
+   ! and the unit polynomial is returned unchanged
+   acc = 0_Sint
+   acc(1) = 1_Sint
+   base = poly
+   remaining = n
+   DO
+     IF( remaining .LE. 0 ) EXIT
+     ! multiply in the current power of base when the low bit is set
+     IF( MOD(remaining,2) .EQ. 1 ) CALL poly_mult(acc,base)
+     remaining = remaining/2
+     ! only square when another bit is still to be processed
+     IF( remaining .GT. 0 ) CALL poly_square(base)
+   END DO
+   poly = acc
+  
+  END SUBROUTINE poly_power
+  !}}}
+  !}}}
+
+  !{{{RECURSIVE SUBROUTINE  Rand_seed( state, n )
+  RECURSIVE SUBROUTINE  Rand_seed( state, n )
+    TYPE(Rand_state), INTENT(OUT) :: state
+    INTEGER, INTENT(IN) :: n
+    ! Initialise the generator using a single integer:
+    ! first initialise to an arbitrary (fixed) state, then boost by a
+    ! multiple of a long distance.
+    !
+    ! Original author's notes (not verifiable from this routine alone):
+    ! state is moved forward by P^n steps
+    ! we want this to be ok for separating parallel sequences on MPP machines
+    ! P is taken as a prime number as this should prevent strong correlations
+    ! when the generators are operated in tight lockstep.
+    ! equivalent points on different processors will also be related by a
+    ! primitive polynomial
+    ! P is 2^48-59
+    TYPE(Rand_offset), PARAMETER ::  P = &
+         Rand_offset( (/ 1509238949_Sint ,2146167999_Sint ,1539340803_Sint , &
+                     1041407428_Sint ,666274987_Sint /) )
+    TYPE(Rand_state) :: origin
+    TYPE(Rand_offset) :: jump
+  
+    ! fixed starting state shared by every seed
+    CALL Rand_load( origin, (/ 5, 4, 3, 2, 1 /) )
+    ! offset P applied n times, then jump the starting state by it
+    jump = Rand_mul_offset(P, n)
+    state = Rand_boost( origin, jump )
+  
+  END SUBROUTINE Rand_seed
+  !}}}
+  !{{{RECURSIVE SUBROUTINE Rand_load( state, input )
+  RECURSIVE SUBROUTINE Rand_load( state, input )
+  !
+  ! Build a generator state from Nstate default integers: each input is
+  ! converted to Sint and taken MOD M, the rest of the table is zeroed, and
+  ! the state is marked as needing a fill with pos just past the seed values.
+  !
+  TYPE(RAND_state), INTENT(OUT) :: state
+  INTEGER, INTENT(IN) :: input(Nstate)
+  
+    state%state(1:Nstate) = MOD(INT(input,KIND=Sint),M)
+    state%state(Nstate+1:Nstore) = 0_Sint
+    state%pos = Nstate + 1
+    state%need_fill = .TRUE.
+  END SUBROUTINE Rand_load
+  !}}}
+  !{{{RECURSIVE SUBROUTINE Rand_save( save_vec, x )
+  RECURSIVE SUBROUTINE Rand_save( save_vec, x ) 
+  !
+  ! Copy the Nstate entries of x%state that end just before x%pos into
+  ! save_vec (converted to default integers). x is not modified.
+  !
+  INTEGER, INTENT(OUT) ::  save_vec(Nstate)
+  TYPE(RAND_state), INTENT(IN) ::  x
+  
+  INTEGER(KIND=Ctype) :: first
+    ! index of the oldest entry in the current window
+    first = x%pos - Nstate
+    save_vec = x%state(first:first+Nstate-1)
+  END SUBROUTINE Rand_save
+  !}}}
+
+  !{{{RECURSIVE SUBROUTINE Rand_set_offset( offset, n )
+  RECURSIVE SUBROUTINE Rand_set_offset( offset, n )
+  !
+  ! Build the offset for a step of n: start from X (for n >= 0) or from
+  ! X^-1 (for n < 0) and raise it to the power |n| with poly_power.
+  !
+  TYPE(Rand_offset), INTENT(OUT) :: offset
+  INTEGER, INTENT(IN) :: n
+  
+    offset%poly = 0_Sint
+    IF ( n .LT. 0 ) THEN
+      !
+      ! This is X^-1 
+      !
+      offset%poly(4) = 858869107_Sint
+      offset%poly(5) = 1840344978_Sint
+    ELSE
+      ! This is X (position 1 holds the constant term)
+      offset%poly(2) = 1_Sint
+    END IF
+    CALL poly_power(offset%poly,ABS(n))
+  END SUBROUTINE Rand_set_offset
+  !}}}
+  !{{{TYPE(Rand_offset) RECURSIVE FUNCTION Rand_add_offset( a, b )
+  TYPE(Rand_offset) RECURSIVE FUNCTION Rand_add_offset( a, b ) RESULT( total )
+  ! Combine two offsets into one.
+  !   a, b    : offsets to combine; neither is modified
+  !   returns : a copy of a whose polynomial has been updated in place by
+  !             poly_mult with b%poly (presumably the polynomial product,
+  !             i.e. the sum of the two displacements -- confirm in poly_mult)
+  TYPE(Rand_offset), INTENT(IN) :: a, b
+  
+    total = a
+    CALL poly_mult(total%poly,b%poly)
+  END FUNCTION Rand_add_offset
+  !}}}
+  !{{{TYPE(Rand_offset) RECURSIVE  FUNCTION Rand_mul_offset( a, n )
+  TYPE(Rand_offset) RECURSIVE  FUNCTION Rand_mul_offset( a, n ) RESULT( scaled )
+  ! Scale an offset by an integer factor.
+  !   a       : offset to scale; not modified
+  !   n       : factor, passed unchanged to poly_power as the exponent
+  !   returns : a copy of a whose polynomial has been raised to the n-th
+  !             power by poly_power
+  TYPE(Rand_offset), INTENT(IN) :: a
+  INTEGER, INTENT(IN) :: n
+    scaled = a
+    CALL poly_power(scaled%poly,n)
+  END FUNCTION Rand_mul_offset
+  !}}}
+  !{{{RECURSIVE FUNCTION Rand_boost(x, offset)
+  RECURSIVE FUNCTION Rand_boost(x, offset)
+  ! Return a new generator state obtained by applying offset to state x.
+  !   x       : source state; not modified
+  !   offset  : displacement, supplied as the coefficient vector offset%poly
+  !   returns : a state holding Nstate freshly computed values in slots
+  !             1..Nstate, zero elsewhere, flagged need_fill with
+  !             pos = Nstate + 1
+  TYPE(Rand_state) Rand_boost
+  TYPE(Rand_state), INTENT(IN) ::  x
+  TYPE(Rand_offset), INTENT(IN) :: offset
+  INTEGER(KIND=Sint) tmp(2*Nstate-1), res(Nstate)
+  INTEGER(KIND=Ctype) i
+  
+    ! copy the Nstate entries just before the read position into tmp(1:Nstate)
+    DO i=1,Nstate
+      tmp(i) = x%state(x%pos-(Nstate+1) + i)
+    END DO
+    tmp(Nstate+1:) = 0_Sint
+  
+    ! extend tmp by Nstate-1 further entries; each P_SDOT call receives the
+    ! preceding window of Nstate values (presumably it applies the
+    ! generator recurrence -- confirm in P_SDOT)
+    DO i=1,Nstate-1
+      call P_SDOT(tmp(i+Nstate),tmp(i:Nstate+i-1))
+    END DO
+  
+    ! each output value combines offset%poly with one sliding window of tmp
+    ! (presumably a dot product reduced modulo M -- confirm in mod_sdot)
+    DO i=1,Nstate
+      call mod_sdot(res(i),offset%poly,tmp(i:Nstate+i-1))
+    END DO
+    ! assemble the result in the same layout that Rand_load produces
+    Rand_boost%state = 0_Sint
+    DO i=1,Nstate
+      Rand_boost%state(i) = res(i)
+    END DO
+    Rand_boost%need_fill = .TRUE.
+    Rand_boost%pos = Nstate + 1
+  
+  END FUNCTION Rand_boost
+  !}}}
+  !{{{RECURSIVE FUNCTION Rand_step(x, n)
+  RECURSIVE FUNCTION Rand_step(x, n) RESULT(stepped)
+  ! Return the state obtained by boosting x with the offset that
+  ! Rand_set_offset builds for n.
+  !   x       : source state; not modified
+  !   n       : signed displacement handed to Rand_set_offset
+  !   returns : Rand_boost(x, offset for n)
+  TYPE(Rand_state) stepped
+  TYPE(RAND_state), INTENT(IN) ::  x
+  INTEGER, INTENT(IN) :: n
+  TYPE(Rand_offset) shift
+  
+    CALL Rand_set_offset(shift,n)
+    stepped = Rand_boost(x,shift)
+  
+  END FUNCTION Rand_step
+  !}}}
+  
+  !{{{RECURSIVE FUNCTION Rand_sint(x)
+  RECURSIVE FUNCTION Rand_sint(x) RESULT(draw)
+    ! Return the next integer from generator x and advance its read position.
+    !   x       : generator state; updated in place
+    !   returns : the buffer entry at the read position, KIND=Sint
+    TYPE(RAND_state), INTENT(INOUT) :: x
+    INTEGER(KIND=Sint)  draw
+    ! read position has run past the stored block: repack first
+    IF( x%pos .GT. Nstore ) CALL repack_state(x)
+    ! buffer is flagged as not yet filled: fill it before reading
+    IF( x%need_fill ) THEN
+      CALL fill_state(x)
+    END IF
+    draw = x%state(x%pos)
+    x%pos = x%pos + 1
+  END FUNCTION Rand_sint
+  !}}}
+  !{{{RECURSIVE SUBROUTINE Rand_sint_vec(iv,x)
+  RECURSIVE SUBROUTINE Rand_sint_vec(iv,x)
+    ! Fill iv with consecutive integers drawn from generator x.
+    !   iv : output vector; every element is assigned
+    !   x  : generator state; updated in place
+    ! Values are copied in chunks bounded by what is left in the buffer
+    ! (up to index Nstore); the buffer is repacked/refilled between chunks
+    ! exactly as Rand_sint does for a single draw.
+    INTEGER(KIND=Sint), INTENT(OUT)  :: iv(:)
+    TYPE(RAND_state), INTENT(INOUT)  ::  x
+    INTEGER remaining, first, ncopy
+  
+    first = 1
+    remaining = SIZE(iv)
+    DO WHILE( remaining .GT. 0 )
+      IF( x%pos .GT. Nstore ) CALL repack_state(x)
+      IF( x%need_fill ) CALL fill_state(x)
+  
+      ! take whatever is smaller: what is still wanted or what is buffered
+      ncopy = MIN(remaining,Nstore-x%pos+1)
+      iv(first:first+ncopy-1) = x%state(x%pos:x%pos+ncopy-1)
+      first = first + ncopy
+      remaining = remaining - ncopy
+      x%pos = x%pos + ncopy
+    END DO
+  
+  END SUBROUTINE Rand_sint_vec
+  !}}}
+
+
+END MODULE Rand_int
+
+!}}}
+
+!{{{Rand (use Rand_int to make random reals)
+
+MODULE Rand
+  USE Rand_int
+  IMPLICIT NONE
+
+!{{{Parameters
+
+  ! real kinds served by the generic Rand_real interface:
+  ! at least 10 decimal digits (RAND_kind1) and at least 6 (RAND_kind2)
+  INTEGER, PARAMETER :: RAND_kind1 = SELECTED_REAL_KIND(10)
+  INTEGER, PARAMETER :: RAND_kind2 = SELECTED_REAL_KIND(6)
+
+  ! largest number of integers the vector routines request per
+  ! Rand_sint_vec call (size bound of their temporary buffer)
+  INTEGER, PARAMETER, PRIVATE :: Max_block=100
+  ! value substituted for a zero draw before scaling to a real
+  INTEGER(KIND=Sint), PRIVATE, PARAMETER  :: M = 2147483647
+  ! scale factors integer -> real; note that despite the "MP1" in the
+  ! name both are 1/2147483647, i.e. 1/M, in the respective real kind
+  REAL(KIND=RAND_kind1), PRIVATE, PARAMETER :: INVMP1_1 = ( 1.0_RAND_kind1 / 2147483647.0_RAND_kind1 )
+  REAL(KIND=RAND_kind2), PRIVATE, PARAMETER :: INVMP1_2 = ( 1.0_RAND_kind2 / 2147483647.0_RAND_kind2 )
+
+  ! capability flags re-exported from Rand_int under shorter names
+  LOGICAL, PARAMETER :: Can_step = Can_step_int
+  LOGICAL, PARAMETER :: Can_reverse = Can_reverse_int
+
+!}}}
+  PUBLIC Rand_real
+
+
+INTERFACE Rand_real
+  MODULE PROCEDURE Rand_real1
+  MODULE PROCEDURE Rand_real2
+  MODULE PROCEDURE Rand_real_vec1
+  MODULE PROCEDURE Rand_real_vec2
+END INTERFACE
+
+
+CONTAINS
+
+  !{{{RECURSIVE SUBROUTINE Rand_real1(y,x)
+  RECURSIVE SUBROUTINE Rand_real1(y,x)
+    ! Draw one real of kind RAND_kind1 from generator x.
+    !   y : result, (draw - 0.5) / M where draw is the next integer
+    !   x : generator state; advanced by one draw
+    REAL(KIND=RAND_kind1), INTENT(OUT) :: y
+    TYPE(RAND_state), INTENT(INOUT) ::  x
+    INTEGER(KIND=Sint) draw
+  
+    draw = Rand_sint(x)
+    ! a zero draw is replaced by M, so draw-0.5 cannot be negative for
+    ! it (assumes Rand_sint never returns a negative value -- confirm)
+    IF (draw .EQ. 0) THEN
+      draw = M
+    END IF
+  
+    y = (draw-0.5d0)*INVMP1_1
+  END SUBROUTINE Rand_real1
+  !}}}
+  !{{{RECURSIVE SUBROUTINE Rand_real2(y,x)
+  RECURSIVE SUBROUTINE Rand_real2(y,x)
+    ! Draw one real of kind RAND_kind2 from generator x.
+    !   y : result, (draw - 0.5) / M evaluated with the RAND_kind1 scale
+    !       factor and then converted to RAND_kind2 on assignment
+    !   x : generator state; advanced by one draw
+    ! NOTE(review): Rand_real_vec2 scales with INVMP1_2 instead of
+    ! INVMP1_1 -- confirm whether the two are meant to agree bit for bit.
+    REAL(KIND=RAND_kind2), INTENT(OUT) :: y
+    TYPE(RAND_state), INTENT(INOUT) ::  x
+    INTEGER(KIND=Sint) draw
+  
+    draw = Rand_sint(x)
+    ! a zero draw is replaced by M before scaling
+    IF (draw .EQ. 0) THEN
+      draw = M
+    END IF
+  
+    ! generate in double and truncate.
+    y = (draw-0.5d0)*INVMP1_1
+  END SUBROUTINE Rand_real2
+  !}}}
+
+  !{{{RECURSIVE SUBROUTINE Rand_real_vec1(rv,x)
+  RECURSIVE SUBROUTINE Rand_real_vec1(rv,x)
+    ! Fill rv with reals of kind RAND_kind1 drawn from generator x.
+    !   rv : output vector; every element is assigned (a zero-size rv
+    !        is accepted and leaves x untouched)
+    !   x  : generator state; advanced by SIZE(rv) draws
+    ! Integers are fetched through Rand_sint_vec in blocks of at most
+    ! Max_block and converted with the same formula as Rand_real1.
+    TYPE(RAND_state), INTENT(INOUT) ::  x
+    ! rv is write-only here; declare it so, like every other dummy argument
+    REAL(KIND=RAND_kind1), INTENT(OUT) ::  rv(:)
+    INTEGER left,start, chunk, i
+    INTEGER(KIND=Sint) Z
+    ! scratch buffer for one block of raw integers
+    INTEGER(KIND=Sint) temp(MIN(SIZE(rv),Max_block))
+  
+    start=0
+    left=SIZE(rv)
+    DO WHILE( left .GT. 0 )
+      chunk = MIN(left,Max_block)
+      CALL Rand_sint_vec(temp(1:chunk),x)
+      DO i=1,chunk
+       Z = temp(i)
+       ! a zero draw is replaced by M before scaling
+       IF (Z .EQ. 0) Z = M
+       rv(start+i) = (Z-0.5d0)*INVMP1_1
+      END DO 
+      start = start + chunk
+      left = left - chunk
+    END DO
+  
+    RETURN
+  END SUBROUTINE Rand_real_vec1
+  !}}}
+  !{{{RECURSIVE SUBROUTINE Rand_real_vec2(rv,x)
+  RECURSIVE SUBROUTINE Rand_real_vec2(rv,x)
+    ! Fill rv with reals of kind RAND_kind2 drawn from generator x.
+    !   rv : output vector; every element is assigned (a zero-size rv
+    !        is accepted and leaves x untouched)
+    !   x  : generator state; advanced by SIZE(rv) draws
+    ! Integers are fetched through Rand_sint_vec in blocks of at most
+    ! Max_block.
+    ! NOTE(review): the scale factor here is INVMP1_2, whereas the scalar
+    ! Rand_real2 uses INVMP1_1 -- confirm whether both should agree.
+    TYPE(RAND_state), INTENT(INOUT) ::  x
+    ! rv is write-only here; declare it so, like every other dummy argument
+    REAL(KIND=RAND_kind2), INTENT(OUT) ::  rv(:)
+    INTEGER left,start, chunk, i
+    INTEGER(KIND=Sint) Z
+    ! scratch buffer for one block of raw integers
+    INTEGER(KIND=Sint) temp(MIN(SIZE(rv),Max_block))
+  
+    start=0
+    left=SIZE(rv)
+    DO WHILE( left .GT. 0 )
+      chunk = MIN(left,Max_block)
+      CALL Rand_sint_vec(temp(1:chunk),x)
+      DO i=1,chunk
+       Z = temp(i)
+       ! a zero draw is replaced by M before scaling
+       IF (Z .EQ. 0) Z = M
+       rv(start+i) = (Z-0.5d0)*INVMP1_2
+      END DO 
+      start = start + chunk
+      left = left - chunk
+    END DO
+  
+    RETURN
+  END SUBROUTINE Rand_real_vec2
+  !}}}
+END MODULE Rand
+
+!}}}
+
+!{{{test program
+! PROGRAM test_random
+! use Rand
+!     TYPE(RAND_state) x
+!     REAL y
+!      CALL Rand_load(x,(/5,4,3,2,1/)) 
+!      DO I=0,10
+!       CALL Rand_real(y,x)
+!       WRITE(*,10) I,y
+!      END DO
+!
+!10    FORMAT(I10,E25.16)
+!
+!     END
+
+!         0   0.5024326127022505E-01
+!         1   0.8260946767404675E-01
+!         2   0.2123264316469431E-01
+!         3   0.6926658791489899E+00
+!         4   0.2076155943796039E+00
+!         5   0.4327449947595596E-01
+!         6   0.2204052871093154E-01
+!         7   0.1288446951657534E+00
+!         8   0.4859915426932275E+00
+!         9   0.5721384193748236E-01
+!        10   0.7996825082227588E+00
+!
+
+
+!}}}
+
--- a/external/panphasia/panphasia_routines.f
+++ b/external/panphasia/panphasia_routines.f
--- a/ics.conf
+++ b/ics.conf
@ -0,0 +1,62 @@
+[setup]
+# number of grid cells per linear dimension used for the calculations (equals the number of particles per dimension for an 'sc' particle load)
+GridRes      = 128 
+# length of the box in Mpc/h
+BoxLength    = 200
+# starting redshift
+zstart       = 24.0 
+# order of the LPT to be used (1,2 or 3)
+LPTorder     = 1
+# also do baryon ICs?
+DoBaryons    = no
+# do mode fixing à la Angulo&Pontzen
+DoFixing     = yes
+# particle load, can be 'sc' (1x), 'bcc' (2x), 'fcc' (4x), or 'rsc' (8x)
+ParticleLoad = sc
+
+[testing]
+# enables diagnostic output
+# can be 'none' (default), 'potentials_and_densities', 'velocity_displacement_symmetries', or 'convergence'
+#test = potentials_and_densities
+#test = convergence
+test = none
+
+[execution]
+NumThreads   = 1
+
+[output]
+fname_hdf5   = output.hdf5
+fbase_analysis = output
+
+#format       = gadget2
+#filename     = ics_gadget.dat
+
+format       = generic
+filename     = debug.hdf5
+#generic_out_eulerian = yes
+
+#format	       = grafic2
+#filename       = ics_ramses
+#grafic_use_SPT = yes
+
+[random]
+generator    = NGENIC
+seed         = 9001
+
+[cosmology]
+transfer     = eisenstein
+#transfer     = CLASS 
+#transfer     = eisenstein_wdm
+#WDMmass      = 0.1
+Omega_m      = 0.302
+Omega_b      = 0.045
+Omega_L      = 0.698
+H0           = 70.3
+sigma_8      = 0.811
+nspec        = 0.961
+
+# anisotropic large scale tidal field
+#LSS_aniso_lx = 0.1
+#LSS_aniso_ly = 0.1
+#LSS_aniso_lz = -0.2
+
--- a/src/plugins/HDF_IO.hh
+++ b/src/plugins/HDF_IO.hh
@ -1,5 +1,5 @@
-#ifndef __HDF_IO_HH
-#define __HDF_IO_HH
+#pragma once
+#if defined(USE_HDF5)

 #define H5_USE_16_API

@ -193,9 +193,9 @@ inline void HDFReadDataset( const std::string Filename, const std::string ObjNam

  int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
  
-  hsize_t dimsize[ndims];
+  std::vector<hsize_t> dimsize(ndims,0);

-  H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
+  H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );

  HDF_StorageSize = 1;
  for(int i=0; i<ndims; ++i )
@ -1082,4 +1082,5 @@ inline void HDFWriteGroupAttribute<std::string>( const std::string Filename, con
  H5Gclose( HDF_GroupID );
  H5Fclose( HDF_FileID );
 }
-#endif
+
+#endif // USE_HDF5
--- a/include/bounding_box.hh
+++ b/include/bounding_box.hh
@ -1,16 +1,16 @@
 #pragma once

-#include <vec3.hh>
+#include <math/vec3.hh>

 template <typename T>
 struct bounding_box
 {
-    vec3<T> x1_, x2_;
+    vec3_t<T> x1_, x2_;

    bounding_box(void)
    { }

-    bounding_box( const vec3<T>& x1, const vec3<T>& x2)
+    bounding_box( const vec3_t<T>& x1, const vec3_t<T>& x2)
    : x1_(x1), x2_(x2)
    { }

--- a/include/cmake_config.hh.in
+++ b/include/cmake_config.hh.in
@ -0,0 +1,34 @@
+#pragma once
+
+constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
+
+#define USE_PRECISION_${CODE_PRECISION}
+#if defined(USE_PRECISION_FLOAT)
+  constexpr char CMAKE_PRECISION_STR[] = "single";
+#elif defined(USE_PRECISION_DOUBLE)
+  constexpr char CMAKE_PRECISION_STR[] = "double";
+#elif defined(USE_PRECISION_LONGDOUBLE)
+  constexpr char CMAKE_PRECISION_STR[] = "long double";
+#endif 
+
+#define USE_CONVOLVER_${CONVOLVER_TYPE}
+#if defined(USE_CONVOLVER_ORSZAG)
+  constexpr char CMAKE_CONVOLVER_STR[] = "Orszag3/2";
+#elif defined(USE_CONVOLVER_NAIVE)
+  constexpr char CMAKE_CONVOLVER_STR[] = "Aliased";
+#endif
+
+#if defined(ENABLE_PLT)
+  constexpr char CMAKE_PLT_STR[] = "PLT corr. on";
+#else
+  constexpr char CMAKE_PLT_STR[] = "PLT corr. off";
+#endif
+
+// These variables are autogenerated and compiled
+// into the library by the version.cmake script. do not touch!
+extern "C"
+{
+  extern const char *GIT_TAG;
+  extern const char *GIT_REV;
+  extern const char *GIT_BRANCH;
+}
--- a/include/config_file.hh
+++ b/include/config_file.hh
@ -12,20 +12,20 @@
 #include <logger.hh>

 /*!
- * @class ConfigFile
+ * @class config_file
 * @brief provides read/write access to configuration options
 *
 * This class provides access to the configuration file. The
 * configuration is stored in hash-pairs and can be queried and
 * validated by the responsible class/routine
 */
-class ConfigFile {
+class config_file {

  //! current line number
-  unsigned m_iLine;
+  unsigned iline_;

  //! hash table for key/value pairs, stored as strings
-  std::map<std::string, std::string> m_Items;
+  std::map<std::string, std::string> items_;

 public:
  //! removes all white space from string source
@ -59,42 +59,42 @@ public:
   * @param oval the interpreted/converted value
   */
  template <class in_value, class out_value>
-  void Convert(const in_value &ival, out_value &oval) const {
+  void convert(const in_value &ival, out_value &oval) const {
    std::stringstream ss;
    ss << ival; //.. insert value into stream
    ss >> oval; //.. retrieve value from stream

    if (!ss.eof()) {
      //.. conversion error
-      csoca::elog << "Error: conversion of \'" << ival << "\' failed."
+      music::elog << "Error: conversion of \'" << ival << "\' failed."
                << std::endl;
-      throw ErrInvalidConversion(std::string("invalid conversion to ") +
+      throw except_invalid_conversion(std::string("invalid conversion to ") +
                                 typeid(out_value).name() + '.');
    }
  }

  //! constructor of class config_file
-  /*! @param FileName the path/name of the configuration file to be parsed
+  /*! @param filename the path/name of the configuration file to be parsed
   */
-  explicit ConfigFile(std::string const &FileName) : m_iLine(0), m_Items() {
-    std::ifstream file(FileName.c_str());
+  explicit config_file(std::string const &filename) : iline_(0), items_() {
+    std::ifstream file(filename.c_str());

    if (!file.is_open()){
-      csoca::elog << "Could not open config file \'" << FileName << "\'." << std::endl;
+      music::elog << "Could not open config file \'" << filename << "\'." << std::endl;
      throw std::runtime_error(
-          std::string("Error: Could not open config file \'") + FileName +
+          std::string("Error: Could not open config file \'") + filename +
          std::string("\'"));
    }

    std::string line;
    std::string name;
    std::string value;
-    std::string inSection;
-    int posEqual;
-    m_iLine = 0;
+    std::string in_section;
+    int pos_equal;
+    iline_ = 0;
    //.. walk through all lines ..
    while (std::getline(file, line)) {
-      ++m_iLine;
+      ++iline_;
      //.. encountered EOL ?
      if (!line.length())
        continue;
@ -106,31 +106,31 @@ public:

      //.. encountered section tag ?
      if (line[0] == '[') {
-        inSection = trim(line.substr(1, line.find(']') - 1));
+        in_section = trim(line.substr(1, line.find(']') - 1));
        continue;
      }

      //.. seek end of entry name ..
-      posEqual = line.find('=');
-      name = trim(line.substr(0, posEqual));
-      value = trim(line.substr(posEqual + 1));
+      pos_equal = line.find('=');
+      name = trim(line.substr(0, pos_equal));
+      value = trim(line.substr(pos_equal + 1));

-      if ((size_t)posEqual == std::string::npos &&
+      if ((size_t)pos_equal == std::string::npos &&
          (name.size() != 0 || value.size() != 0)) {
-        csoca::wlog << "Ignoring non-assignment in " << FileName << ":"
-                  << m_iLine << std::endl;
+        music::wlog << "Ignoring non-assignment in " << filename << ":"
+                  << iline_ << std::endl;
        continue;
      }

      if (name.length() == 0 && value.size() != 0) {
-        csoca::wlog << "Ignoring assignment missing entry name in "
-                  << FileName << ":" << m_iLine << std::endl;
+        music::wlog << "Ignoring assignment missing entry name in "
+                  << filename << ":" << iline_ << std::endl;
        continue;
      }

      if (value.length() == 0 && name.size() != 0) {
-        csoca::wlog << "Empty entry will be ignored in " << FileName << ":"
-                  << m_iLine << std::endl;
+        music::wlog << "Empty entry will be ignored in " << filename << ":"
+                  << iline_ << std::endl;
        continue;
      }

@ -138,12 +138,12 @@ public:
        continue;

      //.. add key/value pair to hash table ..
-      if (m_Items.find(inSection + '/' + name) != m_Items.end()) {
-        csoca::wlog << "Redeclaration overwrites previous value in "
-                  << FileName << ":" << m_iLine << std::endl;
+      if (items_.find(in_section + '/' + name) != items_.end()) {
+        music::wlog << "Redeclaration overwrites previous value in "
+                  << filename << ":" << iline_ << std::endl;
      }

-      m_Items[inSection + '/' + name] = value;
+      items_[in_section + '/' + name] = value;
    }
  }

@ -151,8 +151,8 @@ public:
  /*! @param key the key value, usually "section/key"
   *  @param value the value of the key, also a string
   */
-  void InsertValue(std::string const &key, std::string const &value) {
-    m_Items[key] = value;
+  void insert_value(std::string const &key, std::string const &value) {
+    items_[key] = value;
  }

  //! inserts a key/value pair in the hash map
@ -160,9 +160,9 @@ public:
   *  @param key the key value usually "section/key"
   *  @param value the value of the key, also a string
   */
-  void InsertValue(std::string const &section, std::string const &key,
+  void insert_value(std::string const &section, std::string const &key,
                   std::string const &value) {
-    m_Items[section + '/' + key] = value;
+    items_[section + '/' + key] = value;
  }

  //! checks if a key is part of the hash map
@ -170,10 +170,10 @@ public:
   *  @param key the key name to be checked
   *  @return true if the key is present, false otherwise
   */
-  bool ContainsKey(std::string const &section, std::string const &key) {
+  bool contains_key(std::string const &section, std::string const &key) {
    std::map<std::string, std::string>::const_iterator i =
-        m_Items.find(section + '/' + key);
-    if (i == m_Items.end())
+        items_.find(section + '/' + key);
+    if (i == items_.end())
      return false;
    return true;
  }
@ -182,57 +182,57 @@ public:
  /*! @param key the key name to be checked
   *  @return true if the key is present, false otherwise
   */
-  bool ContainsKey(std::string const &key) {
-    std::map<std::string, std::string>::const_iterator i = m_Items.find(key);
-    if (i == m_Items.end())
+  bool contains_key(std::string const &key) {
+    std::map<std::string, std::string>::const_iterator i = items_.find(key);
+    if (i == items_.end())
      return false;
    return true;
  }

  //! return value of a key
-  /*! returns the value of a given key, throws a ErrItemNotFound
+  /*! returns the value of a given key, throws a except_item_not_found
   *  exception if the key is not available in the hash map.
   *  @param key the key name
   *  @return the value of the key
-   *  @sa ErrItemNotFound
+   *  @sa except_item_not_found
   */
-  template <class T> T GetValue(std::string const &key) const {
-    return GetValue<T>("", key);
+  template <class T> T get_value(std::string const &key) const {
+    return get_value<T>("", key);
  }

  //! return value of a key
-  /*! returns the value of a given key, throws a ErrItemNotFound
+  /*! returns the value of a given key, throws a except_item_not_found
   *  exception if the key is not available in the hash map.
   *  @param section the section name for the key
   *  @param key the key name
   *  @return the value of the key
-   *  @sa ErrItemNotFound
+   *  @sa except_item_not_found
   */
  template <class T>
-  T GetValueBasic(std::string const &section, std::string const &key) const {
+  T get_value_basic(std::string const &section, std::string const &key) const {
    T r;
    std::map<std::string, std::string>::const_iterator i =
-        m_Items.find(section + '/' + key);
-    if (i == m_Items.end()){
-      throw ErrItemNotFound('\'' + section + '/' + key +
+        items_.find(section + '/' + key);
+    if (i == items_.end()){
+      throw except_item_not_found('\'' + section + '/' + key +
                            std::string("\' not found."));
    }

-    Convert(i->second, r);
+    convert(i->second, r);
    return r;
  }

  template <class T>
-  T GetValue(std::string const &section, std::string const &key) const
+  T get_value(std::string const &section, std::string const &key) const
  {
    T r;
    try
    {
-      r = GetValueBasic<T>(section, key);
+      r = get_value_basic<T>(section, key);
    }
-    catch (ErrItemNotFound& e)
+    catch (except_item_not_found& e)
    {
-      csoca::elog << e.what() << std::endl;
+      music::elog << e.what() << std::endl;
      throw;
    }
    return r;
@ -240,40 +240,41 @@ public:

  //! exception safe version of getValue
  /*! returns the value of a given key, returns a default value rather
-   *  than a ErrItemNotFound exception if the key is not found.
+   *  than a except_item_not_found exception if the key is not found.
   *  @param section the section name for the key
   *  @param key the key name
   *  @param default_value the value that is returned if the key is not found
   *  @return the key value (if key found) otherwise default_value
   */
  template <class T>
-  T GetValueSafe(std::string const &section, std::string const &key,
+  T get_value_safe(std::string const &section, std::string const &key,
                 T default_value) const {
    T r;
    try {
-      r = GetValueBasic<T>(section, key);
-    } catch (ErrItemNotFound&) {
+      r = get_value_basic<T>(section, key);
+    } catch (except_item_not_found&) {
      r = default_value;
+      // key is absent: fall back to the caller's default and record that in
+      // the debug log (the quote opened before section/key is closed again)
+      music::dlog << "Item \'" << section << "/" << key << "\' not found in config. Default = \'" << default_value << "\'" << std::endl;
    }
    return r;
  }

  //! exception safe version of getValue
  /*! returns the value of a given key, returns a default value rather
-   *  than a ErrItemNotFound exception if the key is not found.
+   *  than a except_item_not_found exception if the key is not found.
   *  @param key the key name
   *  @param default_value the value that is returned if the key is not found
   *  @return the key value (if key found) otherwise default_value
   */
  template <class T>
-  T GetValueSafe(std::string const &key, T default_value) const {
-    return GetValueSafe("", key, default_value);
+  T get_value_safe(std::string const &key, T default_value) const {
+    return get_value_safe("", key, default_value);
  }

  //! dumps all key-value pairs to a std::ostream
-  void Dump(std::ostream &out) {
-    std::map<std::string, std::string>::const_iterator i = m_Items.begin();
-    while (i != m_Items.end()) {
+  void dump(std::ostream &out) {
+    std::map<std::string, std::string>::const_iterator i = items_.begin();
+    while (i != items_.end()) {
      if (i->second.length() > 0)
        out << std::setw(24) << std::left << i->first << "  =  " << i->second
            << std::endl;
@ -281,12 +282,12 @@ public:
    }
  }

-  void LogDump(void) {
-    csoca::ilog << "List of all configuration options:" << std::endl;
-    std::map<std::string, std::string>::const_iterator i = m_Items.begin();
-    while (i != m_Items.end()) {
+  void dump_to_log(void) {
+    music::ilog << "List of all configuration options:" << std::endl;
+    std::map<std::string, std::string>::const_iterator i = items_.begin();
+    while (i != items_.end()) {
      if (i->second.length() > 0)
-        csoca::ilog << std::setw(28) << i->first << " = " << i->second
+        music::ilog << std::setw(28) << i->first << " = " << i->second
                  << std::endl;
      ++i;
    }
@ -295,16 +296,16 @@ public:
  //--- EXCEPTIONS ---

  //! runtime error that is thrown if key is not found in getValue
-  class ErrItemNotFound : public std::runtime_error {
+  class except_item_not_found : public std::runtime_error {
  public:
-    ErrItemNotFound(std::string itemname)
+    except_item_not_found(std::string itemname)
        : std::runtime_error(itemname.c_str()) {}
  };

  //! runtime error that is thrown if type conversion fails
-  class ErrInvalidConversion : public std::runtime_error {
+  class except_invalid_conversion : public std::runtime_error {
  public:
-    ErrInvalidConversion(std::string errmsg) : std::runtime_error(errmsg) {}
+    except_invalid_conversion(std::string errmsg) : std::runtime_error(errmsg) {}
  };

  //! runtime error that is thrown if identifier is not found in keys
@ -323,14 +324,14 @@ public:
 //...           like "true" and "false" etc.
 //...           converts the string to type bool, returns type bool ...
 template <>
-inline bool ConfigFile::GetValue<bool>(std::string const &strSection,
+inline bool config_file::get_value<bool>(std::string const &strSection,
                                       std::string const &strEntry) const {
-  std::string r1 = GetValue<std::string>(strSection, strEntry);
+  std::string r1 = get_value<std::string>(strSection, strEntry);
  if (r1 == "true" || r1 == "yes" || r1 == "on" || r1 == "1")
    return true;
  if (r1 == "false" || r1 == "no" || r1 == "off" || r1 == "0")
    return false;
-  csoca::elog << "Illegal identifier \'" << r1 << "\' in \'" << strEntry << "\'." << std::endl;
+  music::elog << "Illegal identifier \'" << r1 << "\' in \'" << strEntry << "\'." << std::endl;
  throw ErrIllegalIdentifier(std::string("Illegal identifier \'") + r1 +
                             std::string("\' in \'") + strEntry +
                             std::string("\'."));
@ -338,17 +339,17 @@ inline bool ConfigFile::GetValue<bool>(std::string const &strSection,
 }

 template <>
-inline bool ConfigFile::GetValueSafe<bool>(std::string const &strSection,
+inline bool config_file::get_value_safe<bool>(std::string const &strSection,
                                           std::string const &strEntry,
                                           bool defaultValue) const {
  std::string r1;
  try {
-    r1 = GetValueBasic<std::string>(strSection, strEntry);
+    r1 = get_value_basic<std::string>(strSection, strEntry);
    if (r1 == "true" || r1 == "yes" || r1 == "on" || r1 == "1")
      return true;
    if (r1 == "false" || r1 == "no" || r1 == "off" || r1 == "0")
      return false;
-  } catch (ErrItemNotFound&) {
+  } catch (except_item_not_found&) {
    return defaultValue;
  }
  return defaultValue;
@ -356,7 +357,7 @@ inline bool ConfigFile::GetValueSafe<bool>(std::string const &strSection,

 template <>
 inline void
-ConfigFile::Convert<std::string, std::string>(const std::string &ival,
+config_file::convert<std::string, std::string>(const std::string &ival,
                                              std::string &oval) const {
  oval = ival;
 }
--- a/include/convolution.hh
+++ b/include/convolution.hh
@ -333,7 +333,7 @@ public:
        crecvbuf_ = new ccomplex_t[maxslicesz_ / 2];
        recvbuf_ = reinterpret_cast<real_t *>(&crecvbuf_[0]);

-        int ntasks(MPI_Get_size());
+        int ntasks(MPI::get_size());

        offsets_.assign(ntasks, 0);
        offsetsp_.assign(ntasks, 0);
@ -415,12 +415,12 @@ private:
    {
        assert(fp.space_ == kspace_id);

-        const double rfac = std::pow(1.5, 1.5);
+        const real_t rfac = std::pow(1.5, 1.5);

        fp.zero();

 #if !defined(USE_MPI) ////////////////////////////////////////////////////////////////////////////////////
-        size_t nhalf[3] = {fp.n_[0] / 3, fp.n_[1] / 3, fp.n_[2] / 3};
+        const size_t nhalf[3] = {fp.n_[0] / 3, fp.n_[1] / 3, fp.n_[2] / 3};

 #pragma omp parallel for
        for (size_t i = 0; i < 2 * fp.size(0) / 3; ++i)
@ -429,10 +429,9 @@ private:
            for (size_t j = 0; j < 2 * fp.size(1) / 3; ++j)
            {
                size_t jp = (j > nhalf[1]) ? j + nhalf[1] : j;
-                for (size_t k = 0; k < 2 * fp.size(2) / 3; ++k)
+                for (size_t k = 0; k < nhalf[2]+1; ++k)
                {
                    size_t kp = (k > nhalf[2]) ? k + nhalf[2] : k;
-                    // if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]) continue;
                    fp.kelem(ip, jp, kp) = kfunc(i, j, k) * rfac;
                }
            }
@ -445,7 +444,7 @@ private:
        /////////////////////////////////////////////////////////////////////

        double tstart = get_wtime();
-        csoca::dlog << "[MPI] Started scatter for convolution" << std::endl;
+        music::dlog << "[MPI] Started scatter for convolution" << std::endl;

        //... collect offsets

@ -460,7 +459,10 @@ private:
        size_t slicesz = fbuf_->size(1) * fbuf_->size(3);

        MPI_Datatype datatype =
-            (typeid(data_t) == typeid(float)) ? MPI_COMPLEX : (typeid(data_t) == typeid(double)) ? MPI_DOUBLE_COMPLEX : MPI_BYTE;
+            (typeid(data_t) == typeid(float)) ? MPI_C_FLOAT_COMPLEX 
+            : (typeid(data_t) == typeid(double)) ? MPI_C_DOUBLE_COMPLEX 
+            : (typeid(data_t) == typeid(long double)) ? MPI_C_LONG_DOUBLE_COMPLEX
+            : MPI_BYTE;

        // fill MPI send buffer with results of kfunc

@ -587,7 +589,7 @@ private:
        // std::cerr << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed! <<<<<"
        // << std::endl;  ofs << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed!
        // <<<<<" << std::endl;
-        csoca::dlog.Print("[MPI] Completed scatter for convolution, took %fs\n",
+        music::dlog.Print("[MPI] Completed scatter for convolution, took %fs\n",
                          get_wtime() - tstart);

 #endif /// end of ifdef/ifndef USE_MPI ///////////////////////////////////////////////////////////////
@ -596,7 +598,7 @@ private:
    template <typename operator_t>
    void unpad(const Grid_FFT<data_t> &fp, operator_t output_op)
    {
-        const double rfac = std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(fbuf_->n_[0] * fbuf_->n_[1] * fbuf_->n_[2]);
+        const real_t rfac = std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(fbuf_->n_[0] * fbuf_->n_[1] * fbuf_->n_[2]);

        // make sure we're in Fourier space...
        assert(fp.space_ == kspace_id);
@ -615,8 +617,11 @@ private:
                for (size_t k = 0; k < fbuf_->size(2); ++k)
                {
                    size_t kp = (k > nhalf[2]) ? k + nhalf[2] : k;
-                    // if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]) continue;
                    fbuf_->kelem(i, j, k) = fp.kelem(ip, jp, kp) / rfac;
+                    // zero Nyquist modes since they are not unique after convolution
+                    if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]){
+                        fbuf_->kelem(i, j, k) = 0.0; 
+                    }
                }
            }
        }
@ -634,7 +639,7 @@ private:

        double tstart = get_wtime();

-        csoca::dlog << "[MPI] Started gather for convolution";
+        music::dlog << "[MPI] Started gather for convolution";

        MPI_Barrier(MPI_COMM_WORLD);

@ -645,7 +650,10 @@ private:
        size_t slicesz = fp.size(1) * fp.size(3);

        MPI_Datatype datatype =
-            (typeid(data_t) == typeid(float)) ? MPI_COMPLEX : (typeid(data_t) == typeid(double)) ? MPI_DOUBLE_COMPLEX : MPI_BYTE;
+            (typeid(data_t) == typeid(float)) ? MPI_C_FLOAT_COMPLEX 
+            : (typeid(data_t) == typeid(double)) ? MPI_C_DOUBLE_COMPLEX 
+            : (typeid(data_t) == typeid(long double)) ? MPI_C_LONG_DOUBLE_COMPLEX 
+            : MPI_BYTE;

        MPI_Status status;

@ -685,7 +693,7 @@ private:
            int recvfrom = 0;
            if (iglobal <= fny[0])
            {
-                real_t wi = (iglobal == fny[0]) ? 0.5 : 1.0;
+                real_t wi = (iglobal == fny[0]) ? 0.0 : 1.0;

                recvfrom = get_task(iglobal, offsetsp_, sizesp_, CONFIG::MPI_task_size);
                MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, (int)iglobal,
@ -693,7 +701,7 @@ private:

                for (size_t j = 0; j < nf[1]; ++j)
                {
-                    real_t wj = (j == fny[1]) ? 0.5 : 1.0;
+                    real_t wj = (j == fny[1]) ? 0.0 : 1.0;
                    if (j <= fny[1])
                    {
                        size_t jp = j;
@ -701,21 +709,22 @@ private:
                        {
                            if (typeid(data_t) == typeid(real_t))
                            {
-                                real_t w = wi * wj;
+                                real_t wk = (k == fny[2]) ? 0.0 : 1.0;
+                                real_t w = wi * wj * wk;
                                fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
                            }
                            else
                            {
-                                real_t wk = (k == fny[2]) ? 0.5 : 1.0;
+                                real_t wk = (k == fny[2]) ? 0.0 : 1.0;
                                real_t w = wi * wj * wk;
-                                if (k <= fny[2])
+                                if (k < fny[2])
                                    fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
-                                if (k >= fny[2])
+                                if (k > fny[2])
                                    fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k + fny[2]] / rfac;
-                                if (w < 1.0)
-                                {
-                                    fbuf_->kelem(i, j, k) = std::real(fbuf_->kelem(i, j, k));
-                                }
+                                // if (w < 1.0)
+                                // {
+                                //     fbuf_->kelem(i, j, k) = std::real(fbuf_->kelem(i, j, k));
+                                // }
                            }
                        }
                    }
@ -726,21 +735,22 @@ private:
                        {
                            if (typeid(data_t) == typeid(real_t))
                            {
-                                real_t w = wi * wj;
+                                real_t wk = (k == fny[2]) ? 0.0 : 1.0;
+                                real_t w = wi * wj * wk;
                                fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
                            }
                            else
                            {
-                                real_t wk = (k == fny[2]) ? 0.5 : 1.0;
+                                real_t wk = (k == fny[2]) ? 0.0 : 1.0;
                                real_t w = wi * wj * wk;
-                                if (k <= fny[2])
+                                if (k < fny[2])
                                    fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
-                                if (k >= fny[2])
+                                if (k > fny[2])
                                    fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k + fny[2]] / rfac;
-                                if (w < 1.0)
-                                {
-                                    fbuf_->kelem(i, j, k) = std::real(fbuf_->kelem(i, j, k));
-                                }
+                                // if (w < 1.0)
+                                // {
+                                //     fbuf_->kelem(i, j, k) = std::real(fbuf_->kelem(i, j, k));
+                                // }
                            }
                        }
                    }
@ -748,7 +758,7 @@ private:
            }
            if (iglobal >= fny[0])
            {
-                real_t wi = (iglobal == fny[0]) ? 0.5 : 1.0;
+                real_t wi = (iglobal == fny[0]) ? 0.0 : 1.0;

                recvfrom = get_task(iglobal + fny[0], offsetsp_, sizesp_, CONFIG::MPI_task_size);
                MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom,
@ -756,29 +766,26 @@ private:

                for (size_t j = 0; j < nf[1]; ++j)
                {
-                    real_t wj = (j == fny[1]) ? 0.5 : 1.0;
+                    real_t wj = (j == fny[1]) ? 0.0 : 1.0;
                    if (j <= fny[1])
                    {
                        size_t jp = j;
                        for (size_t k = 0; k < nf[2]; ++k)
                        {
+                            // weight is zero on the k == fny[2] plane; computed once and shared by both branches
+                            const real_t wk = (k == fny[2]) ? 0.0 : 1.0;
+                            const real_t w = wi * wj * wk;
                            if (typeid(data_t) == typeid(real_t))
                            {
-                                real_t w = wi * wj;
                                fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
                            }
                            else
                            {
-                                real_t wk = (k == fny[2]) ? 0.5 : 1.0;
-                                real_t w = wi * wj * wk;
-                                if (k <= fny[2])
+                                if (k < fny[2])
                                    fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
-                                if (k >= fny[2])
+                                if (k > fny[2])
                                    fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k + fny[2]] / rfac;
-                                if (w < 1.0)
-                                {
-                                    fbuf_->kelem(i, j, k) = std::real(fbuf_->kelem(i, j, k));
-                                }
                            }
                        }
                    }
@ -787,23 +794,18 @@ private:
                        size_t jp = j + fny[1];
                        for (size_t k = 0; k < nf[2]; ++k)
                        {
+                            const real_t wk = (k == fny[2]) ? 0.0 : 1.0;
+                            const real_t w = wi * wj * wk;
                            if (typeid(data_t) == typeid(real_t))
                            {
-                                real_t w = wi * wj;
                                fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
                            }
                            else
                            {
-                                real_t wk = (k == fny[2]) ? 0.5 : 1.0;
-                                real_t w = wi * wj * wk;
-                                if (k <= fny[2])
+                                if (k < fny[2])
                                    fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
-                                if (k >= fny[2])
+                                if (k > fny[2])
                                    fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k + fny[2]] / rfac;
-                                if (w < 1.0)
-                                {
-                                    fbuf_->kelem(i, j, k) = std::real(fbuf_->kelem(i, j, k));
-                                }
                            }
                        }
                    }
@ -811,8 +813,8 @@ private:
            }
        }

-//... copy data back
-#pragma omp parallel for
+        //... copy data back
+        #pragma omp parallel for
        for (size_t i = 0; i < fbuf_->ntot_; ++i)
        {
            output_op(i, (*fbuf_)[i]);
@ -831,7 +833,7 @@ private:

        MPI_Barrier(MPI_COMM_WORLD);

-        csoca::dlog.Print("[MPI] Completed gather for convolution, took %fs", get_wtime() - tstart);
+        music::dlog.Print("[MPI] Completed gather for convolution, took %fs", get_wtime() - tstart);

 #endif /// end of ifdef/ifndef USE_MPI //////////////////////////////////////////////////////////////
    }
--- a/include/cosmology_calculator.hh
+++ b/include/cosmology_calculator.hh
@ -1,25 +1,43 @@
 #pragma once

 #include <array>
+#include <vec.hh>

 #include <cosmology_parameters.hh>
+#include <physical_constants.hh>
 #include <transfer_function_plugin.hh>
+#include <math/ode_integrate.hh>
 #include <logger.hh>

+#include <math/interpolate.hh>
+
 #include <gsl/gsl_integration.h>
+// #include <gsl/gsl_spline.h>
 #include <gsl/gsl_errno.h>

+namespace cosmology
+{
+
 /*!
- * @class CosmologyCalculator
+ * @class cosmology::calculator
 * @brief provides functions to compute cosmological quantities
 *
 * This class provides member functions to compute cosmological quantities
 * related to the Friedmann equations and linear perturbation theory
 */
-class CosmologyCalculator
+class calculator
 {
+public:
+    //! data structure to store cosmological parameters
+    cosmology::parameters cosmo_param_;
+
+    //! pointer to an instance of a transfer function plugin
+    std::unique_ptr<TransferFunction_plugin> transfer_function_;
+
 private:
-    static constexpr double REL_PRECISION = 1e-5;
+    static constexpr double REL_PRECISION = 1e-10;
+    interpolated_function_1d<true,true,false> D_of_a_, f_of_a_, a_of_D_;
+    double Dnow_, Dplus_start_, Dplus_target_, astart_, atarget_;

    real_t integrate(double (*func)(double x, void *params), double a, double b, void *params) const
    {
@ -39,167 +57,207 @@ private:
        gsl_set_error_handler(NULL);

        if (error / result > REL_PRECISION)
-            csoca::wlog << "no convergence in function 'integrate', rel. error=" << error / result << std::endl;
+            music::wlog << "no convergence in function 'integrate', rel. error=" << error / result << std::endl;

        return (real_t)result;
    }

+    //! Tabulate the linear growth factor D(a) and the growth rate f(a)
+    /*!
+     * Integrates the state (a, D, D') with respect to tau, where da/dtau = a^2 H(a),
+     * starting from a = 1e-10 with D = a and stopping once a >= 2.0. One sample per
+     * integrator step is appended to the tables (the tables are not cleared first).
+     * D is left unnormalised here.
+     * @param tab_a receives the expansion factor a of every step
+     * @param tab_D receives the growth factor D of every step
+     * @param tab_f receives f = D' / (a H(a) D) of every step
+     */
+    void compute_growth( std::vector<double>& tab_a, std::vector<double>& tab_D, std::vector<double>& tab_f )
+    {
+        using v_t = vec_t<3, double>;
+
+        // set ICs
+        const double a0 = 1e-10;
+        const double D0 = a0;
+        // NOTE(review): the division by c^2 looks like a unit conversion -- confirm against the units of H0
+        const double Dprime0 = 2.0 * D0 * H_of_a(a0) / std::pow(phys_const::c_SI, 2);
+        const double t0 = 1.0 / (a0 * H_of_a(a0));
+
+        v_t y0({a0, D0, Dprime0});
+
+        // RHS of ODEs for y = (a, D, D'); independent of the step, so it is built once
+        auto rhs = [&](double t, v_t y) -> v_t {
+            auto a = y[0];
+            auto D = y[1];
+            auto Dprime = y[2];
+            v_t dy;
+            // da/dtau = a^2 H(a)
+            dy[0] = a * a * H_of_a(a);
+            // d D/dtau
+            dy[1] = Dprime;
+            // d^2 D / dtau^2
+            dy[2] = -a * H_of_a(a) * Dprime + 3.0 / 2.0 * cosmo_param_.Omega_m * std::pow(cosmo_param_.H0, 2) * D / a;
+            return dy;
+        };
+
+        // set up integration
+        double dt = 1e-9;
+        double dtdid, dtnext;
+        const double amax = 2.0;
+
+        v_t yy(y0);
+        double t = t0;
+        const double eps = 1e-10;
+
+        while (yy[0] < amax)
+        {
+            // scale by predicted value to get approx. constant fractional errors
+            v_t yyscale = yy.abs() + dt * rhs(t, yy).abs();
+
+            // call integrator (presumably advances t and yy in place and suggests dtnext -- see ode_integrate)
+            ode_integrate::rk_step_qs(dt, t, yy, yyscale, rhs, eps, dtdid, dtnext);
+
+            tab_a.push_back(yy[0]);
+            tab_D.push_back(yy[1]);
+            tab_f.push_back(yy[2]); // holds D' for now, converted to f below
+
+            dt = dtnext;
+        }
+
+        // tab_f still stores D' = dD/dtau; convert it in place to f = D' / (a H(a) D).
+        // tab_a and tab_D are already final and need no further processing.
+        for (size_t i = 0; i < tab_a.size(); ++i)
+        {
+            tab_f[i] = tab_f[i] / (tab_a[i] * H_of_a(tab_a[i]) * tab_D[i]);
+        }
+    }
+
 public:
-    //! data structure to store cosmological parameters
-    CosmologyParameters cosmo_param_;
-
-    //! pointer to an instance of a transfer function plugin
-    //TransferFunction_plugin *ptransfer_fun_;
-    std::unique_ptr<TransferFunction_plugin> transfer_function_;
-
-
+    calculator() = delete;
+    calculator(const calculator& c) = delete;
    //! constructor for a cosmology calculator object
    /*!
 	 * @param acosmo a cosmological parameters structure
 	 * @param pTransferFunction pointer to an instance of a transfer function object
 	 */

-    explicit CosmologyCalculator(ConfigFile &cf)
-    : cosmo_param_(cf)
+    explicit calculator(config_file &cf)
+        : cosmo_param_(cf), astart_( 1.0/(1.0+cf.get_value<double>("setup","zstart")) ),
+            atarget_( 1.0/(1.0+cf.get_value_safe<double>("cosmology","ztarget",1./astart_-1.)))
    {
+        // pre-compute growth factors and store for interpolation
+        std::vector<double> tab_a, tab_D, tab_f;
+        this->compute_growth(tab_a, tab_D, tab_f);
+        D_of_a_.set_data(tab_a,tab_D);
+        f_of_a_.set_data(tab_a,tab_f);
+        a_of_D_.set_data(tab_D,tab_a);
+        Dnow_ = D_of_a_(1.0);
+
+        Dplus_start_ = D_of_a_( astart_ ) / Dnow_;
+        Dplus_target_ = D_of_a_( atarget_ ) / Dnow_;
+
+        // set up transfer functions and compute normalisation
        transfer_function_ = std::move(select_TransferFunction_plugin(cf));
        transfer_function_->intialise();
-        cosmo_param_.pnorm = this->ComputePNorm();
+        if( !transfer_function_->tf_isnormalised_ )
+            cosmo_param_.pnorm = this->compute_pnorm_from_sigma8();
+        else{
+            cosmo_param_.pnorm = 1.0/Dplus_target_/Dplus_target_;
+            auto sigma8 = this->compute_sigma8();
+            music::ilog << "Measured sigma_8 for given PS normalisation is " <<  sigma8 << std::endl;
+        }
        cosmo_param_.sqrtpnorm = std::sqrt(cosmo_param_.pnorm);
-        csoca::ilog << std::setw(32) << std::left << "TF supports distinct CDM+baryons" << " : " << (transfer_function_->tf_is_distinct()? "yes" : "no") << std::endl;
-        csoca::ilog << std::setw(32) << std::left << "TF maximum wave number" << " : " << transfer_function_->get_kmax() << " h/Mpc" << std::endl;
+
+        music::ilog << std::setw(32) << std::left << "TF supports distinct CDM+baryons"
+                    << " : " << (transfer_function_->tf_is_distinct() ? "yes" : "no") << std::endl;
+        music::ilog << std::setw(32) << std::left << "TF maximum wave number"
+                    << " : " << transfer_function_->get_kmax() << " h/Mpc" << std::endl;
+    }
+
+    ~calculator()
+    {
    }

    //! Write out a correctly scaled power spectrum at time a
-    void WritePowerspectrum( real_t a, std::string fname ) const
+    void write_powerspectrum(real_t a, std::string fname) const
    {
-        const real_t Dplus0 = this->CalcGrowthFactor(a) / this->CalcGrowthFactor(1.0);
+        // const real_t Dplus0 = this->get_growth_factor(a);

-        if( CONFIG::MPI_task_rank==0 )
+        if (CONFIG::MPI_task_rank == 0)
        {
-            double kmin = std::max(1e-4,transfer_function_->get_kmin());
+            double kmin = std::max(1e-4, transfer_function_->get_kmin());

            // write power spectrum to a file
            std::ofstream ofs(fname.c_str());
-            std::stringstream ss; ss << " (a=" << a <<")";
+            std::stringstream ss;
+            ss << " ,ap=" << a << "";
            ofs << "# " << std::setw(18) << "k [h/Mpc]"
-                        << std::setw(20) << ("P_dtot(k)"+ss.str()) 
-                        << std::setw(20) << ("P_dcdm(k)"+ss.str())
-                        << std::setw(20) << ("P_dbar(k)"+ss.str())
-                        << std::setw(20) << ("P_dtot(K) (a=1)")
-                        << std::setw(20) << ("P_tcdm(k)"+ss.str()) 
-                        << std::setw(20) << ("P_tbar(k)"+ss.str())
-                        << std::endl;
-            for( double k=kmin; k<transfer_function_->get_kmax(); k*=1.05 ){
+                << std::setw(20) << ("P_dtot(k,a=ap)")
+                << std::setw(20) << ("P_dcdm(k,a=ap)")
+                << std::setw(20) << ("P_dbar(k,a=ap)")
+                << std::setw(20) << ("P_tcdm(k,a=ap)")
+                << std::setw(20) << ("P_tbar(k,a=ap)")
+                << std::setw(20) << ("P_dtot(k,a=1)")
+                << std::setw(20) << ("P_dcdm(k,a=1)")
+                << std::setw(20) << ("P_dbar(k,a=1)")
+                << std::setw(20) << ("P_tcdm(k,a=1)")
+                << std::setw(20) << ("P_tbar(k,a=1)")
+                << std::setw(20) << ("P_dtot(K,a=1)")
+                << std::endl;
+            for (double k = kmin; k < transfer_function_->get_kmax(); k *= 1.05)
+            {
                ofs << std::setw(20) << std::setprecision(10) << k
-                    << std::setw(20) << std::setprecision(10) << std::pow(this->GetAmplitude(k, total) * Dplus0, 2.0)
-                    << std::setw(20) << std::setprecision(10) << std::pow(this->GetAmplitude(k, cdm) * Dplus0, 2.0)
-                    << std::setw(20) << std::setprecision(10) << std::pow(this->GetAmplitude(k, baryon) * Dplus0, 2.0)
-                    << std::setw(20) << std::setprecision(10) << std::pow(this->GetAmplitude(k, total), 2.0)
-                    << std::setw(20) << std::setprecision(10) << std::pow(this->GetAmplitude(k, vcdm) * Dplus0, 2.0)
-                    << std::setw(20) << std::setprecision(10) << std::pow(this->GetAmplitude(k, vbaryon) * Dplus0, 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, total)*Dplus_start_, 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, cdm)*Dplus_start_, 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, baryon)*Dplus_start_, 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, vcdm)*Dplus_start_, 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, vbaryon)*Dplus_start_, 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, total0), 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, cdm0), 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, baryon0), 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, vcdm0), 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, vbaryon0), 2.0)
+                    << std::setw(20) << std::setprecision(10) << std::pow(this->get_amplitude(k, vtotal), 2.0)
                    << std::endl;
            }
        }
-
-        csoca::ilog << "Wrote power spectrum at a=" << a << " to file \'" << fname << "\'" << std::endl;
+        music::ilog << "Wrote power spectrum at a=" << a << " to file \'" << fname << "\'" << std::endl;
    }

-    const CosmologyParameters &GetParams(void) const
+    const cosmology::parameters &get_parameters(void) const noexcept
    {
        return cosmo_param_;
    }

-    //! returns the amplitude of amplitude of the power spectrum
-    /*!
-	 * @param k the wave number in h/Mpc
-	 * @param a the expansion factor of the universe
-	 * @returns power spectrum amplitude for wave number k at time a
-	 */
-    inline real_t Power(real_t k, real_t a)
+    //! return the value of the Hubble function H(a) = dloga/dt 
+    inline double H_of_a(double a) const noexcept
    {
-        real_t Dplus = CalcGrowthFactor(a);
-        real_t DplusOne = CalcGrowthFactor(1.0);
-        real_t pNorm = ComputePNorm();
-        Dplus /= DplusOne;
-        DplusOne = 1.0;
-        real_t scale = Dplus / DplusOne;
-        return pNorm * scale * scale * TransferSq(k) * pow((double)k, (double)cosmo_param_.nspect);
+        double HH2 = 0.0;
+        HH2 += cosmo_param_.Omega_r / (a * a * a * a);
+        HH2 += cosmo_param_.Omega_m / (a * a * a);
+        HH2 += cosmo_param_.Omega_k / (a * a);
+        HH2 += cosmo_param_.Omega_DE * std::pow(a, -3. * (1. + cosmo_param_.w_0 + cosmo_param_.w_a)) * exp(-3. * (1.0 - a) * cosmo_param_.w_a);
+        return cosmo_param_.H0 * std::sqrt(HH2);
    }

-    inline static double H_of_a(double a, void *Params)
+    //! Computes the linear theory growth factor D+, normalised to D+(a=1)=1
+    real_t get_growth_factor(real_t a) const noexcept
    {
-        CosmologyParameters *cosm = (CosmologyParameters *)Params;
-        double a2 = a * a;
-        double Ha = sqrt(cosm->Omega_m / (a2 * a) + cosm->Omega_k / a2 + cosm->Omega_DE * pow(a, -3. * (1. + cosm->w_0 + cosm->w_a)) * exp(-3. * (1.0 - a) * cosm->w_a));
-        return Ha;
+        return D_of_a_(a) / Dnow_;
    }

-    inline static double Hprime_of_a(double a, void *Params) 
+    //! Computes the inverse of get_growth_factor
+    real_t get_a( real_t Dplus ) const noexcept
    {
-        CosmologyParameters *cosm = (CosmologyParameters *)Params;
-        double a2 = a * a;
-        double H = H_of_a(a, Params);
-        double Hprime = 1 / (a * H) * (-1.5 * cosm->Omega_m / (a2 * a) - cosm->Omega_k / a2 - 1.5 * cosm->Omega_DE * pow(a, -3. * (1. + cosm->w_0 + cosm->w_a)) * exp(-3. * (1.0 - a) * cosm->w_a) * (1. + cosm->w_0 + (1. - a) * cosm->w_a));
-        return Hprime;
+        return a_of_D_( Dplus * Dnow_ );
    }

-    //! Integrand used by function CalcGrowthFactor to determine the linear growth factor D+
-    inline static double GrowthIntegrand(double a, void *Params) 
+    //! Computes the linear theory growth rate f
+    /*! Function computes (by interpolating on precalculated table)
+     *   f = dlog D+ / dlog a
+     */
+    real_t get_f(real_t a) const noexcept
    {
-        double Ha = a * H_of_a(a, Params);
-        return 2.5 / (Ha * Ha * Ha);
-    }
-
-    //! integrand function for Calc_fPeebles
-	/*!
-	 * @sa Calc_fPeebles
-	 */
-	inline static double fIntegrand( double a, void *Params )
-	{
-		CosmologyParameters *cosm = (CosmologyParameters *)Params;
-		double y = cosm->Omega_m*(1.0/a-1.0) + cosm->Omega_DE*(a*a-1.0) + 1.0;
-		return 1.0/pow(y,1.5);
-	}
-	
-	//! calculates d log D+/d log a
-	/*! this version follows the Peebles (TBD: add citation)
-	 *  formula to compute Bertschinger's vfact
-	 */
-	inline real_t CalcGrowthRate( real_t a )
-	{
-        #warning CalcGrowthRate is only correct if dark energy is a cosmological constant, need to upgrade calculator...
-		real_t y = cosmo_param_.Omega_m*(1.0/a-1.0) + cosmo_param_.Omega_DE*(a*a-1.0) + 1.0;
-		real_t fact = integrate( &fIntegrand, 1e-6, a, (void*)&cosmo_param_ );
-		return (cosmo_param_.Omega_DE*a*a-0.5*cosmo_param_.Omega_m/a)/y - 1.0 + a*fIntegrand(a,(void*)&cosmo_param_)/fact;
-	}
-
-    //! Computes the linear theory growth factor D+
-    /*! Function integrates over member function GrowthIntegrand and computes
-    *                      /a
-    *   D+(a) = 5/2 H(a) * |  [a'^3 * H(a')^3]^(-1) da'
-    *                      /0
-    */
-    real_t CalcGrowthFactor(real_t a) const
-    {
-        real_t integral = integrate(&GrowthIntegrand, 0.0, a, (void *)&cosmo_param_);
-        return H_of_a(a, (void *)&cosmo_param_) * integral;
+        return f_of_a_(a);
    }

    //! Compute the factor relating particle displacement and velocity
    /*! Function computes
-    *
-    *  vfac = a^2 * H(a) * dlogD+ / d log a = a^2 * H'(a) + 5/2 * [ a * D+(a) * H(a) ]^(-1)
-    *
-    */
-    real_t CalcVFact(real_t a) const
+     *  vfac = a * (H(a)/h) * dlogD+ / dlog a 
+     */
+    real_t get_vfact(real_t a) const noexcept
    {
-        real_t Dp = CalcGrowthFactor(a);
-        real_t H = H_of_a(a, (void *)&cosmo_param_);
-        real_t Hp = Hprime_of_a(a, (void *)&cosmo_param_);
-        real_t a2 = a * a;
-
-        return (a2 * Hp + 2.5 / (a * Dp * H)) * 100.0;
+        return f_of_a_(a) * a * H_of_a(a) / cosmo_param_.h;
    }

    //! Integrand for the sigma_8 normalization of the power spectrum
@ -210,7 +268,7 @@ public:
        if (k <= 0.0)
            return 0.0f;

-        CosmologyCalculator *pcc = reinterpret_cast<CosmologyCalculator*>(pParams);
+        cosmology::calculator *pcc = reinterpret_cast<cosmology::calculator *>(pParams);

        double x = k * 8.0;
        double w = 3.0 * (sin(x) - x * cos(x)) / (x * x * x);
@ -229,7 +287,7 @@ public:
        if (k <= 0.0)
            return 0.0f;

-        CosmologyCalculator *pcc = reinterpret_cast<CosmologyCalculator*>(pParams);
+        cosmology::calculator *pcc = reinterpret_cast<cosmology::calculator *>(pParams);

        double x = k * 8.0;
        double w = 3.0 * (sin(x) - x * cos(x)) / (x * x * x);
@ -240,24 +298,12 @@ public:
        return k * k * w * w * pow((double)k, (double)nspect) * tf * tf;
    }

-    //! Computes the square of the transfer function
-    /*! Function evaluates the supplied transfer function ptransfer_fun_
-	 * and returns the square of its value at wave number k
-	 * @param k wave number at which to evaluate the transfer function
-	 */
-    inline real_t TransferSq(real_t k) const
-    {
-        //.. parameter supplied transfer function
-        real_t tf1 = transfer_function_->compute(k, total);
-        return tf1 * tf1;
-    }
-
    //! Computes the amplitude of a mode from the power spectrum
    /*! Function evaluates the supplied transfer function ptransfer_fun_
 	 * and returns the amplitude of fluctuations at wave number k at z=0
 	 * @param k wave number at which to evaluate
 	 */
-    inline real_t GetAmplitude(real_t k, tf_type type) const
+    inline real_t get_amplitude(real_t k, tf_type type) const
    {
        return std::pow(k, 0.5 * cosmo_param_.nspect) * transfer_function_->compute(k, type) * cosmo_param_.sqrtpnorm;
    }
@ -267,18 +313,30 @@ public:
 	 * integrates the power spectrum to fix the normalization to that given
 	 * by the sigma_8 parameter
 	 */
-    real_t ComputePNorm(void)
+    real_t compute_sigma8(void)
    {
        real_t sigma0, kmin, kmax;
        kmax = transfer_function_->get_kmax();
        kmin = transfer_function_->get_kmin();

        if (!transfer_function_->tf_has_total0())
-            sigma0 = 4.0 * M_PI * integrate(&dSigma8, (double)kmin, (double)kmax, this );
-        else
-            sigma0 = 4.0 * M_PI * integrate(&dSigma8_0, (double)kmin, (double)kmax, this );
+            sigma0 = 4.0 * M_PI * integrate(&dSigma8, (double)kmin, (double)kmax, this);
+        else{
+            sigma0 = 4.0 * M_PI * integrate(&dSigma8_0, (double)kmin, (double)kmax, this);
+        }

-        return cosmo_param_.sigma8 * cosmo_param_.sigma8 / sigma0;
+        return std::sqrt(sigma0);
+    }
+
+    //! Computes the normalization for the power spectrum
+    /*!
+	 * integrates the power spectrum to fix the normalization to that given
+	 * by the sigma_8 parameter
+	 */
+    real_t compute_pnorm_from_sigma8(void)
+    {
+        auto measured_sigma8 = this->compute_sigma8();
+        return cosmo_param_.sigma8 * cosmo_param_.sigma8 / (measured_sigma8  * measured_sigma8);
    }
 };

@ -294,3 +352,5 @@ inline double jeans_sound_speed(double rho, double mass)
    const double G = 6.67e-8;
    return pow(6.0 * mass / M_PI * sqrt(rho) * pow(G, 1.5), 1.0 / 3.0);
 }
+
+} // namespace cosmology
--- a/include/cosmology_parameters.hh
+++ b/include/cosmology_parameters.hh
@ -1,10 +1,21 @@
 #pragma once
+/*******************************************************************************\
+ cosmology_parameters.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
 
+ CHANGELOG (only majors, for details see repo):
+    06/2019 - Oliver Hahn - first implementation
+\*******************************************************************************/
+
+#include <physical_constants.hh>
 #include <config_file.hh>

-//! structure for cosmological parameters
-struct CosmologyParameters
+namespace cosmology
 {
+//! structure for cosmological parameters
+struct parameters
+{
+
    double
        Omega_m,  //!< baryon+dark matter density
        Omega_b,  //!< baryon matter density
@ -12,38 +23,88 @@ struct CosmologyParameters
        Omega_r,  //!< photon + relativistic particle density
        Omega_k,  //!< curvature density
        H0,       //!< Hubble constant in km/s/Mpc
+        h,        //!< hubble parameter
        nspect,   //!< long-wave spectral index (scale free is nspect=1)
        sigma8,   //!< power spectrum normalization
+        Tcmb,     //!< CMB temperature (used to set Omega_r)
+        Neff,     //!< effective number of neutrino species (used to set Omega_r)
        w_0,      //!< dark energy equation of state parameter 1: w = w0 + a * wa
        w_a,      //!< dark energy equation of state parameter 2: w = w0 + a * wa

        // below are helpers to store additional information
-        dplus, //!< linear perturbation growth factor
-        pnorm, //!< actual power spectrum normalisation factor
+        dplus,     //!< linear perturbation growth factor
+        f,         //!< growth factor logarithmic derivative
+        pnorm,     //!< actual power spectrum normalisation factor
        sqrtpnorm, //!< sqrt of power spectrum normalisation factor
-        vfact; //!< velocity<->displacement conversion factor in Zel'dovich approx.
+        vfact;     //!< velocity<->displacement conversion factor in Zel'dovich approx.

-    explicit CosmologyParameters(ConfigFile cf)
+    parameters() = delete;
+    
+    parameters( const parameters& ) = default;
+    
+    //! Read the cosmological parameters from the "cosmology" section of the configuration
+    /*!
+     * H0, nspec, Omega_b, Omega_m, Omega_L and sigma_8 are mandatory; w0, wa, Tcmb, Neff
+     * and ZeroRadiation fall back to defaults. Omega_r is derived from Tcmb and Neff
+     * (or forced to zero by ZeroRadiation). With the flat-universe branch enabled below,
+     * Omega_DE is reset to 1 - Omega_m - Omega_r, so the configured Omega_L is only a
+     * starting value. All helper fields (dplus, f, pnorm, sqrtpnorm, vfact) start at zero.
+     * @param cf configuration file object to read from
+     */
+    explicit parameters(config_file cf)
    {
-        Omega_b = cf.GetValue<double>("cosmology", "Omega_b");
-        Omega_m = cf.GetValue<double>("cosmology", "Omega_m");
-        Omega_DE = cf.GetValue<double>("cosmology", "Omega_L");
-        w_0 = cf.GetValueSafe<double>("cosmology", "w0", -1.0);
-        w_a = cf.GetValueSafe<double>("cosmology", "wa", 0.0);
+        H0 = cf.get_value<double>("cosmology", "H0");
+        h  = H0 / 100.0;

-        Omega_r = cf.GetValueSafe<double>("cosmology", "Omega_r", 0.0); // no longer default to nonzero (8.3e-5)
+        nspect = cf.get_value<double>("cosmology", "nspec");
+
+        Omega_b = cf.get_value<double>("cosmology", "Omega_b");
+
+        Omega_m = cf.get_value<double>("cosmology", "Omega_m");
+
+        Omega_DE = cf.get_value<double>("cosmology", "Omega_L");
+
+        w_0 = cf.get_value_safe<double>("cosmology", "w0", -1.0);
+
+        w_a = cf.get_value_safe<double>("cosmology", "wa", 0.0);
+
+        Tcmb = cf.get_value_safe<double>("cosmology", "Tcmb", 2.7255);
+
+        Neff = cf.get_value_safe<double>("cosmology", "Neff", 3.046);
+
+        sigma8 = cf.get_value<double>("cosmology", "sigma_8");
+
+        // calculate energy density in ultrarelativistic species from Tcmb and Neff
+        double Omega_gamma = 4 * phys_const::sigma_SI / std::pow(phys_const::c_SI, 3) * std::pow(Tcmb, 4.0) / phys_const::rhocrit_h2_SI / (h * h);
+        double Omega_nu = Neff * Omega_gamma * 7. / 8. * std::pow(4. / 11., 4. / 3.);
+        Omega_r = Omega_gamma + Omega_nu;
+
+        if (cf.get_value_safe<bool>("cosmology", "ZeroRadiation", false))
+        {
+            Omega_r = 0.0;
+        }
+#if 1
+        // assume zero curvature, take difference from dark energy
+        Omega_DE += 1.0 - Omega_m - Omega_DE - Omega_r;
+        Omega_k  = 0.0;
+#else
+        // allow for curvature 
        Omega_k = 1.0 - Omega_m - Omega_DE - Omega_r;
-
-        H0 = cf.GetValue<double>("cosmology", "H0");
-        sigma8 = cf.GetValue<double>("cosmology", "sigma_8");
-        nspect = cf.GetValue<double>("cosmology", "nspec");
+#endif

        dplus = 0.0;
        pnorm = 0.0;
        vfact = 0.0;
+        // also zero the remaining helper fields so that no member is left indeterminate
+        f = 0.0;
+        sqrtpnorm = 0.0;
+
+        music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+        music::ilog << "Cosmological parameters are: " << std::endl;
+        music::ilog << " H0       = " << std::setw(16) << H0          << "sigma_8  = " << std::setw(16) << sigma8 << std::endl;
+        music::ilog << " Omega_c  = " << std::setw(16) << Omega_m-Omega_b << "Omega_b  = " << std::setw(16) << Omega_b << std::endl;
+        if (!cf.get_value_safe<bool>("cosmology", "ZeroRadiation", false)){
+            music::ilog << " Omega_g  = " << std::setw(16) << Omega_gamma << "Omega_nu = " << std::setw(16) << Omega_nu << std::endl;
+        }else{
+            music::ilog << " Omega_r  = " << std::setw(16) << Omega_r << std::endl;
+        }
+        music::ilog << " Omega_DE = " << std::setw(16) << Omega_DE    << "nspect   = " << std::setw(16) << nspect << std::endl;
+        music::ilog << " w0       = " << std::setw(16) << w_0         << "w_a      = " << std::setw(16) << w_a << std::endl;
+
+        if( Omega_r > 0.0 )
+        {
+            music::wlog << "Radiation enabled, using Omega_r=" << Omega_r << " internally."<< std::endl;
+            music::wlog << "Make sure your sim code supports this..." << std::endl;
+        }
    }

-    CosmologyParameters(void)
-    {
-    }
 };
+} // namespace cosmology
--- a/include/general.hh
+++ b/include/general.hh
@ -7,24 +7,49 @@

 #if defined(USE_MPI)
 #include <mpi.h>
-  #include <fftw3-mpi.h>
+#include <fftw3-mpi.h>
 #else
-  #include <fftw3.h>
+#include <fftw3.h>
 #endif

-#ifdef USE_SINGLEPRECISION
+#include <config_file.hh>
+
+#define _unused(x) ((void)(x))
+
+// include CMake controlled configuration settings
+#include <cmake_config.hh>
+
+#if defined(USE_PRECISION_FLOAT)
 using real_t = float;
 using complex_t = fftwf_complex;
 #define FFTW_PREFIX fftwf
-#else
+#elif defined(USE_PRECISION_DOUBLE)
 using real_t = double;
 using complex_t = fftw_complex;
 #define FFTW_PREFIX fftw
+#elif defined(USE_PRECISION_LONGDOUBLE)
+using real_t = long double;
+using complex_t = fftwl_complex;
+#define FFTW_PREFIX fftwl
 #endif

-enum class fluid_component { density, vx, vy, vz, dx, dy, dz };
-enum class cosmo_species { dm, baryon, neutrino };
-extern std::map<cosmo_species,std::string> cosmo_species_name;
+enum class fluid_component
+{
+  density,
+  vx,
+  vy,
+  vz,
+  dx,
+  dy,
+  dz
+};
+enum class cosmo_species
+{
+  dm,
+  baryon,
+  neutrino
+};
+extern std::map<cosmo_species, std::string> cosmo_species_name;

 using ccomplex_t = std::complex<real_t>;

@ -45,52 +70,64 @@ using fftw_plan_t = FFTW_GEN_NAME(FFTW_PREFIX, plan);
 #if defined(USE_MPI)
 inline double get_wtime()
 {
-    return MPI_Wtime();
+  return MPI_Wtime();
 }

-inline int MPI_Get_rank( void ){
-    int rank, ret;
-    ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	assert( ret==MPI_SUCCESS );
-    return rank;
-}
-
-inline int MPI_Get_size( void ){
-    int size, ret;
-    ret = MPI_Comm_size(MPI_COMM_WORLD, &size);
-	assert( ret==MPI_SUCCESS );
-    return size;
-}
-
-template<typename T>
-MPI_Datatype GetMPIDatatype( void )
+namespace MPI
 {
-  if( typeid(T) == typeid(std::complex<float>) )
-    return MPI_COMPLEX;

-  if( typeid(T) == typeid(std::complex<double>) )
-    return MPI_DOUBLE_COMPLEX;
+inline int get_rank(void) // returns the rank of the calling process in MPI_COMM_WORLD
+{
+  int rank, ret;
+  ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  assert(ret == MPI_SUCCESS); // the return code is only checked where assert() is active
+  _unused(ret); // expands to a void cast, so ret stays referenced when assert() is compiled out
+  return rank;
+}

-  if( typeid(T) == typeid(int) )
+inline int get_size(void) // returns the number of processes in MPI_COMM_WORLD
+{
+  int size, ret;
+  ret = MPI_Comm_size(MPI_COMM_WORLD, &size);
+  assert(ret == MPI_SUCCESS); // the return code is only checked where assert() is active
+  _unused(ret); // expands to a void cast, so ret stays referenced when assert() is compiled out
+  return size;
+}
+
+template <typename T>
+inline MPI_Datatype get_datatype(void)
+{
+  if (typeid(T) == typeid(std::complex<float>))
+    return MPI_C_FLOAT_COMPLEX;
+
+  if (typeid(T) == typeid(std::complex<double>))
+    return MPI_C_DOUBLE_COMPLEX;
+
+  if (typeid(T) == typeid(std::complex<long double>))
+    return MPI_C_LONG_DOUBLE_COMPLEX;
+
+  if (typeid(T) == typeid(int))
    return MPI_INT;

-  if( typeid(T) == typeid(unsigned) )
+  if (typeid(T) == typeid(unsigned))
    return MPI_UNSIGNED;

-  if( typeid(T) == typeid(float) )
+  if (typeid(T) == typeid(float))
    return MPI_FLOAT;

-  if( typeid(T) == typeid(double) )
+  if (typeid(T) == typeid(double))
    return MPI_DOUBLE;

-  if( typeid(T) == typeid(char) )
+  if (typeid(T) == typeid(long double))
+    return MPI_LONG_DOUBLE;
+
+  if (typeid(T) == typeid(char))
    return MPI_CHAR;

  abort();
-
 }

-inline std::string GetMPIversion( void )
+inline std::string get_version(void)
 {
  int len;
  char mpi_lib_ver[MPI_MAX_LIBRARY_VERSION_STRING];
@ -98,33 +135,31 @@ inline std::string GetMPIversion( void )
  MPI_Get_library_version(mpi_lib_ver, &len);
  return std::string(mpi_lib_ver);
 }
-
+} // namespace MPI

 #else
-  #if defined(_OPENMP)
-    #include <omp.h>
-    inline double get_wtime()
-    {
-      return omp_get_wtime();
-    }
-  #else
-    #include <ctime>
-    inline double get_wtime()
-    {
-      return std::clock() / double(CLOCKS_PER_SEC);
-    }
-  #endif
+#if defined(_OPENMP)
+#include <omp.h>
+inline double get_wtime()
+{
+  return omp_get_wtime();
+}
+#else
+#include <ctime>
+inline double get_wtime()
+{
+  return std::clock() / double(CLOCKS_PER_SEC);
+}
+#endif
 #endif

-inline void multitask_sync_barrier( void )
+inline void multitask_sync_barrier(void)
 {
 #if defined(USE_MPI)
-  MPI_Barrier( MPI_COMM_WORLD );
+  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 }

-
-
 namespace CONFIG
 {
 extern int MPI_thread_support;
@ -135,13 +170,3 @@ extern bool MPI_threads_ok;
 extern bool FFTW_threads_ok;
 extern int num_threads;
 } // namespace CONFIG
-
-
-// These variables are autogenerated and compiled
-// into the library by the version.cmake script
-extern "C"
-{
-    extern const char* GIT_TAG;
-    extern const char* GIT_REV;
-    extern const char* GIT_BRANCH;
-}
--- a/include/grid_fft.hh
+++ b/include/grid_fft.hh
@ -4,7 +4,7 @@
 #include <array>
 #include <vector>

-#include <vec3.hh>
+#include <math/vec3.hh>
 #include <general.hh>
 #include <bounding_box.hh>
 #include <typeinfo>
@ -16,22 +16,26 @@ enum space_t
 };


-template <typename data_t>
+#ifdef USE_MPI
+template <typename data_t_, bool bdistributed=true>
+#else
+template <typename data_t_, bool bdistributed=false>
+#endif
 class Grid_FFT
 {
+public:
+    using data_t = data_t_;
+    static constexpr bool is_distributed_trait{bdistributed};
+
 protected:
-#if defined(USE_MPI)
-    const MPI_Datatype MPI_data_t_type = (typeid(data_t) == typeid(double)) ? MPI_DOUBLE
-                                                                            : (typeid(data_t) == typeid(float)) ? MPI_FLOAT
-                                                                                                                : (typeid(data_t) == typeid(std::complex<float>)) ? MPI_COMPLEX
-                                                                                                                                                                  : (typeid(data_t) == typeid(std::complex<double>)) ? MPI_DOUBLE_COMPLEX : MPI_INT;
-#endif
+    using grid_fft_t = Grid_FFT<data_t,bdistributed>;
+    
 public:
    std::array<size_t, 3> n_, nhalf_;
    std::array<size_t, 4> sizes_;
    size_t npr_, npc_;
    size_t ntot_;
-    std::array<real_t, 3> length_, kfac_, dx_;
+    std::array<real_t, 3> length_, kfac_, kny_, dx_;

    space_t space_;
    data_t *data_;
@ -54,7 +58,7 @@ public:
    }

    // avoid implicit copying of data
-    Grid_FFT(const Grid_FFT<data_t> &g) = delete;
+    Grid_FFT(const grid_fft_t &g) = delete;

    ~Grid_FFT()
    {
@ -64,34 +68,48 @@ public:
        }
    }

-    const Grid_FFT<data_t> *get_grid(size_t ilevel) const { return this; }
+    const grid_fft_t *get_grid(size_t ilevel) const { return this; }
+
+    bool is_distributed( void ) const noexcept { return bdistributed; }

    void Setup();

+    //! return the number of data_t elements that we store in the container
+    size_t memsize( void ) const noexcept { return ntot_; }
+
    //! return the (local) size of dimension i
-    size_t size(size_t i) const { return sizes_[i]; }
+    size_t size(size_t i) const noexcept { assert(i<4); return sizes_[i]; }

    //! return the (global) size of dimension i
-    size_t global_size(size_t i) const { return n_[i]; }
+    size_t global_size(size_t i) const noexcept { assert(i<3); return n_[i]; }

    //! return locally stored number of elements of field
-    size_t local_size(void) const { return local_0_size_ * n_[1] * n_[2]; }
+    size_t local_size(void) const noexcept { return local_0_size_ * n_[1] * n_[2]; }

    //! return a bounding box of the global extent of the field
-    const bounding_box<size_t> &get_global_range(void) const
+    const bounding_box<size_t> &get_global_range(void) const noexcept
    {
        return global_range_;
    }

+    bool is_nyquist_mode( size_t i, size_t j, size_t k ) const // true if local k-space cell (i,j,k) has index n_/2 in at least one dimension
+    {
+        assert( this->space_ == kspace_id ); // only meaningful while the field is in Fourier space
+        bool bres = bdistributed? (i+local_1_start_ == n_[1]/2) : (i == n_[0]/2); // 1st index: dimension 1 (offset by local_1_start_) if distributed, cf. get_k3; dimension 0 otherwise
+        bres |= (bdistributed? (j == n_[0]/2) : (j == n_[1]/2)); // 2nd index: dimension 0 if distributed, dimension 1 otherwise
+        bres |= (k == n_[2]/2); // 3rd index always refers to dimension 2
+        return bres;
+    }
+
    //! set all field elements to zero
-    void zero()
+    void zero() noexcept
    {
 #pragma omp parallel for
        for (size_t i = 0; i < ntot_; ++i)
            data_[i] = 0.0;
    }

-    void copy_from(const Grid_FFT<data_t> &g)
+    void copy_from(const grid_fft_t &g)
    {
        // make sure the two fields are in the same space
        if (g.space_ != this->space_)
@ -113,49 +131,49 @@ public:
            data_[i] = g.data_[i];
    }

-    data_t &operator[](size_t i)
+    data_t &operator[](size_t i) noexcept
    {
        return data_[i];
    }

-    data_t &relem(size_t i, size_t j, size_t k)
+    data_t &relem(size_t i, size_t j, size_t k) noexcept 
    {
        size_t idx = (i * sizes_[1] + j) * sizes_[3] + k;
        return data_[idx];
    }

-    const data_t &relem(size_t i, size_t j, size_t k) const
+    const data_t &relem(size_t i, size_t j, size_t k) const noexcept
    {
        size_t idx = (i * sizes_[1] + j) * sizes_[3] + k;
        return data_[idx];
    }

-    ccomplex_t &kelem(size_t i, size_t j, size_t k)
+    ccomplex_t &kelem(size_t i, size_t j, size_t k) noexcept
    {
        size_t idx = (i * sizes_[1] + j) * sizes_[3] + k;
        return cdata_[idx];
    }

-    const ccomplex_t &kelem(size_t i, size_t j, size_t k) const
+    const ccomplex_t &kelem(size_t i, size_t j, size_t k) const noexcept
    {
        size_t idx = (i * sizes_[1] + j) * sizes_[3] + k;
        return cdata_[idx];
    }

-    ccomplex_t &kelem(size_t idx) { return cdata_[idx]; }
-    const ccomplex_t &kelem(size_t idx) const { return cdata_[idx]; }
-    data_t &relem(size_t idx) { return data_[idx]; }
-    const data_t &relem(size_t idx) const { return data_[idx]; }
+    ccomplex_t &kelem(size_t idx) noexcept { return cdata_[idx]; }
+    const ccomplex_t &kelem(size_t idx) const noexcept { return cdata_[idx]; }
+    data_t &relem(size_t idx) noexcept { return data_[idx]; }
+    const data_t &relem(size_t idx) const noexcept { return data_[idx]; }

-    size_t get_idx(size_t i, size_t j, size_t k) const
+    size_t get_idx(size_t i, size_t j, size_t k) const noexcept
    {
        return (i * sizes_[1] + j) * sizes_[3] + k;
    }

    template <typename ft>
-    vec3<ft> get_r(const size_t i, const size_t j, const size_t k) const
+    vec3_t<ft> get_r(const size_t i, const size_t j, const size_t k) const noexcept
    {
-        vec3<ft> rr;
+        vec3_t<ft> rr;

        rr[0] = real_t(i + local_0_start_) * dx_[0];
        rr[1] = real_t(j) * dx_[1];
@ -165,9 +183,9 @@ public:
    }

    template <typename ft>
-    vec3<ft> get_unit_r(const size_t i, const size_t j, const size_t k) const
+    vec3_t<ft> get_unit_r(const size_t i, const size_t j, const size_t k) const noexcept
    {
-        vec3<ft> rr;
+        vec3_t<ft> rr;

        rr[0] = real_t(i + local_0_start_) / real_t(n_[0]);
        rr[1] = real_t(j) / real_t(n_[1]);
@ -177,91 +195,155 @@ public:
    }

    template <typename ft>
-    vec3<ft> get_unit_r_staggered(const size_t i, const size_t j, const size_t k) const
+    vec3_t<ft> get_unit_r_shifted(const size_t i, const size_t j, const size_t k, const vec3_t<real_t> s) const noexcept
    {
-        vec3<ft> rr;
+        vec3_t<ft> rr;

-        rr[0] = (real_t(i + local_0_start_) + 0.5) / real_t(n_[0]);
-        rr[1] = (real_t(j) + 0.5) / real_t(n_[1]);
-        rr[2] = (real_t(k) + 0.5) / real_t(n_[2]);
+        rr[0] = (real_t(i + local_0_start_) + s.x) / real_t(n_[0]);
+        rr[1] = (real_t(j) + s.y) / real_t(n_[1]);
+        rr[2] = (real_t(k) + s.z) / real_t(n_[2]);

        return rr;
    }

-    template <typename ft>
-    vec3<ft> get_unit_r_shifted(const size_t i, const size_t j, const size_t k, double sx, double sy, double sz) const
+    vec3_t<size_t> get_cell_idx_3d(const size_t i, const size_t j, const size_t k) const noexcept
    {
-        vec3<ft> rr;
-
-        rr[0] = (real_t(i + local_0_start_) + sx) / real_t(n_[0]);
-        rr[1] = (real_t(j) + sy) / real_t(n_[1]);
-        rr[2] = (real_t(k) + sz) / real_t(n_[2]);
-
-        return rr;
+        return vec3_t<size_t>({i + local_0_start_, j, k});
    }

-    void cell_pos(int ilevel, size_t i, size_t j, size_t k, double *x) const
-    {
-        x[0] = double(i + local_0_start_) / size(0);
-        x[1] = double(j) / size(1);
-        x[2] = double(k) / size(2);
-    }
-
-    vec3<size_t> get_cell_idx_3d(const size_t i, const size_t j, const size_t k) const
-    {
-        return vec3<size_t>({i + local_0_start_, j, k});
-    }
-
-    size_t get_cell_idx_1d(const size_t i, const size_t j, const size_t k) const
+    size_t get_cell_idx_1d(const size_t i, const size_t j, const size_t k) const noexcept
    {
        return ((i + local_0_start_) * size(1) + j) * size(2) + k;
    }

-    size_t count_leaf_cells(int, int) const
+    //! deprecated function, was needed for old output plugin
+    size_t count_leaf_cells(int, int) const noexcept
    {
        return n_[0] * n_[1] * n_[2];
    }

-    real_t get_dx(int idim) const
+    real_t get_dx(int idim) const noexcept
    {
+        assert(idim<3&&idim>=0);
        return dx_[idim];
    }

-    const std::array<real_t, 3> &get_dx(void) const
+    const std::array<real_t, 3> &get_dx(void) const noexcept
    {
        return dx_;
    }

    template <typename ft>
-    vec3<ft> get_k(const size_t i, const size_t j, const size_t k) const
+    vec3_t<ft> get_k(const size_t i, const size_t j, const size_t k) const noexcept
    {
-        vec3<ft> kk;
-
-#if defined(USE_MPI)
-        auto ip = i + local_1_start_;
-        kk[0] = (real_t(j) - real_t(j > nhalf_[0]) * n_[0]) * kfac_[0];
-        kk[1] = (real_t(ip) - real_t(ip > nhalf_[1]) * n_[1]) * kfac_[1];
-#else
-        kk[0] = (real_t(i) - real_t(i > nhalf_[0]) * n_[0]) * kfac_[0];
-        kk[1] = (real_t(j) - real_t(j > nhalf_[1]) * n_[1]) * kfac_[1];
-#endif
+        vec3_t<ft> kk;
+        if( bdistributed ){
+            auto ip = i + local_1_start_;
+            kk[0] = (real_t(j) - real_t(j > nhalf_[0]) * n_[0]) * kfac_[0];
+            kk[1] = (real_t(ip) - real_t(ip > nhalf_[1]) * n_[1]) * kfac_[1];
+        }else{
+            kk[0] = (real_t(i) - real_t(i > nhalf_[0]) * n_[0]) * kfac_[0];
+            kk[1] = (real_t(j) - real_t(j > nhalf_[1]) * n_[1]) * kfac_[1];
+        }
        kk[2] = (real_t(k) - real_t(k > nhalf_[2]) * n_[2]) * kfac_[2];

        return kk;
    }

+    template <typename ft>
+    vec3_t<ft> get_k(const real_t i, const real_t j, const real_t k) const noexcept // wave vector for real-valued (possibly non-integer) local k-space indices
+    {
+        vec3_t<ft> kk;
+        if( bdistributed ){
+            auto ip = i + real_t(local_1_start_); // distributed layout: first index is offset by local_1_start_ and belongs to dimension 1
+            kk[0] = (j - real_t(j > real_t(nhalf_[0])) * n_[0]) * kfac_[0]; // indices above nhalf_ are shifted down by n_ before scaling with kfac_
+            kk[1] = (ip - real_t(ip > real_t(nhalf_[1])) * n_[1]) * kfac_[1];
+        }else{
+            kk[0] = (real_t(i) - real_t(i > real_t(nhalf_[0])) * n_[0]) * kfac_[0]; // non-distributed layout: indices map to dimensions in order
+            kk[1] = (real_t(j) - real_t(j > real_t(nhalf_[1])) * n_[1]) * kfac_[1];
+        }
+        kk[2] = (real_t(k) - real_t(k > real_t(nhalf_[2])) * n_[2]) * kfac_[2]; // third index is treated the same way in both layouts
+
+        return kk;
+    }
+
+    std::array<size_t,3> get_k3(const size_t i, const size_t j, const size_t k) const noexcept // maps local k-space indices to an index triple ordered by dimension (0,1,2)
+    {
+        return bdistributed? std::array<size_t,3>({j,i+local_1_start_,k}) : std::array<size_t,3>({i,j,k}); // distributed: first two indices are swapped and i is offset by local_1_start_
+    }
+
+    data_t get_cic( const vec3_t<real_t>& v ) const noexcept // trilinear (cloud-in-cell) interpolation of the real-space field at position v
+    {
+        // warning! this doesn't work with MPI
+        vec3_t<real_t> x({std::fmod(v.x/length_[0]+1.0,1.0)*n_[0], // position in units of length_, shifted by +1, reduced with fmod and scaled to grid index units
+                        std::fmod(v.y/length_[1]+1.0,1.0)*n_[1], // NOTE(review): for v below -length_ the fmod result is negative, so the size_t casts below would not be meaningful -- confirm callers stay in range
+                        std::fmod(v.z/length_[2]+1.0,1.0)*n_[2] });
+        size_t ix = static_cast<size_t>(x.x); // index of the lower cell in each dimension
+        size_t iy = static_cast<size_t>(x.y);
+        size_t iz = static_cast<size_t>(x.z);
+        real_t dx = x.x-real_t(ix), tx = 1.0-dx; // dx: weight of the upper neighbour, tx: weight of the lower cell
+        real_t dy = x.y-real_t(iy), ty = 1.0-dy;
+        real_t dz = x.z-real_t(iz), tz = 1.0-dz;
+        size_t ix1 = (ix+1)%n_[0]; // upper neighbours wrap around modulo n_
+        size_t iy1 = (iy+1)%n_[1];
+        size_t iz1 = (iz+1)%n_[2];
+        data_t val = 0.0;
+        val += this->relem(ix ,iy ,iz ) * tx * ty * tz; // accumulate the eight surrounding cells with product weights
+        val += this->relem(ix ,iy ,iz1) * tx * ty * dz;
+        val += this->relem(ix ,iy1,iz ) * tx * dy * tz;
+        val += this->relem(ix ,iy1,iz1) * tx * dy * dz;
+        val += this->relem(ix1,iy ,iz ) * dx * ty * tz;
+        val += this->relem(ix1,iy ,iz1) * dx * ty * dz;
+        val += this->relem(ix1,iy1,iz ) * dx * dy * tz;
+        val += this->relem(ix1,iy1,iz1) * dx * dy * dz;
+        return val;
+    }
+
+    ccomplex_t get_cic_kspace( const vec3_t<real_t> x ) const noexcept // trilinear interpolation of the k-space coefficients at index position x (x is used directly in grid index units)
+    {
+        // warning! this doesn't work with MPI
+        int ix = static_cast<int>(std::floor(x.x)); // NOTE(review): a negative x component gives a negative index that is then passed to kelem() as size_t -- confirm callers pass non-negative positions
+        int iy = static_cast<int>(std::floor(x.y));
+        int iz = static_cast<int>(std::floor(x.z));
+        real_t dx = x.x-real_t(ix), tx = 1.0-dx; // dx: weight of the upper neighbour, tx: weight of the lower cell
+        real_t dy = x.y-real_t(iy), ty = 1.0-dy;
+        real_t dz = x.z-real_t(iz), tz = 1.0-dz;
+        size_t ix1 = (ix+1)%size(0); // upper neighbours in the first two dimensions wrap around modulo the local size
+        size_t iy1 = (iy+1)%size(1);
+        size_t iz1 = std::min((iz+1),int(size(2))-1); // third dimension is clamped to the last stored index instead of wrapping
+        ccomplex_t val = 0.0;
+        val += this->kelem(ix ,iy ,iz ) * tx * ty * tz; // accumulate the eight surrounding cells with product weights
+        val += this->kelem(ix ,iy ,iz1) * tx * ty * dz;
+        val += this->kelem(ix ,iy1,iz ) * tx * dy * tz;
+        val += this->kelem(ix ,iy1,iz1) * tx * dy * dz;
+        val += this->kelem(ix1,iy ,iz ) * dx * ty * tz;
+        val += this->kelem(ix1,iy ,iz1) * dx * ty * dz;
+        val += this->kelem(ix1,iy1,iz ) * dx * dy * tz;
+        val += this->kelem(ix1,iy1,iz1) * dx * dy * dz;
+        // if( val != val ){
+           //auto k = this->get_k<real_t>(ix,iy,iz);
+           //std::cerr << ix << " " << iy << " " << iz << " " << val << " " <<  this->gradient(0,{ix,iy,iz}) << " " <<  this->gradient(1,{ix,iy,iz}) << " " <<  this->gradient(2,{ix,iy,iz}) << std::endl;
+        // }
+        return val;
+    }
+
    inline ccomplex_t gradient( const int idim, std::array<size_t,3> ijk ) const
    {
-#if defined(USE_MPI)
-        ijk[0] += local_1_start_;
-        std::swap(ijk[0],ijk[1]);
-#endif
+        if( bdistributed ){
+            ijk[0] += local_1_start_;
+            std::swap(ijk[0],ijk[1]);
+        }
        real_t rgrad = 
            (ijk[idim]!=nhalf_[idim])? (real_t(ijk[idim]) - real_t(ijk[idim] > nhalf_[idim]) * n_[idim]) * kfac_[idim] : 0.0; 
        return ccomplex_t(0.0,rgrad);
    }

-    Grid_FFT<data_t> &operator*=(data_t x)
+    inline real_t laplacian( const std::array<size_t,3>& ijk ) const noexcept // returns minus the squared norm of the wave vector at local k-space index ijk
+    {
+        return -this->get_k<real_t>(ijk[0],ijk[1],ijk[2]).norm_squared(); // wave vector comes from the size_t overload of get_k
+    }
+
+    grid_fft_t &operator*=(data_t x)
    {
        if (space_ == kspace_id)
        {
@ -274,7 +356,7 @@ public:
        return *this;
    }

-    Grid_FFT<data_t> &operator/=(data_t x)
+    grid_fft_t &operator/=(data_t x)
    {
        if (space_ == kspace_id)
        {
@ -287,7 +369,7 @@ public:
        return *this;
    }

-    Grid_FFT<data_t> &apply_Laplacian(void)
+    grid_fft_t &apply_Laplacian(void)
    {
        this->FourierTransformForward();
        this->apply_function_k_dep([&](auto x, auto k) {
@ -298,7 +380,7 @@ public:
        return *this;
    }

-    Grid_FFT<data_t> &apply_negative_Laplacian(void)
+    grid_fft_t &apply_negative_Laplacian(void)
    {
        this->FourierTransformForward();
        this->apply_function_k_dep([&](auto x, auto k) {
@ -309,7 +391,7 @@ public:
        return *this;
    }

-    Grid_FFT<data_t> &apply_InverseLaplacian(void)
+    grid_fft_t &apply_InverseLaplacian(void)
    {
        this->FourierTransformForward();
        this->apply_function_k_dep([&](auto x, auto k) {
@ -354,11 +436,10 @@ public:
        }
    }

-    double compute_2norm(void)
+    real_t compute_2norm(void) const
    {
        real_t sum1{0.0};
-#pragma omp parallel for reduction(+ \
-                                   : sum1)
+        #pragma omp parallel for reduction(+ : sum1)
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -377,60 +458,60 @@ public:
        return sum1;
    }

-    double std(void)
+    real_t std(void) const
    {
        double sum1{0.0}, sum2{0.0};
        size_t count{0};

-#pragma omp parallel for reduction(+ \
-                                   : sum1, sum2)
+        #pragma omp parallel for reduction(+ : sum1, sum2)
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
            {
                for (size_t k = 0; k < sizes_[2]; ++k)
                {
-                    const auto elem = std::real(this->relem(i, j, k));
-                    sum1 += elem;
-                    sum2 += elem * elem;
+                    const auto elem = (space_==kspace_id)? this->kelem(i, j, k) : this->relem(i, j, k);
+                    sum1 += std::real(elem);
+                    sum2 += std::norm(elem);// * elem;
                }
            }
        }
        count = sizes_[0] * sizes_[1] * sizes_[2];

 #ifdef USE_MPI
-        double globsum1{0.0}, globsum2{0.0};
-        size_t globcount{0};
+        if( bdistributed ){
+            double globsum1{0.0}, globsum2{0.0};
+            size_t globcount{0};

-        MPI_Allreduce(reinterpret_cast<const void *>(&sum1),
-                      reinterpret_cast<void *>(&globsum1),
-                      1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+            MPI_Allreduce(reinterpret_cast<const void *>(&sum1),
+                        reinterpret_cast<void *>(&globsum1),
+                        1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

-        MPI_Allreduce(reinterpret_cast<const void *>(&sum2),
-                      reinterpret_cast<void *>(&globsum2),
-                      1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+            MPI_Allreduce(reinterpret_cast<const void *>(&sum2),
+                        reinterpret_cast<void *>(&globsum2),
+                        1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

-        MPI_Allreduce(reinterpret_cast<const void *>(&count),
-                      reinterpret_cast<void *>(&globcount),
-                      1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
+            MPI_Allreduce(reinterpret_cast<const void *>(&count),
+                        reinterpret_cast<void *>(&globcount),
+                        1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

-        sum1 = globsum1;
-        sum2 = globsum2;
-        count = globcount;
+            sum1 = globsum1;
+            sum2 = globsum2;
+            count = globcount;
+        }
 #endif
        sum1 /= count;
        sum2 /= count;

-        return std::sqrt(sum2 - sum1 * sum1);
+        return real_t(std::sqrt(sum2 - sum1 * sum1));
    }

-    double mean(void)
+    real_t mean(void) const
    {
        double sum1{0.0};
        size_t count{0};

-#pragma omp parallel for reduction(+ \
-                                   : sum1)
+        #pragma omp parallel for reduction(+ : sum1)
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -445,32 +526,34 @@ public:
        count = sizes_[0] * sizes_[1] * sizes_[2];

 #ifdef USE_MPI
-        double globsum1{0.0};
-        size_t globcount{0};
+        if( bdistributed ){
+            double globsum1{0.0};
+            size_t globcount{0};

-        MPI_Allreduce(reinterpret_cast<const void *>(&sum1),
-                      reinterpret_cast<void *>(&globsum1),
-                      1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+            MPI_Allreduce(reinterpret_cast<const void *>(&sum1),
+                        reinterpret_cast<void *>(&globsum1),
+                        1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

-        MPI_Allreduce(reinterpret_cast<const void *>(&count),
-                      reinterpret_cast<void *>(&globcount),
-                      1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
+            MPI_Allreduce(reinterpret_cast<const void *>(&count),
+                        reinterpret_cast<void *>(&globcount),
+                        1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

-        sum1 = globsum1;
-        count = globcount;
+            sum1 = globsum1;
+            count = globcount;
+        }
 #endif

        sum1 /= count;

-        return sum1;
+        return real_t(sum1);
    }

    template <typename functional, typename grid_t>
    void assign_function_of_grids_r(const functional &f, const grid_t &g)
    {
-        assert(g.size(0) == size(0) && g.size(1) == size(1)); // && g.size(2) == size(2) );
+        assert(g.size(0) == size(0) && g.size(1) == size(1)); 

-#pragma omp parallel for
+        #pragma omp parallel for
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -489,10 +572,10 @@ public:
    template <typename functional, typename grid1_t, typename grid2_t>
    void assign_function_of_grids_r(const functional &f, const grid1_t &g1, const grid2_t &g2)
    {
-        assert(g1.size(0) == size(0) && g1.size(1) == size(1)); // && g1.size(2) == size(2));
-        assert(g2.size(0) == size(0) && g2.size(1) == size(1)); // && g2.size(2) == size(2));
+        assert(g1.size(0) == size(0) && g1.size(1) == size(1)); 
+        assert(g2.size(0) == size(0) && g2.size(1) == size(1)); 

-#pragma omp parallel for
+        #pragma omp parallel for
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -518,7 +601,7 @@ public:
        assert(g2.size(0) == size(0) && g2.size(1) == size(1)); // && g2.size(2) == size(2));
        assert(g3.size(0) == size(0) && g3.size(1) == size(1)); // && g3.size(2) == size(2));

-#pragma omp parallel for
+        #pragma omp parallel for
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -543,7 +626,7 @@ public:
    {
        assert(g.size(0) == size(0) && g.size(1) == size(1)); // && g.size(2) == size(2) );

-#pragma omp parallel for
+        #pragma omp parallel for
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -565,7 +648,7 @@ public:
        assert(g1.size(0) == size(0) && g1.size(1) == size(1)); // && g.size(2) == size(2) );
        assert(g2.size(0) == size(0) && g2.size(1) == size(1)); // && g.size(2) == size(2) );

-#pragma omp parallel for
+        #pragma omp parallel for
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -582,18 +665,39 @@ public:
        }
    }

-    template <typename functional, typename grid1_t, typename grid2_t>
-    void assign_function_of_grids_kdep(const functional &f, const grid1_t &g1, const grid2_t &g2)
+    template <typename functional, typename grid_t>
+    void assign_function_of_grids_kdep(const functional &f, const grid_t &g)
    {
-        assert(g1.size(0) == size(0) && g1.size(1) == size(1)); // && g.size(2) == size(2) );
-        assert(g2.size(0) == size(0) && g2.size(1) == size(1)); // && g.size(2) == size(2) );
+        assert(g.size(0) == size(0) && g.size(1) == size(1)); // && g.size(2) == size(2) );

-#pragma omp parallel for
+        #pragma omp parallel for
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
            {
                for (size_t k = 0; k < sizes_[2]; ++k)
+                {
+                    auto &elem = this->kelem(i, j, k);
+                    const auto &elemg = g.kelem(i, j, k);
+
+                    elem = f(this->get_k<real_t>(i, j, k), elemg);
+                }
+            }
+        }
+    }
+
+    template <typename functional, typename grid1_t, typename grid2_t>
+    void assign_function_of_grids_kdep(const functional &f, const grid1_t &g1, const grid2_t &g2)
+    {
+        assert(g1.size(0) == size(0) && g1.size(1) == size(1) && g1.size(2) == size(2) );
+        assert(g2.size(0) == size(0) && g2.size(1) == size(1) && g2.size(2) == size(2) );
+
+        #pragma omp parallel for
+        for (size_t i = 0; i < size(0); ++i)
+        {
+            for (size_t j = 0; j < size(1); ++j)
+            {
+                for (size_t k = 0; k < size(2); ++k)
                {
                    auto &elem = this->kelem(i, j, k);
                    const auto &elemg1 = g1.kelem(i, j, k);
@ -608,7 +712,7 @@ public:
    template <typename functional>
    void apply_function_k_dep(const functional &f)
    {
-#pragma omp parallel for
+        #pragma omp parallel for
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -625,7 +729,7 @@ public:
    template <typename functional>
    void apply_function_r_dep(const functional &f)
    {
-#pragma omp parallel for
+        #pragma omp parallel for
        for (size_t i = 0; i < sizes_[0]; ++i)
        {
            for (size_t j = 0; j < sizes_[1]; ++j)
@ -649,48 +753,31 @@ public:

    void Write_to_HDF5(std::string fname, std::string datasetname) const;

+    void Read_from_HDF5( std::string fname, std::string datasetname );
+
    void Write_PowerSpectrum(std::string ofname);

    void Compute_PowerSpectrum(std::vector<double> &bin_k, std::vector<double> &bin_P, std::vector<double> &bin_eP, std::vector<size_t> &bin_count);

    void Write_PDF(std::string ofname, int nbins = 1000, double scale = 1.0, double rhomin = 1e-3, double rhomax = 1e3);

-    // void stagger_field(void)
-    // {
-    //     FourierTransformForward();
-    //     apply_function_k_dep([&](auto x, auto k) -> ccomplex_t {
-    //         real_t shift = k[0] * get_dx()[0] + k[1] * get_dx()[1] + k[2] * get_dx()[2];
-    //         return x * std::exp(ccomplex_t(0.0, 0.5 * shift));
-    //     });
-    //     FourierTransformBackward();
-    // }
-
-    void shift_field( double sx, double sy, double sz )
+    void shift_field( const vec3_t<real_t>& s, bool transform_back=true )
    {
        FourierTransformForward();
        apply_function_k_dep([&](auto x, auto k) -> ccomplex_t {
-#ifdef WITH_MPI
-            real_t shift = sy * k[0] * get_dx()[0] + sx * k[1] * get_dx()[1] + sz * k[2] * get_dx()[2];
-#else
-            real_t shift = sx * k[0] * get_dx()[0] + sy * k[1] * get_dx()[1] + sz * k[2] * get_dx()[2];
-#endif
+            real_t shift = s.x * k[0] * get_dx()[0] + s.y * k[1] * get_dx()[1] + s.z * k[2] * get_dx()[2];
            return x * std::exp(ccomplex_t(0.0, shift));
        });
-        FourierTransformBackward();
-    }
-
-    void stagger_field(void)
-    {
-        this->shift_field( 0.5, 0.5, 0.5 );
+        if( transform_back ){
+            FourierTransformBackward();
+        }
    }

    void zero_DC_mode(void)
    {
        if (space_ == kspace_id)
        {
-#ifdef USE_MPI
-            if (CONFIG::MPI_task_rank == 0)
-#endif
+            if (CONFIG::MPI_task_rank == 0 || !bdistributed )
                cdata_[0] = (data_t)0.0;
        }
        else
@ -707,12 +794,14 @@ public:
                    }
                }
            }
+            if( bdistributed ){
 #if defined(USE_MPI)
-            data_t glob_sum = 0.0;
-            MPI_Allreduce(reinterpret_cast<void *>(&sum), reinterpret_cast<void *>(&glob_sum),
-                          1, GetMPIDatatype<data_t>(), MPI_SUM, MPI_COMM_WORLD);
-            sum = glob_sum;
+                data_t glob_sum = 0.0;
+                MPI_Allreduce(reinterpret_cast<void *>(&sum), reinterpret_cast<void *>(&glob_sum),
+                            1, MPI::get_datatype<data_t>(), MPI_SUM, MPI_COMM_WORLD);
+                sum = glob_sum;
 #endif
+            }
            sum /= sizes_[0] * sizes_[1] * sizes_[2];

 #pragma omp parallel for
--- a/include/grid_interpolate.hh
+++ b/include/grid_interpolate.hh
@ -0,0 +1,191 @@
+#pragma once
+
+#include <array>
+#include <vector>
+
+#include <general.hh>
+
+#include <math/vec3.hh>
+
+template <int interp_order, typename grid_t>
+struct grid_interpolate
+{
+  using data_t = typename grid_t::data_t;
+  using vec3 = std::array<real_t, 3>;
+
+  static constexpr bool is_distributed_trait = grid_t::is_distributed_trait;
+  static constexpr int interpolation_order = interp_order;
+
+  std::vector<data_t> boundary_;
+  std::vector<int> local0starts_;
+  const grid_t &gridref;
+  size_t nx_, ny_, nz_;
+
+  //! bind the interpolator to grid g, cache its three dimensions and,
+  //! for distributed grids only, exchange the ghost planes right away
+  explicit grid_interpolate(const grid_t &g)
+      : gridref(g), nx_(g.n_[0]), ny_(g.n_[1]), nz_(g.n_[2])
+  {
+    static_assert(0 <= interpolation_order && interpolation_order <= 2, "Interpolation order needs to be 0 (NGP), 1 (CIC), or 2 (TSC).");
+
+    if (!is_distributed_trait)
+    {
+      return;
+    }
+    update_ghosts( g );
+  }
+
+  //! Gather the slab offset (local_0_start_) of every task into local0starts_ and
+  //! replace boundary_ with the leading (interpolation_order + 1) x-planes received
+  //! from the task of next-higher rank. The body is empty unless USE_MPI is defined.
+  void update_ghosts( const grid_t &g )
+  {
+  #if defined(USE_MPI)
+
+    const int num_tasks = MPI::get_size();
+    const int this_task = MPI::get_rank();
+
+    // first global x-index held by each task, indexed by rank (consumed by get_task)
+    int my_start = static_cast<int>(gridref.local_0_start_);
+    local0starts_.assign(num_tasks, 0);
+
+    MPI_Allgather(&my_start, 1, MPI_INT, local0starts_.data(), 1, MPI_INT, MPI_COMM_WORLD);
+
+    //... pack our own leading planes into the exchange buffer
+    const size_t planes = interpolation_order + 1;
+    const size_t ny = g.n_[1];
+    const size_t nz = g.n_[2];
+
+    boundary_.assign(planes * ny * nz, data_t{0.0});
+
+    // slot runs through (i * ny + j) * nz + k in exactly the order of the nested loops
+    size_t slot = 0;
+    for (size_t i = 0; i < planes; ++i)
+    {
+      for (size_t j = 0; j < ny; ++j)
+      {
+        for (size_t k = 0; k < nz; ++k)
+        {
+          boundary_[slot++] = g.relem(i, j, k);
+        }
+      }
+    }
+
+    // send to the previous rank, receive from the next rank (wrapping around in rank)
+    const int sendto = (this_task + num_tasks - 1) % num_tasks;
+    const int recvfrom = (this_task + num_tasks + 1) % num_tasks;
+
+    MPI_Status status;
+    status.MPI_ERROR = MPI_SUCCESS;
+
+    // the message tag is the sender's rank + 1000 on both sides of the exchange
+    const int err = MPI_Sendrecv_replace(boundary_.data(), planes * ny * nz, MPI::get_datatype<data_t>(), sendto,
+                          this_task + 1000, recvfrom, recvfrom + 1000, MPI_COMM_WORLD, &status);
+
+    if( err != MPI_SUCCESS ){
+      // the failure is only logged; execution continues
+      char errstr[256]; int errlen=256;
+      MPI_Error_string(err, errstr, &errlen );
+      music::elog << "MPI_ERROR #" << err << " : " << errstr << std::endl;
+    }
+#endif
+  }
+
+  //! Nearest-grid-point lookup: truncates pos to integer cell indices and returns
+  //! the value of that cell. The x index is offset by local_0_start_ to address the
+  //! local slab. The val argument is not used by this function.
+  data_t get_ngp_at(const std::array<real_t, 3> &pos, std::vector<data_t> &val) const noexcept
+  {
+    const size_t cell_x = static_cast<size_t>(pos[0]);
+    const size_t cell_y = static_cast<size_t>(pos[1]);
+    const size_t cell_z = static_cast<size_t>(pos[2]);
+    const auto local_x = cell_x - gridref.local_0_start_;
+    return gridref.relem(local_x, cell_y, cell_z);
+  }
+
+  //! Cloud-in-cell (trilinear) interpolation at pos, which is interpreted as
+  //! fractional global grid indices. y and z wrap periodically; in x the upper
+  //! neighbour plane is taken from the ghost buffer boundary_ when it lies beyond
+  //! the local slab of a distributed grid, and wraps periodically otherwise.
+  data_t get_cic_at(const std::array<real_t, 3> &pos) const noexcept
+  {
+    // lower cell index (by truncation) and the fractional offsets / complements
+    const size_t ix = static_cast<size_t>(pos[0]);
+    const size_t iy = static_cast<size_t>(pos[1]);
+    const size_t iz = static_cast<size_t>(pos[2]);
+    const real_t dx = pos[0] - real_t(ix), tx = 1.0 - dx;
+    const real_t dy = pos[1] - real_t(iy), ty = 1.0 - dy;
+    const real_t dz = pos[2] - real_t(iz), tz = 1.0 - dz;
+    const size_t iy1 = (iy + 1) % ny_;
+    const size_t iz1 = (iz + 1) % nz_;
+
+    data_t val{0.0};
+
+    // accumulate the four (y,z) corners of one x-plane, weighted by wx;
+    // the order of the additions is fixed so that the rounding is reproducible
+    auto add_plane = [&](auto cell, real_t wx) {
+      val += cell(iy, iz) * wx * ty * tz;
+      val += cell(iy, iz1) * wx * ty * dz;
+      val += cell(iy1, iz) * wx * dy * tz;
+      val += cell(iy1, iz1) * wx * dy * dz;
+    };
+
+    if( is_distributed_trait ){
+      const ptrdiff_t localix = ix-gridref.local_0_start_;
+      add_plane([&](size_t j, size_t k) { return gridref.relem(localix, j, k); }, tx);
+
+      if( localix+1 >= gridref.local_0_size_ ){
+        // upper plane is owned by the neighbouring task: read it from the ghost buffer
+        const size_t ghostix = localix+1 - gridref.local_0_size_;
+        add_plane([&](size_t j, size_t k) { return boundary_[(ghostix*ny_+j)*nz_+k]; }, dx);
+      }else{
+        const size_t localix1 = localix+1;
+        add_plane([&](size_t j, size_t k) { return gridref.relem(localix1, j, k); }, dx);
+      }
+    }else{
+      const size_t ix1 = (ix + 1) % nx_;
+      add_plane([&](size_t j, size_t k) { return gridref.relem(ix, j, k); }, tx);
+      add_plane([&](size_t j, size_t k) { return gridref.relem(ix1, j, k); }, dx);
+    }
+    return val;
+  }
+
+  // data_t get_tsc_at(const std::array<real_t, 3> &pos, std::vector<data_t> &val) const
+  // {
+  // }
+
+  //! Index of the last entry of local0starts_ that is <= int(x[0]), i.e. the task
+  //! whose slab starts at or below that x-plane. Yields -1 when no entry qualifies
+  //! (for instance when local0starts_ is empty).
+  //! NOTE(review): relies on local0starts_ being sorted ascending -- confirm for the decomposition in use.
+  int get_task(const vec3 &x) const noexcept
+  {
+    const int plane = int(x[0]);
+    const auto first = local0starts_.begin();
+    const auto past = std::upper_bound(first, local0starts_.end(), plane);
+    return std::distance(first, past)-1;
+  }
+
+  //! Redistribute positions between tasks: on return pos holds the positions that
+  //! every task (including this one) assigned to this task via get_task().
+  //! Does nothing for non-distributed grids or when USE_MPI is not defined.
+  //! All counts and offsets handed to MPI are in units of real_t (three per position).
+  void domain_decompose_pos(std::vector<vec3> &pos) const noexcept
+  {
+    if (is_distributed_trait)
+    {
+#if defined(USE_MPI)
+      const int num_tasks = MPI::get_size();
+
+      // group by destination task so that each task's share is contiguous in the send buffer
+      std::sort(pos.begin(), pos.end(), [&](const vec3 &x1, const vec3 &x2) { return get_task(x1) < get_task(x2); });
+
+      std::vector<int> sendcounts(num_tasks, 0), sendoffsets(num_tasks, 0);
+      std::vector<int> recvcounts(num_tasks, 0), recvoffsets(num_tasks, 0);
+      for (const auto &x : pos)
+      {
+        sendcounts[get_task(x)] += 3;
+      }
+
+      // tell every task how much it will receive from us, and learn the same from them
+      MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD);
+
+      // exclusive prefix sums give the buffer offsets; also total up what we receive
+      size_t tot_receive = recvcounts[0];
+      for (int i = 1; i < num_tasks; ++i)
+      {
+        sendoffsets[i] = sendcounts[i - 1] + sendoffsets[i - 1];
+        recvoffsets[i] = recvcounts[i - 1] + recvoffsets[i - 1];
+        tot_receive += recvcounts[i];
+      }
+
+      std::vector<vec3> recvbuf(tot_receive/3,{0.,0.,0.});
+
+      // data() rather than &v[0]: a task may have nothing to send or nothing to
+      // receive, and indexing element 0 of an empty vector is undefined behaviour
+      MPI_Alltoallv(pos.data(), sendcounts.data(), sendoffsets.data(), MPI::get_datatype<real_t>(),
+                    recvbuf.data(), recvcounts.data(), recvoffsets.data(), MPI::get_datatype<real_t>(), MPI_COMM_WORLD);
+
+      pos.swap( recvbuf );
+#endif
+    }
+  }
+
+  //! Fourier-space factor for wave vector k: exp(i * 0.5 * sum_d k[d]*dx[d]) divided by
+  //! (sinc_x * sinc_y * sinc_z)^(1 + interpolation_order), where
+  //! sinc_d = sinc(0.5 * pi * k[d] / kny_[d]).
+  ccomplex_t compensation_kernel( const vec3_t<real_t>& k ) const noexcept
+  {
+    // sin(x)/x, with values of |x| <= 1e-10 mapped to 1
+    auto sinc = []( real_t x ){ return (std::abs(x)>1e-10)? std::sin(x)/x : 1.0; };
+
+    const auto half_pi = 0.5*M_PI;
+    const real_t window_x = sinc(half_pi*k[0]/gridref.kny_[0]);
+    const real_t window_y = sinc(half_pi*k[1]/gridref.kny_[1]);
+    const real_t window_z = sinc(half_pi*k[2]/gridref.kny_[2]);
+    const real_t window = std::pow(window_x*window_y*window_z,1+interpolation_order);
+
+    const auto &cell = gridref.get_dx();
+    const real_t phase = 0.5 * k[0] * cell[0] + 0.5 * k[1] * cell[1] + 0.5 * k[2] * cell[2];
+
+    return std::exp(ccomplex_t(0.0, phase)) / window;
+  }
+
+};
--- a/include/ic_generator.hh
+++ b/include/ic_generator.hh
@ -9,12 +9,12 @@

 namespace ic_generator{

-    int Run( ConfigFile& the_config );
+    int Run( config_file& the_config );
    
-    int Initialise( ConfigFile& the_config );
+    int Initialise( config_file& the_config );

    extern std::unique_ptr<RNG_plugin> the_random_number_generator;
    extern std::unique_ptr<output_plugin> the_output_plugin;
-    extern std::unique_ptr<CosmologyCalculator>  the_cosmo_calc;
+    extern std::unique_ptr<cosmology::calculator>  the_cosmo_calc;

 }
--- a/include/logger.hh
+++ b/include/logger.hh
@ -6,35 +6,35 @@
 #include <fstream>
 #include <iostream>

-namespace csoca {
+namespace music {

-enum LogLevel : int {
-  Off     = 0,
-  Fatal   = 1,
-  Error   = 2,
-  Warning = 3,
-  Info    = 4,
-  Debug   = 5
+enum log_level : int {
+  off     = 0,
+  fatal   = 1,
+  error   = 2,
+  warning = 3,
+  info    = 4,
+  debug   = 5
 };

-class Logger {
+class logger {
 private:
-  static LogLevel log_level_;
+  static log_level log_level_;
  static std::ofstream output_file_;

 public:
-  Logger()  = default;
-  ~Logger() = default;
+  logger()  = default;
+  ~logger() = default;

-  static void SetLevel(const LogLevel &level);
-  static LogLevel GetLevel();
+  static void set_level(const log_level &level);
+  static log_level get_level();

-  static void SetOutput(const std::string filename);
-  static void UnsetOutput();
+  static void set_output(const std::string filename);
+  static void unset_output();

-  static std::ofstream &GetOutput();
+  static std::ofstream &get_output();

-  template <typename T> Logger &operator<<(const T &item) {
+  template <typename T> logger &operator<<(const T &item) {
    std::cout << item;
    if (output_file_.is_open()) {
      output_file_ << item;
@ -42,7 +42,7 @@ public:
    return *this;
  }

-  Logger &operator<<(std::ostream &(*fp)(std::ostream &)) {
+  logger &operator<<(std::ostream &(*fp)(std::ostream &)) {
    std::cout << fp;
    if (output_file_.is_open()) {
      output_file_ << fp;
@ -51,32 +51,32 @@ public:
  }
 };

-class LogStream {
+class log_stream {
 private:
-  Logger &logger_;
-  LogLevel stream_level_;
+  logger &logger_;
+  log_level stream_level_;
  std::string line_prefix_, line_postfix_;

  bool newline;

 public:
-  LogStream(Logger &logger, const LogLevel &level)
+  log_stream(logger &logger, const log_level &level)
    : logger_(logger), stream_level_(level), newline(true) {
    switch (stream_level_) {
-      case LogLevel::Fatal:
+      case log_level::fatal:
        line_prefix_ = "\033[31mFatal : ";
        break;
-      case LogLevel::Error:
+      case log_level::error:
        line_prefix_ = "\033[31mError : ";
        break;
-      case LogLevel::Warning:
+      case log_level::warning:
        line_prefix_ = "\033[33mWarning : ";
        break;
-      case LogLevel::Info:
+      case log_level::info:
        //line_prefix_ = " | Info    | ";
        line_prefix_ = " \033[0m";
        break;
-      case LogLevel::Debug:
+      case log_level::debug:
        line_prefix_ = "Debug : \033[0m";
        break;
      default:
@ -85,14 +85,14 @@ public:
    }
    line_postfix_ = "\033[0m";
  }
-  ~LogStream() = default;
+  ~log_stream() = default;

  inline std::string GetPrefix() const {
    return line_prefix_;
  }

-  template <typename T> LogStream &operator<<(const T &item) {
-    if (Logger::GetLevel() >= stream_level_) {
+  template <typename T> log_stream &operator<<(const T &item) {
+    if (logger::get_level() >= stream_level_) {
      if (newline) {
        logger_ << line_prefix_;
        newline = false;
@ -102,8 +102,8 @@ public:
    return *this;
  }

-  LogStream &operator<<(std::ostream &(*fp)(std::ostream &)) {
-    if (Logger::GetLevel() >= stream_level_) {
+  log_stream &operator<<(std::ostream &(*fp)(std::ostream &)) {
+    if (logger::get_level() >= stream_level_) {
      logger_ << fp;
      logger_ << line_postfix_;
      newline = true;
@ -125,11 +125,11 @@ public:
 };

 // global instantiations for different levels
-extern Logger glogger;
-extern LogStream flog;
-extern LogStream elog;
-extern LogStream wlog;
-extern LogStream ilog;
-extern LogStream dlog;
+extern logger glogger;
+extern log_stream flog;
+extern log_stream elog;
+extern log_stream wlog;
+extern log_stream ilog;
+extern log_stream dlog;

-} // namespace csoca
+} // namespace music
--- a/include/math/interpolate.hh
+++ b/include/math/interpolate.hh
@ -0,0 +1,68 @@
+#pragma once
+
+#include <vector>
+#include <cassert>
+#include <gsl/gsl_spline.h>
+#include <gsl/gsl_errno.h>
+
+//! Cubic-spline interpolation of tabulated (x,y) data through GSL.
+//!
+//! Template flags:
+//!   logx     - tabulated x values and evaluation points are mapped through std::log
+//!   logy     - tabulated y values are mapped through std::log, results through std::exp
+//!   periodic - use gsl_interp_cspline_periodic instead of gsl_interp_cspline
+//!              (must not be combined with logx)
+//!
+//! The object owns raw GSL handles, so it can be neither copy-constructed nor copy-assigned.
+template <bool logx, bool logy, bool periodic>
+class interpolated_function_1d
+{
+
+private:
+  bool isinit_;
+  std::vector<double> data_x_, data_y_;
+  gsl_interp_accel *gsl_ia_{nullptr};
+  gsl_spline *gsl_sp_{nullptr};
+
+  //! release the GSL objects and return to the uninitialised state
+  void deallocate()
+  {
+    gsl_spline_free(gsl_sp_);
+    gsl_interp_accel_free(gsl_ia_);
+    gsl_sp_ = nullptr;
+    gsl_ia_ = nullptr;
+    isinit_ = false;
+  }
+
+public:
+  interpolated_function_1d(const interpolated_function_1d &) = delete;
+
+  // deleting only the copy constructor still leaves an implicitly generated copy
+  // assignment, which would duplicate the raw handles and free them twice
+  interpolated_function_1d &operator=(const interpolated_function_1d &) = delete;
+
+  //! create an empty interpolator; set_data() must be called before evaluation
+  interpolated_function_1d() : isinit_(false){}
+
+  //! create an interpolator from tabulated data, see set_data()
+  interpolated_function_1d(const std::vector<double> &data_x, const std::vector<double> &data_y)
+  : isinit_(false)
+  {
+    this->set_data( data_x, data_y );
+  }
+
+  ~interpolated_function_1d()
+  {
+    if (isinit_) this->deallocate();
+  }
+
+  //! (Re-)build the spline from the tables data_x / data_y.
+  //! Both tables need the same length and more than 5 entries (checked by assert only).
+  //! NOTE(review): GSL expects strictly increasing x values; this is not verified here.
+  void set_data(const std::vector<double> &data_x, const std::vector<double> &data_y)
+  {
+    data_x_ = data_x;
+    data_y_ = data_y;
+    
+    assert(data_x_.size() == data_y_.size());
+    assert(data_x_.size() > 5);
+    assert(!(logx && periodic));
+
+    if (logx) for (auto &d : data_x_) d = std::log(d);
+    if (logy) for (auto &d : data_y_) d = std::log(d);
+
+    // drop the spline of a previous call before allocating the new one
+    if (isinit_) this->deallocate();
+
+    gsl_ia_ = gsl_interp_accel_alloc();
+    gsl_sp_ = gsl_spline_alloc(periodic ? gsl_interp_cspline_periodic : gsl_interp_cspline, data_x_.size());
+    gsl_spline_init(gsl_sp_, &data_x_[0], &data_y_[0], data_x_.size());
+
+    isinit_ = true;
+  }
+
+  //! Evaluate the interpolant at x. Requires prior set_data(), and x > 0 when logx is set
+  //! (both checked by assert only).
+  double operator()(double x) const noexcept
+  {
+    assert( isinit_ && !(logx&&x<=0.0) );
+    double xa = logx ? std::log(x) : x;
+    double y(gsl_spline_eval(gsl_sp_, xa, gsl_ia_));
+    return logy ? std::exp(y) : y;
+  }
+};
--- a/include/math/mat3.hh
+++ b/include/math/mat3.hh
@ -0,0 +1,146 @@
+#include <gsl/gsl_math.h>
+#include <gsl/gsl_eigen.h>
+
+#include <math/vec3.hh>
+
+//! 3x3 matrix of scalar type T, stored row-major in a flat array of nine elements,
+//! with an eigen-decomposition for symmetric matrices provided through GSL.
+//! The GSL work space is allocated lazily on the first call to eigen() and is
+//! never shared between objects: copies start without a work space of their own.
+template<typename T>
+class mat3_t{
+protected:
+    std::array<T,9> data_;
+    gsl_matrix_view m_;
+    gsl_vector *eval_{nullptr};
+    gsl_matrix *evec_{nullptr};
+    gsl_eigen_symmv_workspace * wsp_{nullptr};
+    bool bdid_alloc_gsl_;
+
+    void init_gsl(){
+        // allocate memory for GSL operations if we haven't done so yet
+        if( !bdid_alloc_gsl_ )
+        {
+            m_ = gsl_matrix_view_array (&data_[0], 3, 3);
+            eval_ = gsl_vector_alloc (3);
+            evec_ = gsl_matrix_alloc (3, 3);
+            wsp_ = gsl_eigen_symmv_alloc (3);
+            bdid_alloc_gsl_ = true;
+        }
+    }
+
+    void free_gsl(){
+        // free memory for GSL operations if it was allocated
+        if( bdid_alloc_gsl_ )
+        {
+            gsl_eigen_symmv_free (wsp_);
+            gsl_vector_free (eval_);
+            gsl_matrix_free (evec_);
+            wsp_ = nullptr;
+            eval_ = nullptr;
+            evec_ = nullptr;
+            bdid_alloc_gsl_ = false;
+        }
+    }
+
+public:
+
+    //! default constructor, elements are not initialised explicitly
+    mat3_t()
+    : bdid_alloc_gsl_(false) 
+    {}
+
+    //! copy constructor
+    mat3_t( const mat3_t<T> &m)
+    : data_(m.data_), bdid_alloc_gsl_(false) 
+    {}
+
+    //! copy constructor for non-const reference, needed to avoid variadic template being called for non-const reference
+    mat3_t( mat3_t<T> &m)
+    : data_(m.data_), bdid_alloc_gsl_(false) 
+    {}
+    
+    //! move constructor
+    mat3_t( mat3_t<T> &&m)
+    : data_(std::move(m.data_)), bdid_alloc_gsl_(false) 
+    {}
+
+    //! construct mat3_t from initializer list
+    template<typename ...E>
+    mat3_t(E&&...e) 
+    : data_{{std::forward<E>(e)...}}, bdid_alloc_gsl_(false)
+    {}
+
+    //! copy assignment, transfers the elements only (the GSL work space stays with its owner)
+    mat3_t<T>& operator=(const mat3_t<T>& m) noexcept{
+        data_ = m.data_;
+        return *this;
+    }
+
+    //! assignment from a (const) rvalue, transfers the elements only
+    mat3_t<T>& operator=(const mat3_t<T>&& m) noexcept{
+        data_ = std::move(m.data_);
+        return *this;
+    }
+
+    //! destructor
+    ~mat3_t(){
+        this->free_gsl();
+    }
+    
+    //! bracket index access to the flat, row-major element array
+    T &operator[](size_t i) noexcept { return data_[i];}
+    
+    //! const bracket index access to the flat, row-major element array
+    const T &operator[](size_t i) const noexcept { return data_[i]; }
+
+    //! matrix 2d index access
+    T &operator()(size_t i, size_t j) noexcept { return data_[3*i+j]; }
+
+    //! const matrix 2d index access
+    const T &operator()(size_t i, size_t j) const noexcept { return data_[3*i+j]; }
+
+    //! in-place addition
+    mat3_t<T>& operator+=( const mat3_t<T>& rhs ) noexcept{
+        for (size_t i = 0; i < 9; ++i) {
+           (*this)[i] += rhs[i];
+        }
+        return *this;
+    }
+
+    //! in-place subtraction
+    mat3_t<T>& operator-=( const mat3_t<T>& rhs ) noexcept{
+        for (size_t i = 0; i < 9; ++i) {
+           (*this)[i] -= rhs[i];
+        }
+        return *this;
+    }
+
+    //! set all elements to zero
+    void zero() noexcept{
+        for (size_t i = 0; i < 9; ++i) data_[i]=0;
+    }
+
+    //! Eigen-decomposition of the matrix, treated as symmetric.
+    //! evals receives the eigenvalues in ascending order; evec1, evec2, evec3_t receive
+    //! the eigenvectors belonging to evals[0], evals[1], evals[2] respectively.
+    //! The matrix itself is left unchanged.
+    void eigen( vec3_t<T>& evals, vec3_t<T>& evec1, vec3_t<T>& evec2, vec3_t<T>& evec3_t )
+    {
+        this->init_gsl();
+
+        // gsl_eigen_symmv destroys the diagonal and lower triangle of its input,
+        // so decompose a scratch copy instead of the stored elements
+        double scratch[9];
+        for( size_t i=0; i<9; ++i ) scratch[i] = data_[i];
+        gsl_matrix_view work = gsl_matrix_view_array (scratch, 3, 3);
+
+        gsl_eigen_symmv (&work.matrix, eval_, evec_, wsp_);
+        gsl_eigen_symmv_sort (eval_, evec_, GSL_EIGEN_SORT_VAL_ASC);
+
+        // eigenvectors are stored as the columns of evec_
+        for( int i=0; i<3; ++i ){
+            evals[i] = gsl_vector_get( eval_, i );
+            evec1[i] = gsl_matrix_get( evec_, i, 0 );
+            evec2[i] = gsl_matrix_get( evec_, i, 1 );
+            evec3_t[i] = gsl_matrix_get( evec_, i, 2 );
+        }
+    }
+};
+
+//! element-wise sum of two matrices
+// not constexpr: mat3_t has a non-trivial destructor and therefore is not a literal type;
+// returned as a non-const value so that the result can be moved from
+template<typename T>
+inline mat3_t<T> operator+(const mat3_t<T> &lhs, const mat3_t<T> &rhs) noexcept
+{
+    mat3_t<T> result(lhs);
+    result += rhs;
+    return result;
+}
+
+// matrix - vector multiplication: component `row` of the result is sum_col A(row,col) * v[col]
+template<typename T>
+inline vec3_t<T> operator*( const mat3_t<T> &A, const vec3_t<T> &v ) noexcept
+{
+    vec3_t<T> product;
+    for( size_t row=0; row<3; ++row ){
+        T acc;
+        acc = 0.0;
+        for( size_t col=0; col<3; ++col ){
+            acc += A(row,col)*v[col];
+        }
+        product[row] = acc;
+    }
+    return product;
+}
+
--- a/include/math/ode_integrate.hh
+++ b/include/math/ode_integrate.hh
@ -0,0 +1,103 @@
+#pragma once
+/*******************************************************************************\
+ ode_integrate.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
+ 
+ CHANGELOG (only majors, for details see repo):
+    06/2019 - Oliver Hahn - first implementation
+\*******************************************************************************/
+
+namespace ode_integrate
+{
+
+// classical Runge-Kutta 4th order step of size h without error estimate;
+// f is called as f(t, y), and both y and t are advanced in place
+template <typename vector_t, typename function_t>
+inline void rk4_step(double h, double &t, vector_t &y, function_t f)
+{
+    const double t_mid = t + h / 2;
+    const double t_end = t + h;
+
+    vector_t slope_a(h * f(t, y));
+    vector_t slope_b(h * f(t_mid, y + slope_a / 2));
+    vector_t slope_c(h * f(t_mid, y + slope_b / 2));
+    vector_t slope_d(h * f(t_end, y + slope_c));
+
+    y += (slope_a + 2 * slope_b + 2 * slope_c + slope_d) / 6;
+    t = t_end;
+}
+
+// Cash-Karp modified Runge-Kutta scheme, 5th order with 4th order error estimate
+// see Press & Teukolsky (1992): "Adaptive Stepsize Runge-Kutta Integration"
+// in Computers in Physics 6, 188 (1992); doi: 10.1063/1.4823060
+//
+// Advances y in place by one step of size h, with f called as f(t, y), and returns
+// the error estimate built from the dc* weights below.
+// Note that t is only read here; advancing it is left to the caller.
+template <typename vector_t, typename function_t>
+inline vector_t ckrk5_step(double h, double &t, vector_t &y, function_t f)
+{
+  // a*  : fractions of h at which the stages are evaluated
+  // b** : weights of the earlier stages entering each stage's argument
+  // c*  : weights of the stages in the update of y (c2 and c5 do not appear, i.e. are zero)
+  // dc* : difference between c* and a second weight set, used for the error estimate
+  static constexpr double
+      a2 = 0.20,
+      a3 = 0.30, a4 = 0.60, a5 = 1.0, a6 = 0.8750,
+      b21 = 0.20,
+      b31 = 3.0 / 40.0, b32 = 9.0 / 40.0,
+      b41 = 0.30, b42 = -0.90, b43 = 1.20,
+      b51 = -11.0 / 54.0, b52 = 2.50, b53 = -70.0 / 27.0, b54 = 35.0 / 27.0,
+      b61 = 1631.0 / 55296.0, b62 = 175.0 / 512.0, b63 = 575.0 / 13824.0, b64 = 44275.0 / 110592.0, b65 = 253.0 / 4096.0,
+      c1 = 37.0 / 378.0, c3 = 250.0 / 621.0, c4 = 125.0 / 594.0, c6 = 512.0 / 1771.0,
+      dc1 = c1 - 2825.0 / 27648.0, dc3 = c3 - 18575.0 / 48384.0,
+      dc4 = c4 - 13525.0 / 55296.0, dc5 = -277.0 / 14336.0, dc6 = c6 - 0.250;
+
+  // the six stages; each k already includes the factor h
+  vector_t k1(h * f(t, y));
+  vector_t k2(h * f(t + a2 * h, y + b21 * k1));
+  vector_t k3(h * f(t + a3 * h, y + b31 * k1 + b32 * k2));
+  vector_t k4(h * f(t + a4 * h, y + b41 * k1 + b42 * k2 + b43 * k3));
+  vector_t k5(h * f(t + a5 * h, y + b51 * k1 + b52 * k2 + b53 * k3 + b54 * k4));
+  vector_t k6(h * f(t + a6 * h, y + b61 * k1 + b62 * k2 + b63 * k3 + b64 * k4 + b65 * k5));
+
+  // update the state in place
+  y += c1 * k1 + c3 * k3 + c4 * k4 + c6 * k6;
+
+  // error estimate for the step just taken
+  return dc1 * k1 + dc3 * k3 + dc4 * k4 + dc5 * k5 + dc6 * k6;
+}
+
+// Adaptive step-size quality-controlled routine for ckrk5_step, see
+// Press & Teukolsky (1992): "Adaptive Stepsize Runge-Kutta Integration"
+// in Computers in Physics 6, 188 (1992); doi: 10.1063/1.4823060
+//
+//   htry   : step size that is attempted first
+//   t, y   : independent variable and state, both advanced in place by the accepted step
+//   yscale : per-component scale against which the error estimate is measured
+//   f      : right-hand side, called as f(t, y)
+//   eps    : tolerance on max_i |yerr[i] / yscale[i]|
+//   hdid   : (out) step size that was actually taken
+//   hnext  : (out) suggested step size for the next call
+//
+// The step is retried with a smaller h until the scaled error is at most 1.
+// Aborts the program if h shrinks so far that t + h == t.
+template <typename vector_t, typename function_t>
+inline void rk_step_qs(double htry, double &t, vector_t &y, vector_t &yscale, function_t f, double eps, double &hdid, double &hnext)
+{
+  static constexpr double SAFETY{0.9};
+  static constexpr double PSHRNK{-0.25};
+  static constexpr double PGROW{-0.2};
+  static constexpr double ERRCON{1.89e-4};
+
+  auto h(htry);
+
+  for (;;)
+  {
+    // every trial has to start from the unmodified state: ckrk5_step advances its
+    // argument in place, so re-using the buffer of a rejected trial would integrate
+    // from the wrong starting point
+    vector_t ytemp(y);
+    vector_t yerr(ckrk5_step(h, t, ytemp, f));
+
+    // largest error component relative to its scale, in units of the tolerance
+    double errmax = 0.0;
+    for (size_t i = 0; i < yerr.size(); ++i)
+    {
+      errmax = std::max(errmax, std::abs(yerr[i] / yscale[i]));
+    }
+    errmax = errmax / eps;
+
+    if (errmax > 1.0)
+    {
+      // rejected: shrink the step, but by no more than a factor of ten, and retry
+      h *= std::max(0.1, SAFETY*std::pow(errmax, PSHRNK));
+      if (t + h == t)
+      {
+        std::cerr << "stepsize underflow in rkqs" << std::endl;
+        abort();
+      }
+      continue;
+    }
+
+    // accepted: propose the next step size (growth is capped at a factor of five)
+    if( errmax > ERRCON ){
+      hnext = h * SAFETY * std::pow(errmax, PGROW);
+    }else{
+      hnext = 5*h;
+    }
+    hdid = h;
+    t += h;
+    y = ytemp;
+    return;
+  }
+}
+
+
+} // namespace ode_integrate
--- a/include/math/vec3.hh
+++ b/include/math/vec3.hh
@ -0,0 +1,118 @@
+/*******************************************************************\
+ vec3.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
+ 
+ CHANGELOG (only majors, for details see repo):
+    06/2019 - Oliver Hahn - first implementation
+\*******************************************************************/
+#pragma once
+
+//! simple 3-vector of arbitrary scalar type T
+//! the components are stored in data_; x, y and z are reference members that
+//! every constructor binds to data_[0], data_[1] and data_[2] of the same object
+template< typename T >
+class vec3_t{
+private:
+    //! component storage
+    std::array<T,3> data_;
+    
+public: 
+    //! named access to the three components
+    T &x,&y,&z;
+
+    //! default constructor, components are not initialised explicitly
+    vec3_t()
+    : x(data_[0]),y(data_[1]),z(data_[2]){}
+
+    //! copy constructor
+    vec3_t( const vec3_t<T> &v)
+    : data_(v.data_), x(data_[0]),y(data_[1]),z(data_[2]){}
+
+    //! copy constructor for non-const reference, needed to avoid variadic template being called for non-const reference
+    vec3_t( vec3_t<T>& v)
+    : data_(v.data_), x(data_[0]),y(data_[1]),z(data_[2]){}
+
+    //! move constructor
+    vec3_t( vec3_t<T> &&v)
+    : data_(std::move(v.data_)), x(data_[0]), y(data_[1]), z(data_[2]){}
+
+    //! construct vec3_t from a list of component values
+    template<typename ...E>
+    vec3_t(E&&...e) 
+    : data_{{std::forward<E>(e)...}}, x{data_[0]}, y{data_[1]}, z{data_[2]}
+    {}
+    
+    //! bracket index access to vector components
+    T &operator[](size_t i) noexcept{ return data_[i];}
+    
+    //! const bracket index access to vector components
+    const T &operator[](size_t i) const noexcept { return data_[i]; }
+
+    //! assignment copies the components only; x, y, z keep referring to our own storage
+    vec3_t<T>& operator=( const vec3_t<T>& v ) noexcept {
+        data_=v.data_;
+        return *this;
+    }
+
+    //! component-wise sum
+    vec3_t<T> operator+( const vec3_t<T>& v ) const noexcept{
+        return vec3_t<T>({x+v.x,y+v.y,z+v.z});
+    }
+
+    //! component-wise difference
+    vec3_t<T> operator-( const vec3_t<T>& v ) const noexcept{
+        return vec3_t<T>({x-v.x,y-v.y,z-v.z});
+    }
+
+    //! unary negative
+    vec3_t<T> operator-() const noexcept{
+        return vec3_t<T>({-x,-y,-z});
+    }
+
+    //! multiplication of every component by the scalar s
+    vec3_t<T> operator*( T s ) const noexcept{
+        return vec3_t<T>({x*s,y*s,z*s});
+    }
+
+    //! division of every component by the scalar s
+    vec3_t<T> operator/( T s ) const noexcept{
+        return vec3_t<T>({x/s,y/s,z/s});
+    }
+
+    //! in-place component-wise sum
+    vec3_t<T>& operator+=( const vec3_t<T>& v ) noexcept{
+        for( size_t i=0; i<3; ++i ) data_[i] += v.data_[i];
+        return *this;
+    }
+
+    //! in-place component-wise difference
+    vec3_t<T>& operator-=( const vec3_t<T>& v ) noexcept{
+        for( size_t i=0; i<3; ++i ) data_[i] -= v.data_[i];
+        return *this;
+    }
+
+    //! in-place multiplication by the scalar s
+    vec3_t<T>& operator*=( T s ) noexcept{
+        for( auto& c : data_ ) c *= s;
+        return *this;
+    }
+    
+    //! in-place division by the scalar s
+    vec3_t<T>& operator/=( T s ) noexcept{
+        for( auto& c : data_ ) c /= s;
+        return *this;
+    }
+
+    //! dot product with another vector
+    T dot(const vec3_t<T> &a) const noexcept
+    {
+        return x * a.x + y * a.y + z * a.z;
+    }
+    
+    //! 2-norm squared of the vector
+    T norm_squared(void) const noexcept { return dot(*this); }
+
+    //! 2-norm of the vector
+    T norm(void) const noexcept { return std::sqrt( norm_squared() ); }
+
+    //! wrap absolute vector to box of size p: every component c becomes fmod(2*p + c, p)
+    vec3_t<T>& wrap_abs( T p = 1.0 ) noexcept{
+        for( auto& c : data_ ){
+            c = std::fmod( 2*p + c, p );
+        }
+        return *this;
+    }
+
+    //! wrap relative vector to box of size p: components below -p/2 are shifted up by p,
+    //! components at or above p/2 are shifted down by p
+    vec3_t<T>& wrap_rel( T p = 1.0 ) noexcept{
+        for( auto& c : data_ ){
+            if( c<-p/2 ){
+                c = c+p;
+            }else if( c>=p/2 ){
+                c = c-p;
+            }
+        }
+        return *this;
+    }
+
+    //! lexicographic ordering by (x, y, z), allows 3d sorting of vec3_ts
+    bool operator<( const vec3_t<T>& o ) const noexcept{
+        if( x!=o.x ) return x<o.x;
+        if( y!=o.y ) return y<o.y;
+        return z<o.z;
+    }
+};
+
+//! scalar * vector, delegates to the member form vector * scalar
+template<typename T>
+vec3_t<T> operator*( T s, const vec3_t<T>& v ){
+    return v*s;
+}
--- a/include/operators.hh
+++ b/include/operators.hh
@ -1,9 +1,54 @@
 #pragma once
+/*
+ 
+ operators.hh - This file is part of MUSIC2 -
+ a code to generate multi-scale initial conditions 
+ for cosmological simulations 
+ 
+ Copyright (C) 2019  Oliver Hahn
+ 
+*/
+#include <general.hh>

 namespace op{
-inline auto assign_to = [](auto &g){return [&](auto i, auto v){ g[i] = v; };};
-inline auto add_to = [](auto &g){return [&](auto i, auto v){ g[i] += v; };};
-inline auto add_twice_to = [](auto &g){return [&](auto i, auto v){ g[i] += 2*v; };};
-inline auto subtract_from = [](auto &g){return [&](auto i, auto v){ g[i] -= v; };};
-inline auto subtract_twice_from = [](auto &g){return [&](auto i, auto v){ g[i] -= 2*v; };};
+
+//!== list of primitive operators to work on fields ==!//
+
+// Each factory below returns a functor taking (index, value) that updates element
+// `index` of the field g. The field is captured by reference and therefore has to
+// outlive the returned functor.
+
+//! functor performing g[i] = v
+template< typename field>
+inline auto assign_to( field& g )
+{
+    return [&g](auto idx, auto value){ g[idx] = value; };
+}
+
+//! functor performing g[i] += v*x, with the factor x captured by value
+template< typename field, typename val >
+inline auto multiply_add_to( field& g, val x )
+{
+    return [&g,x](auto idx, auto value){ g[idx] += value*x; };
+}
+
+//! functor performing g[i] += v
+template< typename field>
+inline auto add_to( field& g )
+{
+    return [&g](auto idx, auto value){ g[idx] += value; };
+}
+
+//! functor performing g[i] -= v
+template< typename field>
+inline auto subtract_from( field& g )
+{
+    return [&g](auto idx, auto value){ g[idx] -= value; };
+}
+
+//! vanilla standard gradient
+//! reads "BoxLength" and "GridRes" from the [setup] section of the configuration;
+//! the fundamental wave number is k0 = 2*pi / BoxLength
+class fourier_gradient{
+private:
+    real_t boxlen_, k0_;
+    size_t n_, nhalf_;
+public:
+    explicit fourier_gradient( const config_file& the_config )
+    : boxlen_( the_config.get_value<double>("setup", "BoxLength") ), 
+      k0_(2.0*M_PI/boxlen_),
+      n_( the_config.get_value<size_t>("setup","GridRes") ),
+      nhalf_( n_/2 )
+    {}
+
+    //! gradient factor i*k along dimension idim for the mode with grid indices ijk:
+    //! indices above n/2 are mapped to negative wave numbers, index n/2 itself yields zero
+    inline ccomplex_t gradient( const int idim, std::array<size_t,3> ijk ) const
+    {
+        const size_t mode = ijk[idim];
+        real_t rgrad = 0.0;
+        if( mode != nhalf_ ){
+            rgrad = real_t(mode) - real_t(mode > nhalf_) * n_;
+        }
+        return ccomplex_t(0.0,rgrad * k0_);
+    }
+
+    //! correction factor, always 1 for this gradient; the argument is ignored
+    inline real_t vfac_corr( std::array<size_t,3> ijk ) const
+    {
+        return 1.0;
+    }
+};
 }
--- a/include/output_plugin.hh
+++ b/include/output_plugin.hh
@ -21,11 +21,12 @@

 enum class output_type {particles,field_lagrangian,field_eulerian};

+
 class output_plugin
 {
 protected:
-	//! reference to the ConfigFile object that holds all configuration options
-	ConfigFile &cf_;
+	//! reference to the config_file object that holds all configuration options
+	config_file &cf_;

 	//! output file or directory name
 	std::string fname_;
@ -34,17 +35,17 @@ protected:
 	std::string interface_name_;
 public:
 	//! constructor
-	output_plugin(ConfigFile &cf, std::string interface_name )
+	output_plugin(config_file &cf, std::string interface_name )
 		: cf_(cf), interface_name_(interface_name)
 	{
-		fname_ = cf_.GetValue<std::string>("output", "filename");
+		fname_ = cf_.get_value<std::string>("output", "filename");
 	}

 	//! virtual destructor
 	virtual ~output_plugin(){}

 	//! routine to write particle data for a species
-	virtual void write_particle_data(const particle::container &pc, const cosmo_species &s ) {};
+	virtual void write_particle_data(const particle::container &pc, const cosmo_species &s, double Omega_species ) {};

 	//! routine to write gridded fluid component data for a species
 	virtual void write_grid_data(const Grid_FFT<real_t> &g, const cosmo_species &s, const fluid_component &c ) {};
@ -58,6 +59,12 @@ public:
 	//! routine to query whether species is written as particle data
 	// virtual bool write_species_as_particles( const cosmo_species &s ){ return !write_species_as_grid(s); }

+	//! query if output wants 64bit precision for real values
+	virtual bool has_64bit_reals() const = 0;
+
+	//! query if output wants 64bit precision for integer values
+	virtual bool has_64bit_ids() const = 0;
+	
 	//! routine to return a multiplicative factor that contains the desired position units for the output
 	virtual real_t position_unit() const = 0;

@ -71,7 +78,7 @@ public:
 struct output_plugin_creator
 {
 	//! create an instance of a plug-in
-	virtual std::unique_ptr<output_plugin> create(ConfigFile &cf) const = 0;
+	virtual std::unique_ptr<output_plugin> create(config_file &cf) const = 0;

 	//! destroy an instance of a plug-in
 	virtual ~output_plugin_creator() {}
@ -96,12 +103,12 @@ struct output_plugin_creator_concrete : public output_plugin_creator
 	}

 	//! create an instance of the plug-in
-	std::unique_ptr<output_plugin> create(ConfigFile &cf) const
+	std::unique_ptr<output_plugin> create(config_file &cf) const
 	{
 		return std::make_unique<Derived>(cf); // Derived( cf );
 	}
 };

 //! failsafe version to select the output plug-in
-std::unique_ptr<output_plugin> select_output_plugin(ConfigFile &cf);
+std::unique_ptr<output_plugin> select_output_plugin(config_file &cf);

--- a/include/particle_container.hh
+++ b/include/particle_container.hh
@ -1,3 +1,10 @@
+/*******************************************************************\
+ particle_container.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
+ 
+ CHANGELOG (only majors, for details see repo):
+    10/2019 - Oliver Hahn - first implementation
+\*******************************************************************/
 #pragma once

 #ifdef USE_MPI
@ -13,57 +20,96 @@ namespace particle{
 class container
 {
 public:
-	std::vector<float> positions_, velocities_;
-	std::vector<int> ids_;
+	std::vector<float > positions32_, velocities32_;
+	std::vector<double> positions64_, velocities64_;
 	
-	container()
-	{
-	}
+	std::vector<uint32_t> ids32_;
+	std::vector<uint64_t> ids64_;
+	
+
+	container(){ }

 	container(const container &) = delete;

-	const void* get_pos_ptr() const{
-		return reinterpret_cast<const void*>( &positions_[0] );
-	}
-
-	const void* get_vel_ptr() const{
-		return reinterpret_cast<const void*>( &velocities_[0] );
-	}
-
-	const void* get_ids_ptr() const{
-		return reinterpret_cast<const void*>( &ids_[0] );
-	}
-
-	void allocate(size_t nump)
+	void allocate(size_t nump, bool b64reals, bool b64ids)
 	{
-		positions_.resize(3 * nump);
-		velocities_.resize(3 * nump);
-		ids_.resize(nump);
+		if( b64reals ){
+			positions64_.resize(3 * nump);
+			velocities64_.resize(3 * nump);
+			positions32_.clear();
+			velocities32_.clear();
+		}else{
+			positions32_.resize(3 * nump);
+			velocities32_.resize(3 * nump);
+			positions64_.clear();
+			velocities64_.clear();
+		}
+
+		if( b64ids ){
+			ids64_.resize(nump);
+			ids32_.clear();
+		}else{
+			ids32_.resize(nump);
+			ids64_.clear();
+		}
 	}

-	void set_pos(size_t ipart, size_t idim, real_t p)
-	{
-		positions_[3 * ipart + idim] = p;
+	const void* get_pos32_ptr() const{
+		return reinterpret_cast<const void*>( &positions32_[0] );
 	}

-	void set_vel(size_t ipart, size_t idim, real_t p)
-	{
-		velocities_[3 * ipart + idim] = p;
+	void set_pos32(size_t ipart, size_t idim, float p){
+		positions32_[3 * ipart + idim] = p;
 	}

-	void set_id(size_t ipart, id_t id)
-	{
-		ids_[ipart] = id;
+	const void* get_pos64_ptr() const{
+		return reinterpret_cast<const void*>( &positions64_[0] );
+	}
+
+	inline void set_pos64(size_t ipart, size_t idim, double p){
+		positions64_[3 * ipart + idim] = p;
+	}
+
+	inline const void* get_vel32_ptr() const{
+		return reinterpret_cast<const void*>( &velocities32_[0] );
+	}
+	
+	inline void set_vel32(size_t ipart, size_t idim, float p){
+		velocities32_[3 * ipart + idim] = p;
+	}
+
+	const void* get_vel64_ptr() const{
+		return reinterpret_cast<const void*>( &velocities64_[0] );
+	}
+
+	inline void set_vel64(size_t ipart, size_t idim, double p){
+		velocities64_[3 * ipart + idim] = p;
+	}
+
+	const void* get_ids32_ptr() const{
+		return reinterpret_cast<const void*>( &ids32_[0] );
+	}
+
+	void set_id32(size_t ipart, uint32_t id){
+		ids32_[ipart] = id;
+	}
+
+	const void* get_ids64_ptr() const{
+		return reinterpret_cast<const void*>( &ids64_[0] );
+	}
+
+	void set_id64(size_t ipart, uint64_t id){
+		ids64_[ipart] = id;
 	}

 	size_t get_local_num_particles(void) const
 	{
-		return ids_.size();
+		return std::max(ids32_.size(),ids64_.size());
 	}

 	size_t get_global_num_particles(void) const
 	{
-		size_t local_nump = ids_.size(), global_nump;
+		size_t local_nump = this->get_local_num_particles(), global_nump;
 #ifdef USE_MPI
 		MPI_Allreduce(reinterpret_cast<void *>(&local_nump), reinterpret_cast<void *>(&global_nump), 1,
 					  MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
@ -97,11 +143,11 @@ public:

 	void dump(void)
 	{
-		for (size_t i = 0; i < ids_.size(); ++i)
+		/*for (size_t i = 0; i < ids_.size(); ++i)
 		{
 			std::cout << positions_[3 * i + 0] << " " << positions_[3 * i + 1] << " " << positions_[3 * i + 2] << " "
 					  << velocities_[3 * i + 0] << " " << velocities_[3 * i + 1] << " " << velocities_[3 * i + 2] << std::endl;
-		}
+		}*/ // body disabled: it still refers to ids_/positions_/velocities_, which were replaced by the 32/64-bit buffers; dump() is currently a no-op
 	}
 };

--- a/include/particle_generator.hh
+++ b/include/particle_generator.hh
@ -1,150 +1,325 @@
+/*******************************************************************\
+ particle_generator.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
+ 
+ CHANGELOG (only majors, for details see repo):
+    10/2019 - Oliver Hahn - first implementation
+\*******************************************************************/
 #pragma once

-namespace particle {
+#include <math/vec3.hh>
+#include <grid_interpolate.hh>

-enum lattice{
-    lattice_sc=0, lattice_bcc=1, lattice_fcc=2
-};
+#if defined(USE_HDF5)
+#include "HDF_IO.hh"
+#endif

-template<typename field_t>
-void initialize_lattice( container& particles, lattice lattice_type, const field_t& field ){
-    const size_t num_p_in_load = field.local_size();
-    const size_t overload = 1<<lattice_type; // 1 for sc, 2 for bcc, 4 for fcc
-
-    particles.allocate( overload * num_p_in_load );
-
-    for( size_t i=0,ipcount=0; i<field.size(0); ++i ){
-        for( size_t j=0; j<field.size(1); ++j){
-            for( size_t k=0; k<field.size(2); ++k,++ipcount){
-                for( size_t iload=0; iload<overload; ++iload ){
-                    particles.set_id( ipcount+iload*num_p_in_load, overload*field.get_cell_idx_1d(i,j,k)+iload );
-                }
-            }
-        }
-    }
-}
-
-// invalidates field, phase shifted to unspecified position after return
-template<typename field_t>
-void set_positions( container& particles, lattice lattice_type, int idim, real_t lunit, field_t& field )
+namespace particle
 {
-    const size_t num_p_in_load = field.local_size();
+    using vec3 = std::array<real_t,3>;

-    for( size_t i=0,ipcount=0; i<field.size(0); ++i ){
-        for( size_t j=0; j<field.size(1); ++j){
-            for( size_t k=0; k<field.size(2); ++k){
-                auto pos = field.template get_unit_r<real_t>(i,j,k);
-                particles.set_pos( ipcount++, idim, pos[idim]*lunit + field.relem(i,j,k) );
-            }
-        }
-    }
+    enum lattice // particle load type; values >= 0 index lattice_shifts / second_lattice_shift and yield 1 << value particles per field cell
+    {
+        lattice_glass = -1, // glass: particle positions are read from a file by lattice_generator; never used as a table index
+        lattice_sc = 0,  // SC : simple cubic
+        lattice_bcc = 1, // BCC: body-centered cubic
+        lattice_fcc = 2, // FCC: face-centered cubic
+        lattice_rsc = 3, // RSC: refined simple cubic
+    };

-    if( lattice_type == particle::lattice_bcc ){
-        field.shift_field( 0.5, 0.5, 0.5 );
-        auto ipcount0 = num_p_in_load;
-        for( size_t i=0,ipcount=ipcount0; i<field.size(0); ++i ){
-            for( size_t j=0; j<field.size(1); ++j){
-                for( size_t k=0; k<field.size(2); ++k){
-                    auto pos = field.template get_unit_r_shifted<real_t>(i,j,k,0.5,0.5,0.5);
-                    particles.set_pos( ipcount++, idim, pos[idim]*lunit + field.relem(i,j,k) );
-                }
-            }
-        }
-    }
-    else if( lattice_type == particle::lattice_fcc ){ 
-        // 0.5 0.5 0.0
-        field.shift_field( 0.5, 0.5, 0.0 );
-        auto ipcount0 = num_p_in_load;
-        for( size_t i=0,ipcount=ipcount0; i<field.size(0); ++i ){
-            for( size_t j=0; j<field.size(1); ++j){
-                for( size_t k=0; k<field.size(2); ++k){
-                    auto pos = field.template get_unit_r_shifted<real_t>(i,j,k,0.5,0.5,0.0);
-                    particles.set_pos( ipcount++, idim, pos[idim]*lunit + field.relem(i,j,k) );
-                }
-            }
-        }
-        // 0.0 0.5 0.5
-        field.shift_field( -0.5, 0.0, 0.5 );
-        ipcount0 = 2*num_p_in_load;
-        for( size_t i=0,ipcount=ipcount0; i<field.size(0); ++i ){
-            for( size_t j=0; j<field.size(1); ++j){
-                for( size_t k=0; k<field.size(2); ++k){
-                    auto pos = field.template get_unit_r_shifted<real_t>(i,j,k,0.0,0.5,0.5);
-                    particles.set_pos( ipcount++, idim, pos[idim]*lunit + field.relem(i,j,k) );
-                }
-            }
-        }
-        // 0.5 0.0 0.5
-        field.shift_field( 0.5, -0.5, 0.0 );
-        ipcount0 = 3*num_p_in_load;
-        for( size_t i=0,ipcount=ipcount0; i<field.size(0); ++i ){
-            for( size_t j=0; j<field.size(1); ++j){
-                for( size_t k=0; k<field.size(2); ++k){
-                    auto pos = field.template get_unit_r_shifted<real_t>(i,j,k,0.5,0.0,0.5);
-                    particles.set_pos( ipcount++, idim, pos[idim]*lunit + field.relem(i,j,k) );
-                }
-            }
-        }
-    }
-}
+    const std::vector<std::vector<vec3_t<real_t>>> lattice_shifts = // indexed by lattice value (>= 0); entry L lists the 1 << L sub-lattice offsets (presumably in units of the grid spacing -- confirm against shift_field)
+        {
+            // first shift must always be zero! (otherwise set_positions and set_velocities break)
+            /* SC : */ {{0.0, 0.0, 0.0}},
+            /* BCC: */ {{0.0, 0.0, 0.0}, {0.5, 0.5, 0.5}},
+            /* FCC: */ {{0.0, 0.0, 0.0}, {0.0, 0.5, 0.5}, {0.5, 0.0, 0.5}, {0.5, 0.5, 0.0}},
+            /* RSC: */ {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.5}, {0.0, 0.5, 0.0}, {0.0, 0.5, 0.5}, {0.5, 0.0, 0.0}, {0.5, 0.0, 0.5}, {0.5, 0.5, 0.0}, {0.5, 0.5, 0.5}},
+    };

-template<typename field_t>
-void set_velocities( container& particles, lattice lattice_type, int idim, field_t& field )
-{
-    const size_t num_p_in_load = field.local_size();
+    const std::vector<vec3_t<real_t>> second_lattice_shift = // indexed by lattice value (>= 0); global offset applied to the whole lattice when is_second_lattice is set
+        {
+            /* SC : */ {0.5, 0.5, 0.5}, // this corresponds to CsCl lattice
+            /* BCC: */ {0.5, 0.5, 0.0}, // is there a diatomic lattice with BCC base?!?
+            /* FCC: */ {0.5, 0.5, 0.5}, // this corresponds to NaCl lattice
+                                        // /* FCC: */ {0.25, 0.25, 0.25}, // this corresponds to Zincblende/GaAs lattice
+            /* RSC: */ {0.25, 0.25, 0.25},
+    };

-    for( size_t i=0,ipcount=0; i<field.size(0); ++i ){
-        for( size_t j=0; j<field.size(1); ++j){
-            for( size_t k=0; k<field.size(2); ++k){
-                particles.set_vel( ipcount++, idim, field.relem(i,j,k) );
+    template <typename field_t>
+    class lattice_generator
+    {
+        protected:
+
+        struct glass
+        {
+            using data_t = typename field_t::data_t;
+            size_t num_p, off_p;
+            grid_interpolate<1, field_t> interp_;
+            std::vector<vec3> glass_posr;
+
+            // Load the glass named by setup/GlassFileName, replicate it setup/GlassTiles times
+            // per dimension and keep this task's share of the tiled particles in glass_posr,
+            // expressed in grid-cell units of `field`. Throws std::runtime_error without HDF5.
+            glass( config_file& cf, const field_t &field )
+            : num_p(0), off_p(0), interp_( field )
+            {
+                std::vector<real_t> glass_pos;
+                real_t lglassbox = 1.0;
+
+                std::string glass_fname = cf.get_value<std::string>("setup", "GlassFileName");
+                size_t ntiles = cf.get_value<size_t>("setup", "GlassTiles");
+
+#if defined(USE_HDF5)
+                HDFReadGroupAttribute(glass_fname, "Header", "BoxSize", lglassbox);
+                HDFReadDataset(glass_fname, "/PartType1/Coordinates", glass_pos);
+#else
+                throw std::runtime_error("Glass lattice requires HDF5 support. Enable and recompile.");
+#endif
+
+                size_t np_in_file = glass_pos.size() / 3;
+                const size_t np_total = np_in_file * ntiles * ntiles * ntiles;
+#if defined(USE_MPI)
+                // split as evenly as possible: the first (np_total % ntasks) tasks take one
+                // extra particle so that the remainder of the division is not dropped
+                const size_t ntasks = size_t(MPI::get_size());
+                const size_t this_task = size_t(MPI::get_rank());
+                const size_t np_base = np_total / ntasks;
+                const size_t np_rest = np_total % ntasks;
+                num_p = np_base + ((this_task < np_rest) ? 1 : 0);
+                off_p = this_task * np_base + ((this_task < np_rest) ? this_task : np_rest);
+#else
+                num_p = np_total;
+                off_p = 0;
+#endif
+
+                music::ilog << "Glass file contains " << np_in_file << " particles." << std::endl;
+
+                glass_posr.assign(num_p, {0.0, 0.0, 0.0});
+
+                std::array<real_t, 3> ng({real_t(field.n_[0]), real_t(field.n_[1]), real_t(field.n_[2])});
+
+                #pragma omp parallel for
+                for (size_t i = 0; i < num_p; ++i)
+                {
+                    size_t idxpart = off_p + i;
+                    size_t idx_in_glass = idxpart % np_in_file;
+                    // decode the linear tile index as (x * ntiles + y) * ntiles + z so that
+                    // every tile index maps to a distinct tile of the ntiles^3 replication
+                    size_t idxtile = idxpart / np_in_file;
+                    size_t tile_z = idxtile % ntiles;
+                    size_t tile_y = (idxtile / ntiles) % ntiles;
+                    size_t tile_x = idxtile / (ntiles * ntiles);
+                    // rescale from glass-box units to grid cells and wrap periodically into [0, ng)
+                    glass_posr[i][0] = std::fmod((glass_pos[3 * idx_in_glass + 0] / lglassbox + real_t(tile_x)) / ntiles * ng[0] + ng[0], ng[0]);
+                    glass_posr[i][1] = std::fmod((glass_pos[3 * idx_in_glass + 1] / lglassbox + real_t(tile_y)) / ntiles * ng[1] + ng[1], ng[1]);
+                    glass_posr[i][2] = std::fmod((glass_pos[3 * idx_in_glass + 2] / lglassbox + real_t(tile_z)) / ntiles * ng[2] + ng[2], ng[2]);
+                }
+
+#if defined(USE_MPI)
+                interp_.domain_decompose_pos(glass_posr);
+
+                // the decomposition changes the local particle count, so recompute count and offset
+                num_p = glass_posr.size();
+                std::vector<size_t> all_num_p( MPI::get_size(), 0 );
+                MPI_Allgather( &num_p, 1, MPI_UNSIGNED_LONG_LONG, &all_num_p[0], 1, MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD );
+                // exclusive prefix sum: only lower-ranked tasks count, otherwise the ID ranges of
+                // neighbouring tasks overlap whenever a task holds more particles than its successor
+                off_p = 0;
+                for( int itask=0; itask<MPI::get_rank(); ++itask ){
+                    off_p += all_num_p[itask];
+                }
+#endif
             }
-        }
-    }

-    if( lattice_type == particle::lattice_bcc ){
-        field.shift_field( 0.5, 0.5, 0.5 );
-        auto ipcount0 = num_p_in_load;
-        for( size_t i=0,ipcount=ipcount0; i<field.size(0); ++i ){
-            for( size_t j=0; j<field.size(1); ++j){
-                for( size_t k=0; k<field.size(2); ++k){
-                    particles.set_vel( ipcount++, idim, field.relem(i,j,k) );
+            void update_ghosts( const field_t &field ) // forwards to the interpolator's update_ghosts so later get_at() calls use the current contents of field (presumably refreshes halo cells -- confirm in grid_interpolate.hh)
+            {
+                interp_.update_ghosts( field );
+            }
+
+            data_t get_at( const vec3& x ) const noexcept // field value at x as returned by the interpolator's get_cic_at; callers pass entries of glass_posr
+            {
+                return interp_.get_cic_at( x );
+            }
+
+            size_t size() const noexcept // number of glass particles held by this task (num_p)
+            {
+                return num_p;
+            }
+
+            size_t offset() const noexcept // off_p: added to the local index when lattice_generator assigns glass particle IDs
+            {
+                return off_p;
+            }
+        };
+
+        std::unique_ptr<glass> glass_ptr_;
+
+        private:
+        particle::container particles_;
+
+        public:
+        lattice_generator(lattice lattice_type, const bool b64reals, const bool b64ids, const size_t IDoffset, const field_t &field, config_file &cf) // allocates the local particles and assigns their IDs; positions and velocities are filled later by set_positions / set_velocities
+        {
+            if (lattice_type != lattice_glass)
+            {
+                // number of modes present in the field
+                const size_t num_p_in_load = field.local_size();
+                // unless SC lattice is used, particle number is a multiple of the number of modes (=num_p_in_load):
+                const size_t overload = 1ull << std::max<int>(0, lattice_type); // 1 for sc, 2 for bcc, 4 for fcc, 8 for rsc
+                // allocate memory for all local particles
+                particles_.allocate(overload * num_p_in_load, b64reals, b64ids);
+                // set particle IDs to the Lagrangian coordinate (1D encoded) with additionally the field shift encoded as well
+
+                for (size_t i = 0, ipcount = 0; i < field.size(0); ++i)
+                {
+                    for (size_t j = 0; j < field.size(1); ++j)
+                    {
+                        for (size_t k = 0; k < field.size(2); ++k, ++ipcount)
+                        {
+                            for (size_t iload = 0; iload < overload; ++iload) // sub-lattice iload occupies particle indices [iload * num_p_in_load, (iload + 1) * num_p_in_load)
+                            {
+                                if (b64ids)
+                                {
+                                    particles_.set_id64(ipcount + iload * num_p_in_load, IDoffset + overload * field.get_cell_idx_1d(i, j, k) + iload);
+                                }
+                                else
+                                {
+                                    particles_.set_id32(ipcount + iload * num_p_in_load, IDoffset + overload * field.get_cell_idx_1d(i, j, k) + iload); // same ID expression, narrowed to the 32-bit parameter of set_id32
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                glass_ptr_ = std::make_unique<glass>( cf, field ); // glass: reads the glass file and determines this task's particles
+                particles_.allocate(glass_ptr_->size(), b64reals, b64ids);
+
+                #pragma omp parallel for
+                for (size_t i = 0; i < glass_ptr_->size(); ++i) // glass IDs are consecutive: IDoffset + local index + the glass offset of this task
+                {
+                    if (b64ids)
+                    {
+                        particles_.set_id64(i, IDoffset + i + glass_ptr_->offset());
+                    }
+                    else
+                    {
+                        particles_.set_id32(i, IDoffset + i + glass_ptr_->offset());
+                    }
                 }
             }
         }
-    }
-    else if( lattice_type == particle::lattice_fcc ){ 
-        // 0.5 0.5 0.0
-        field.shift_field( 0.5, 0.5, 0.0 );
-        auto ipcount0 = num_p_in_load;
-        for( size_t i=0,ipcount=ipcount0; i<field.size(0); ++i ){
-            for( size_t j=0; j<field.size(1); ++j){
-                for( size_t k=0; k<field.size(2); ++k){
-                    particles.set_vel( ipcount++, idim, field.relem(i,j,k) );
-                }
-            }
-        }
-        // 0.0 0.5 0.5
-        field.shift_field( -0.5, 0.0, 0.5 );
-        ipcount0 = 2*num_p_in_load;
-        for( size_t i=0,ipcount=ipcount0; i<field.size(0); ++i ){
-            for( size_t j=0; j<field.size(1); ++j){
-                for( size_t k=0; k<field.size(2); ++k){
-                    particles.set_vel( ipcount++, idim, field.relem(i,j,k) );
-                }
-            }
-        }
-        // 0.5 0.0 0.5
-        field.shift_field( 0.5, -0.5, 0.0 );
-        ipcount0 = 3*num_p_in_load;
-        for( size_t i=0,ipcount=ipcount0; i<field.size(0); ++i ){
-            for( size_t j=0; j<field.size(1); ++j){
-                for( size_t k=0; k<field.size(2); ++k){
-                    particles.set_vel( ipcount++, idim, field.relem(i,j,k) );
-                }
-            }
-        }
-    }
-}

+        // Writes component idim of every particle position: unperturbed site (scaled by lunit) plus the value read from field. Invalidates field: it is left phase shifted to an unspecified position after return.
+        void set_positions(const lattice lattice_type, bool is_second_lattice, int idim, real_t lunit, const bool b64reals, field_t &field, config_file &cf)
+        {
+            // works only for Bravais types
+            if (lattice_type >= 0)
+            {
+                const size_t num_p_in_load = field.local_size();
+                for (int ishift = 0; ishift < (1 << lattice_type); ++ishift) // one pass per sub-lattice; the shifts applied to field accumulate from pass to pass
+                {
+                    // if we are dealing with the secondary lattice, apply a global shift
+                    if (ishift == 0 && is_second_lattice)
+                    {
+                        field.shift_field(second_lattice_shift[lattice_type]);
+                    }

-} // end namespace particles
+                    // can omit first shift since zero by convention, unless shifted already above, otherwise apply relative phase shift
+                    if (ishift > 0)
+                    {
+                        field.shift_field(lattice_shifts[lattice_type][ishift] - lattice_shifts[lattice_type][ishift - 1]); // relative to the previous pass because the field is still shifted from it
+                    }
+                    // read out values from phase shifted field and set assoc. particle's value
+                    const auto ipcount0 = ishift * num_p_in_load; // first particle index of this sub-lattice, matching the ID layout set in the constructor
+                    for (size_t i = 0, ipcount = ipcount0; i < field.size(0); ++i)
+                    {
+                        for (size_t j = 0; j < field.size(1); ++j)
+                        {
+                            for (size_t k = 0; k < field.size(2); ++k)
+                            {
+                                auto pos = field.template get_unit_r_shifted<real_t>(i, j, k, lattice_shifts[lattice_type][ishift] + (is_second_lattice ? second_lattice_shift[lattice_type] : vec3_t<real_t>{0., 0., 0.}));
+                                if (b64reals)
+                                {
+                                    particles_.set_pos64(ipcount++, idim, pos[idim] * lunit + field.relem(i, j, k));
+                                }
+                                else
+                                {
+                                    particles_.set_pos32(ipcount++, idim, pos[idim] * lunit + field.relem(i, j, k));
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                glass_ptr_->update_ghosts( field ); // glass: the field is interpolated to the stored glass positions instead of being phase shifted
+                #pragma omp parallel for
+                for (size_t i = 0; i < glass_ptr_->size(); ++i)
+                {
+                    auto pos = glass_ptr_->glass_posr[i];
+                    real_t disp = glass_ptr_->get_at(pos);
+                    if (b64reals)
+                    {
+                        particles_.set_pos64(i, idim, pos[idim] / field.n_[idim] * lunit + disp); // glass_posr is in grid cells, so divide by the grid size before scaling with lunit
+                    }
+                    else
+                    {
+                        particles_.set_pos32(i, idim, pos[idim] / field.n_[idim] * lunit + disp);
+                    }
+                }
+            }
+        }
+
+        void set_velocities(lattice lattice_type, bool is_second_lattice, int idim, const bool b64reals, field_t &field, config_file &cf) // writes component idim of every particle velocity from field; like set_positions, leaves field phase shifted
+        {
+            // works only for Bravais types
+            if (lattice_type >= 0)
+            {
+                const size_t num_p_in_load = field.local_size();
+                for (int ishift = 0; ishift < (1 << lattice_type); ++ishift) // one pass per sub-lattice; the shifts applied to field accumulate from pass to pass
+                {
+                    // if we are dealing with the secondary lattice, apply a global shift
+                    if (ishift == 0 && is_second_lattice)
+                    {
+                        field.shift_field(second_lattice_shift[lattice_type]);
+                    }
+                    // can omit first shift since zero by convention, unless shifted already above, otherwise apply relative phase shift
+                    if (ishift > 0)
+                    {
+                        field.shift_field(lattice_shifts[lattice_type][ishift] - lattice_shifts[lattice_type][ishift - 1]); // relative to the previous pass because the field is still shifted from it
+                    }
+                    // read out values from phase shifted field and set assoc. particle's value
+                    const auto ipcount0 = ishift * num_p_in_load; // first particle index of this sub-lattice, matching the ID layout set in the constructor
+                    for (size_t i = 0, ipcount = ipcount0; i < field.size(0); ++i)
+                    {
+                        for (size_t j = 0; j < field.size(1); ++j)
+                        {
+                            for (size_t k = 0; k < field.size(2); ++k)
+                            {
+                                if (b64reals)
+                                {
+                                    particles_.set_vel64(ipcount++, idim, field.relem(i, j, k));
+                                }
+                                else
+                                {
+                                    particles_.set_vel32(ipcount++, idim, field.relem(i, j, k));
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                glass_ptr_->update_ghosts( field ); // glass: the field is interpolated to the stored glass positions instead of being phase shifted
+                #pragma omp parallel for
+                for (size_t i = 0; i < glass_ptr_->size(); ++i)
+                {
+                    auto pos = glass_ptr_->glass_posr[i];
+                    real_t vel = glass_ptr_->get_at(pos);
+                    if (b64reals)
+                    {
+                        particles_.set_vel64(i, idim, vel);
+                    }
+                    else
+                    {
+                        particles_.set_vel32(i, idim, vel);
+                    }
+                }
+            }
+        }
+
+        const particle::container& get_particles() const noexcept{ // read-only access to the particle container owned by this generator
+            return particles_;
+        }
+
+    }; // class lattice_generator
+
+} // namespace particle
--- a/include/particle_plt.hh
+++ b/include/particle_plt.hh
@ -0,0 +1,568 @@
+#pragma once
+
+#include <general.hh>
+#include <unistd.h> // for unlink
+
+#include <iostream>
+#include <fstream>
+
+#include <random>
+#include <map>
+
+#include <cassert>
+
+#include <particle_generator.hh>
+#include <grid_fft.hh>
+#include <math/mat3.hh>
+
+#include <gsl/gsl_sf_hyperg.h>
+inline double Hypergeometric2F1( double a, double b, double c, double x ) // thin wrapper around GSL's gsl_sf_hyperg_2F1, the Gauss hypergeometric function 2F1(a,b;c;x)
+{
+  return gsl_sf_hyperg_2F1( a, b, c, x);
+}
+
+#define PRODUCTION
+
+namespace particle{
+//! implement Joyce, Marcos et al. PLT calculation
+
+class lattice_gradient{
+private:
+    const real_t boxlen_, aini_;
+    const size_t ngmapto_, ngrid_, ngrid32_;
+    const real_t mapratio_, XmL_;
+    Grid_FFT<real_t,false> D_xx_, D_xy_, D_xz_, D_yy_, D_yz_, D_zz_;
+    Grid_FFT<real_t,false> grad_x_, grad_y_, grad_z_;
+    std::vector<vec3_t<real_t>> vectk_;
+    std::vector<vec3_t<int>> ico_, vecitk_;
+
+    //! true if i is divisible by two (also holds for zero and negative values)
+    bool is_even( int i ){
+        const int remainder = i % 2;
+        return remainder == 0;
+    }
+
+    //! true if all three components of M * (i,j,k) are even
+    bool is_in( int i, int j, int k, const mat3_t<int>& M ){
+        vec3_t<int> site({i,j,k});
+        auto image = M * site;
+        // reject on the first odd component, same evaluation order as a chained &&
+        if( !is_even(image.x) ) return false;
+        if( !is_even(image.y) ) return false;
+        return is_even(image.z);
+    }
+
+    void init_D( lattice lattice_type )
+    {
+        constexpr real_t pi     = M_PI;
+        constexpr real_t twopi  = 2.0*M_PI;
+        constexpr real_t fourpi = 4.0*M_PI;
+        const     real_t sqrtpi = std::sqrt(M_PI);
+        const     real_t pi32   = std::pow(M_PI,1.5);
+
+        //! === vectors, reciprocals and normals for the SC lattice ===
+        const int charge_fac_sc = 1;
+        const mat3_t<real_t> mat_bravais_sc{
+            1.0, 0.0, 0.0,
+            0.0, 1.0, 0.0,
+            0.0, 0.0, 1.0, 
+        };
+        const mat3_t<real_t> mat_reciprocal_sc{
+            twopi, 0.0, 0.0,
+            0.0, twopi, 0.0,
+            0.0, 0.0, twopi,
+        };
+        const mat3_t<int> mat_invrecip_sc{
+            2, 0, 0,
+            0, 2, 0,
+            0, 0, 2,
+        };
+        const std::vector<vec3_t<real_t>> normals_sc{
+            {pi,0.,0.},{-pi,0.,0.},
+            {0.,pi,0.},{0.,-pi,0.},
+            {0.,0.,pi},{0.,0.,-pi},
+        };
+        
+
+        //! === vectors, reciprocals and normals for the BCC lattice ===
+        const int charge_fac_bcc = 2;
+        const mat3_t<real_t> mat_bravais_bcc{
+            1.0, 0.0, 0.5,
+            0.0, 1.0, 0.5,
+            0.0, 0.0, 0.5, 
+        };
+        const mat3_t<real_t> mat_reciprocal_bcc{
+            twopi, 0.0, 0.0,
+            0.0, twopi, 0.0,
+            -twopi, -twopi, fourpi,
+        };
+        const mat3_t<int> mat_invrecip_bcc{
+            2, 0, 0,
+            0, 2, 0,
+            1, 1, 1,
+        };
+        const std::vector<vec3_t<real_t>> normals_bcc{
+            {0.,pi,pi},{0.,-pi,pi},{0.,pi,-pi},{0.,-pi,-pi},
+            {pi,0.,pi},{-pi,0.,pi},{pi,0.,-pi},{-pi,0.,-pi},
+            {pi,pi,0.},{-pi,pi,0.},{pi,-pi,0.},{-pi,-pi,0.}
+        };
+        
+
+        //! === vectors, reciprocals and normals for the FCC lattice ===
+        const int charge_fac_fcc = 4;
+        const mat3_t<real_t> mat_bravais_fcc{
+            0.0, 0.5, 0.0,
+            0.5, 0.0, 1.0,
+            0.5, 0.5, 0.0, 
+        };
+        const mat3_t<real_t> mat_reciprocal_fcc{
+            -fourpi, fourpi, twopi,
+            0.0, 0.0, twopi,
+            fourpi, 0.0, -twopi,
+        };
+        const mat3_t<int> mat_invrecip_fcc{
+            0, 1, 1,
+            1, 0, 1,
+            0, 2, 0,
+        };
+        const std::vector<vec3_t<real_t>> normals_fcc{
+            {twopi,0.,0.},{-twopi,0.,0.},
+            {0.,twopi,0.},{0.,-twopi,0.},
+            {0.,0.,twopi},{0.,0.,-twopi},
+            {+pi,+pi,+pi},{+pi,+pi,-pi},
+            {+pi,-pi,+pi},{+pi,-pi,-pi},
+            {-pi,+pi,+pi},{-pi,+pi,-pi},
+            {-pi,-pi,+pi},{-pi,-pi,-pi},
+        };
+        
+        //! select the properties for the chosen lattice
+        const int ilat = lattice_type; // 0 = sc, 1 = bcc, 2 = fcc
+
+        const auto mat_bravais     = (ilat==2)? mat_bravais_fcc : (ilat==1)? mat_bravais_bcc : mat_bravais_sc;
+        const auto mat_reciprocal  = (ilat==2)? mat_reciprocal_fcc : (ilat==1)? mat_reciprocal_bcc : mat_reciprocal_sc;
+        const auto mat_invrecip    = (ilat==2)? mat_invrecip_fcc : (ilat==1)? mat_invrecip_bcc : mat_invrecip_sc;
+        const auto normals         = (ilat==2)? normals_fcc : (ilat==1)? normals_bcc : normals_sc;
+        const auto charge_fac      = (ilat==2)? charge_fac_fcc : (ilat==1)? charge_fac_bcc : charge_fac_sc;
+
+        const ptrdiff_t nlattice = ngrid_;
+        const real_t dx = 1.0/real_t(nlattice);
+
+        const real_t eta = 4.0; // Ewald cutoff shall be 4 cells
+        const real_t alpha = 1.0/std::sqrt(2)/eta;
+        const real_t alpha2 = alpha*alpha;
+        const real_t alpha3 = alpha2*alpha;
+        
+        const real_t charge = 1.0/std::pow(real_t(nlattice),3)/charge_fac;
+        const real_t fft_norm   = 1.0/std::pow(real_t(nlattice),3.0);
+        const real_t fft_norm12 = 1.0/std::pow(real_t(nlattice),1.5);
+
+        //! just a Kronecker \delta_ij
+        auto kronecker = []( int i, int j ) -> real_t { return (i==j)? 1.0 : 0.0; };
+
+        //! Ewald summation: short-range Green's function
+        auto add_greensftide_sr = [&]( mat3_t<real_t>& D, const vec3_t<real_t>& d ) -> void {
+            auto r = d.norm();
+            if( r< 1e-14 ) return; // return zero for r=0
+
+            const real_t r2(r*r), r3(r2*r), r5(r3*r2);
+            const real_t K1( -alpha3/pi32 * std::exp(-alpha2*r2)/r2 );
+            const real_t K2( (std::erfc(alpha*r) + 2.0*alpha/sqrtpi*std::exp(-alpha2*r2)*r)/fourpi );
+            
+            for( int mu=0; mu<3; ++mu ){
+                for( int nu=mu; nu<3; ++nu ){
+                    real_t dd( d[mu]*d[nu] * K1 + (kronecker(mu,nu)/r3 - 3.0 * (d[mu]*d[nu])/r5) * K2 );
+                    D(mu,nu) += dd;
+                    D(nu,mu) += (mu!=nu)? dd : 0.0;
+                }
+            }
+        };
+
+        //! Ewald summation: long-range Green's function
+        auto add_greensftide_lr = [&]( mat3_t<real_t>& D, const vec3_t<real_t>& k, const vec3_t<real_t>& r ) -> void {
+            real_t kmod2 = k.norm_squared();
+            real_t term = std::exp(-kmod2/(4*alpha2))*std::cos(k.dot(r)) / kmod2 * fft_norm;
+            for( int mu=0; mu<3; ++mu ){
+                for( int nu=mu; nu<3; ++nu ){
+                    auto dd = k[mu] * k[nu] * term;
+                    D(mu,nu) += dd;
+                    D(nu,mu) += (mu!=nu)? dd : 0.0;
+                }
+            }
+        };
+
+        //! checks if 'vec' is in the FBZ with FBZ normal vectors given in 'normals'
+        auto check_FBZ = []( const auto& normals, const auto& vec ) -> bool {
+            for( const auto& n : normals ){ 
+                if( n.dot( vec ) > 1.0001 * n.dot(n) ){
+                    return false;
+                }
+            }
+            return true;
+        };
+        
+        constexpr ptrdiff_t lnumber = 3, knumber = 3;
+        const int numb = 1; //!< search radius when shifting vectors into FBZ
+
+        vectk_.assign(D_xx_.memsize(),vec3_t<real_t>());
+        ico_.assign(D_xx_.memsize(),vec3_t<int>());
+        vecitk_.assign(D_xx_.memsize(),vec3_t<int>());
+
+        #pragma omp parallel 
+        {
+            //... temporary to hold values of the dynamical matrix 
+            mat3_t<real_t> matD(0.0);
+
+            #pragma omp for
+            for( ptrdiff_t i=0; i<nlattice; ++i ){
+                for( ptrdiff_t j=0; j<nlattice; ++j ){
+                    for( ptrdiff_t k=0; k<nlattice; ++k ){
+                        // compute lattice site vector from (i,j,k) multiplying Bravais base matrix, and wrap back to box
+                        const vec3_t<real_t> x_ijk({dx*real_t(i),dx*real_t(j),dx*real_t(k)});
+                        const vec3_t<real_t> ar = (mat_bravais * x_ijk).wrap_abs();
+
+                        //... zero temporary matrix
+                        matD.zero();        
+
+                        // add real-space part of dynamical matrix, periodic copies
+                        for( ptrdiff_t ix=-lnumber; ix<=lnumber; ix++ ){
+                            for( ptrdiff_t iy=-lnumber; iy<=lnumber; iy++ ){
+                                for( ptrdiff_t iz=-lnumber; iz<=lnumber; iz++ ){      
+                                    const vec3_t<real_t> n_ijk({real_t(ix),real_t(iy),real_t(iz)});            
+                                    const vec3_t<real_t> dr(ar - mat_bravais * n_ijk);
+                                    add_greensftide_sr(matD, dr);
+                                }
+                            }
+                        }
+
+                        // add k-space part of dynamical matrix
+                        for( ptrdiff_t ix=-knumber; ix<=knumber; ix++ ){
+                            for( ptrdiff_t iy=-knumber; iy<=knumber; iy++ ){
+                                for( ptrdiff_t iz=-knumber; iz<=knumber; iz++ ){                      
+                                    if(std::abs(ix)+std::abs(iy)+std::abs(iz) != 0){
+                                        const vec3_t<real_t> k_ijk({real_t(ix)/nlattice,real_t(iy)/nlattice,real_t(iz)/nlattice});
+                                        const vec3_t<real_t> ak( mat_reciprocal * k_ijk);
+
+                                        add_greensftide_lr(matD, ak, ar );
+                                    }
+                                }
+                            }
+                        } 
+
+                        D_xx_.relem(i,j,k) = matD(0,0) * charge;
+                        D_xy_.relem(i,j,k) = matD(0,1) * charge;
+                        D_xz_.relem(i,j,k) = matD(0,2) * charge;
+                        D_yy_.relem(i,j,k) = matD(1,1) * charge;
+                        D_yz_.relem(i,j,k) = matD(1,2) * charge;
+                        D_zz_.relem(i,j,k) = matD(2,2) * charge;
+                    }
+                }
+            }
+        } // end omp parallel region
+
+        // fix r=0 with background density (added later in Fourier space)
+        D_xx_.relem(0,0,0) = 1.0/3.0;
+        D_xy_.relem(0,0,0) = 0.0;
+        D_xz_.relem(0,0,0) = 0.0;
+        D_yy_.relem(0,0,0) = 1.0/3.0;
+        D_yz_.relem(0,0,0) = 0.0;
+        D_zz_.relem(0,0,0) = 1.0/3.0;
+
+        D_xx_.FourierTransformForward();
+        D_xy_.FourierTransformForward();
+        D_xz_.FourierTransformForward();
+        D_yy_.FourierTransformForward();
+        D_yz_.FourierTransformForward();
+        D_zz_.FourierTransformForward();
+
+#ifndef PRODUCTION
+        if (CONFIG::MPI_task_rank == 0)
+            unlink("debug.hdf5");
+        D_xx_.Write_to_HDF5("debug.hdf5","Dxx");
+        D_xy_.Write_to_HDF5("debug.hdf5","Dxy");
+        D_xz_.Write_to_HDF5("debug.hdf5","Dxz");
+        D_yy_.Write_to_HDF5("debug.hdf5","Dyy");
+        D_yz_.Write_to_HDF5("debug.hdf5","Dyz");
+        D_zz_.Write_to_HDF5("debug.hdf5","Dzz");
+
+        std::ofstream ofs2("test_brillouin.txt");
+#endif
+        using map_t = std::map<vec3_t<int>,size_t>;
+        map_t iimap;
+            
+        //!=== Make temporary copies before resorting to std. Fourier grid ========!//
+        Grid_FFT<real_t,false> 
+            temp1({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+            temp2({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+            temp3({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0});
+
+        temp1.FourierTransformForward(false);
+        temp2.FourierTransformForward(false);
+        temp3.FourierTransformForward(false);
+            
+        #pragma omp parallel for
+        for( size_t i=0; i<D_xx_.size(0); i++ )
+        {
+            for( size_t j=0; j<D_xx_.size(1); j++ )
+            {
+                for( size_t k=0; k<D_xx_.size(2); k++ )
+                {
+                    temp1.kelem(i,j,k) = ccomplex_t(std::real(D_xx_.kelem(i,j,k)),std::real(D_xy_.kelem(i,j,k)));
+                    temp2.kelem(i,j,k) = ccomplex_t(std::real(D_xz_.kelem(i,j,k)),std::real(D_yy_.kelem(i,j,k)));
+                    temp3.kelem(i,j,k) = ccomplex_t(std::real(D_yz_.kelem(i,j,k)),std::real(D_zz_.kelem(i,j,k)));
+                }
+            }
+        }
+        D_xx_.zero(); D_xy_.zero(); D_xz_.zero();
+        D_yy_.zero(); D_yz_.zero(); D_zz_.zero();
+
+        
+        //!=== Diagonalise and resort to std. Fourier grid ========!//
+        #pragma omp parallel 
+        {
+            // thread private matrix representation
+            mat3_t<real_t> D;
+            vec3_t<real_t> eval, evec1, evec2, evec3_t;
+
+            #pragma omp for
+            for( size_t i=0; i<D_xx_.size(0); i++ )
+            {
+                for( size_t j=0; j<D_xx_.size(1); j++ )
+                {
+                    for( size_t k=0; k<D_xx_.size(2); k++ )
+                    {
+                        vec3_t<real_t> kv = D_xx_.get_k<real_t>(i,j,k);
+                        
+                        // put matrix elements into actual matrix
+                        D(0,0) = std::real(temp1.kelem(i,j,k)) / fft_norm12;
+                        D(0,1) = D(1,0) = std::imag(temp1.kelem(i,j,k)) / fft_norm12;
+                        D(0,2) = D(2,0) = std::real(temp2.kelem(i,j,k)) / fft_norm12;
+                        D(1,1) = std::imag(temp2.kelem(i,j,k)) / fft_norm12;
+                        D(1,2) = D(2,1) = std::real(temp3.kelem(i,j,k)) / fft_norm12;
+                        D(2,2) = std::imag(temp3.kelem(i,j,k)) / fft_norm12;
+
+                        // compute eigenstructure of matrix
+                        D.eigen(eval, evec1, evec2, evec3_t);
+                        evec3_t /= (twopi*ngrid_);
+
+                        // now determine to which modes on the regular lattice this contributes
+                        vec3_t<real_t> ar = kv / (twopi*ngrid_);
+                        vec3_t<real_t> a(mat_reciprocal * ar);
+                        
+                        // translate the k-vectors into the "candidate" FBZ
+                        for( int l1=-numb; l1<=numb; ++l1 ){
+                            for( int l2=-numb; l2<=numb; ++l2 ){
+                                for( int l3=-numb; l3<=numb; ++l3 ){
+                                    // need both halfs of Fourier space since we use real transforms
+                                    for( int isign=0; isign<=1; ++isign ){
+                                        const real_t sign = 2.0*real_t(isign)-1.0; 
+                                        const vec3_t<real_t> vshift({real_t(l1),real_t(l2),real_t(l3)});
+
+                                        vec3_t<real_t> vectk = sign * a + mat_reciprocal * vshift;
+
+                                        if( check_FBZ( normals, vectk ) )
+                                        {
+                                            int ix = std::round(vectk.x*(ngrid_)/twopi);
+                                            int iy = std::round(vectk.y*(ngrid_)/twopi);
+                                            int iz = std::round(vectk.z*(ngrid_)/twopi);
+
+                                            #pragma omp critical
+                                            {iimap.insert( std::pair<vec3_t<int>,size_t>({ix,iy,iz}, D_xx_.get_idx(i,j,k)) );}
+
+                                            temp1.kelem(i,j,k) = ccomplex_t(eval[2],eval[1]);
+                                            temp2.kelem(i,j,k) = ccomplex_t(eval[0],evec3_t.x);
+                                            temp3.kelem(i,j,k) = ccomplex_t(evec3_t.y,evec3_t.z);
+                                        }
+                                    }//sign
+                                } //l3
+                            } //l2
+                        } //l1
+                    } //k
+                } //j
+            } //i
+        }
+
+        D_xx_.kelem(0,0,0) = 1.0;
+        D_xy_.kelem(0,0,0) = 0.0;
+        D_xz_.kelem(0,0,0) = 0.0;
+
+        D_yy_.kelem(0,0,0) = 1.0;
+        D_yz_.kelem(0,0,0) = 0.0;
+        D_zz_.kelem(0,0,0) = 0.0;
+
+        //... approximate infinite lattice by interpolating to sites not covered by current resolution...
+        #pragma omp parallel for
+        for( size_t i=0; i<D_xx_.size(0); i++ ){
+            for( size_t j=0; j<D_xx_.size(1); j++ ){
+                for( size_t k=0; k<D_xx_.size(2); k++ ){
+                    int ii = (int(i)>nlattice/2)? int(i)-nlattice : int(i);
+                    int jj = (int(j)>nlattice/2)? int(j)-nlattice : int(j);
+                    int kk = (int(k)>nlattice/2)? int(k)-nlattice : int(k);
+                    vec3_t<real_t> kv({real_t(ii),real_t(jj),real_t(kk)});
+
+                    auto align_with_k = [&]( const vec3_t<real_t>& v ) -> vec3_t<real_t>{
+                        return v*((v.dot(kv)<0.0)?-1.0:1.0);
+                    };
+
+                    vec3_t<real_t> v, l;
+                    map_t::iterator it;
+                    
+                    if( !is_in(i,j,k,mat_invrecip)  ){
+                        auto average_lv = [&]( const auto& t1, const auto& t2, const auto& t3, vec3_t<real_t>& v, vec3_t<real_t>& l ) {
+                            v = 0.0; l = 0.0;
+                            int count(0);
+                            
+                            auto add_lv = [&]( auto it ) -> void {
+                                auto q = it->second;++count;
+                                l += vec3_t<real_t>({std::real(t1.kelem(q)),std::imag(t1.kelem(q)),std::real(t2.kelem(q))});
+                                v += align_with_k(vec3_t<real_t>({std::imag(t2.kelem(q)),std::real(t3.kelem(q)),std::imag(t3.kelem(q))}));
+                            };
+                            map_t::iterator it;
+                            if( (it = iimap.find({ii-1,jj,kk}))!=iimap.end() ){ add_lv(it); }
+                            if( (it = iimap.find({ii+1,jj,kk}))!=iimap.end() ){ add_lv(it); }
+                            if( (it = iimap.find({ii,jj-1,kk}))!=iimap.end() ){ add_lv(it); }
+                            if( (it = iimap.find({ii,jj+1,kk}))!=iimap.end() ){ add_lv(it); }
+                            if( (it = iimap.find({ii,jj,kk-1}))!=iimap.end() ){ add_lv(it); }
+                            if( (it = iimap.find({ii,jj,kk+1}))!=iimap.end() ){ add_lv(it); }
+                            l/=real_t(count); v/=real_t(count);
+                        };
+                        
+                        average_lv(temp1,temp2,temp3,v,l);
+                        
+                    }else{
+                        if( (it = iimap.find({ii,jj,kk}))!=iimap.end() ){
+                            auto q = it->second;
+                            l = vec3_t<real_t>({std::real(temp1.kelem(q)),std::imag(temp1.kelem(q)),std::real(temp2.kelem(q))});
+                            v = align_with_k(vec3_t<real_t>({std::imag(temp2.kelem(q)),std::real(temp3.kelem(q)),std::imag(temp3.kelem(q))}));
+                        }
+                    }
+                    D_xx_.kelem(i,j,k) = l[0];
+                    D_xy_.kelem(i,j,k) = l[1];
+                    D_xz_.kelem(i,j,k) = l[2];
+                    D_yy_.kelem(i,j,k) = v[0];
+                    D_yz_.kelem(i,j,k) = v[1];
+                    D_zz_.kelem(i,j,k) = v[2];
+                }
+            }
+        }
+        
+#ifdef PRODUCTION
+        #pragma omp parallel for
+        for( size_t i=0; i<D_xx_.size(0); i++ ){
+            for( size_t j=0; j<D_xx_.size(1); j++ ){
+                for( size_t k=0; k<D_xx_.size(2); k++ )
+                {
+                    vec3_t<real_t> kv = D_xx_.get_k<real_t>(i,j,k);
+
+                    double mu1 = std::real(D_xx_.kelem(i,j,k));
+                    // double mu2 = std::real(D_xy_.kelem(i,j,k));
+                    // double mu3 = std::real(D_xz_.kelem(i,j,k));
+
+                    vec3_t<real_t> evec1({std::real(D_yy_.kelem(i,j,k)),std::real(D_yz_.kelem(i,j,k)),std::real(D_zz_.kelem(i,j,k))});
+                    evec1 /= evec1.norm();
+
+                    // ///////////////////////////////////
+                    // // project onto spherical coordinate vectors
+                    
+                    real_t kr = kv.norm(), kphi = kr>0.0? std::atan2(kv.y,kv.x) : 0.0, ktheta = kr>0.0? std::acos( kv.z / kr ): 0.0;
+                    real_t st = std::sin(ktheta), ct = std::cos(ktheta), sp = std::sin(kphi), cp = std::cos(kphi);
+                    vec3_t<real_t> e_r( st*cp, st*sp, ct), e_theta( ct*cp, ct*sp, -st), e_phi( -sp, cp, 0.0 );
+
+                    // re-normalise so that longitudinal amplitude is exact
+                    double renorm = evec1.dot( e_r ); if( renorm < 0.01 ) renorm = 1.0;
+
+                    // -- store in diagonal components of D_ij
+                    D_xx_.kelem(i,j,k) = 1.0;
+                    D_yy_.kelem(i,j,k) = evec1.dot( e_theta ) / renorm;
+                    D_zz_.kelem(i,j,k) = evec1.dot( e_phi ) / renorm;
+
+                    // spatially dependent correction to vfact = \dot{D_+}/D_+
+                    D_xy_.kelem(i,j,k) = 1.0/(0.25*(std::sqrt(1.+24*mu1)-1.));
+                }
+            }
+        }
+        D_xy_.kelem(0,0,0) = 1.0;
+        D_xx_.kelem(0,0,0) = 1.0;
+        D_yy_.kelem(0,0,0) = 0.0;
+        D_zz_.kelem(0,0,0) = 0.0;
+
+        // unlink("debug.hdf5");
+        // D_xy_.Write_to_HDF5("debug.hdf5","mu1");
+        // D_xx_.Write_to_HDF5("debug.hdf5","e1x");
+        // D_yy_.Write_to_HDF5("debug.hdf5","e1y");
+        // D_zz_.Write_to_HDF5("debug.hdf5","e1z");
+
+#else
+        D_xx_.Write_to_HDF5("debug.hdf5","mu1");
+        D_xy_.Write_to_HDF5("debug.hdf5","mu2");
+        D_xz_.Write_to_HDF5("debug.hdf5","mu3");
+        D_yy_.Write_to_HDF5("debug.hdf5","e1x");
+        D_yz_.Write_to_HDF5("debug.hdf5","e1y");
+        D_zz_.Write_to_HDF5("debug.hdf5","e1z");
+#endif   
+    }
+
+
+public:
+    //! Set up the PLT correction operators from the run configuration.
+    //!
+    //! Reads BoxLength, zstart and GridRes from the "setup" section and Omega_L, Omega_m
+    //! from the "cosmology" section, sizes all internal grids to ngridself^3, and then
+    //! calls init_D() for the lattice named by setup/ParticleLoad ("sc" if unset).
+    //!
+    //! \param the_config  configuration to read the parameters from
+    //! \param ngridself   linear mesh size used for the internal grids (default 64)
+    explicit lattice_gradient( config_file& the_config, size_t ngridself=64 )
+    : boxlen_( the_config.get_value<double>("setup", "BoxLength") ),
+      aini_ ( 1.0/(1.0+the_config.get_value<double>("setup", "zstart")) ),
+      ngmapto_( the_config.get_value<size_t>("setup", "GridRes") ),
+      ngrid_( ngridself ),
+      ngrid32_( std::pow(ngrid_, 1.5) ),
+      mapratio_(real_t(ngrid_)/real_t(ngmapto_)),
+      XmL_ ( the_config.get_value<double>("cosmology", "Omega_L") / the_config.get_value<double>("cosmology", "Omega_m") ),
+      D_xx_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+      D_xy_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+      D_xz_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+      D_yy_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+      D_yz_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+      D_zz_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+      grad_x_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+      grad_y_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0}),
+      grad_z_({ngrid_, ngrid_, ngrid_}, {1.0,1.0,1.0})
+    {
+        music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+
+        // translate the configured particle-load name into a lattice type; every name
+        // other than "bcc", "fcc" and "rsc" selects lattice_sc
+        const std::string lattice_str = the_config.get_value_safe<std::string>("setup","ParticleLoad","sc");
+        const lattice lattice_type = [&lattice_str]() -> lattice {
+            if( lattice_str == "bcc" ){ return lattice_bcc; }
+            if( lattice_str == "fcc" ){ return lattice_fcc; }
+            if( lattice_str == "rsc" ){ return lattice_rsc; }
+            return lattice_sc;
+        }();
+
+        music::ilog << "PLT corrections for " << lattice_str << " lattice will be computed on " << ngrid_ << "**3 mesh" << std::endl;
+
+        // time the eigenmode computation and report it on the same log line
+        const double t_start = get_wtime();
+        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing PLT eigenmodes "<< std::flush;
+
+        init_D( lattice_type );
+
+        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime()-t_start << "s" << std::endl;
+    }
+
+    //! Return component idim of the stored gradient kernel at mode ijk.
+    //!
+    //! The index triple is scaled by mapratio_, D_xx_, D_yy_ and D_zz_ are sampled at the
+    //! scaled position with get_cic_kspace(), and the real parts of the three samples are
+    //! combined as radial, polar and azimuthal parts in spherical coordinates defined by
+    //! the wave vector returned by D_xx_.get_k(). The returned value is purely imaginary.
+    //!
+    //! \param idim  0 selects the x component, 1 the y component, any other value z
+    //! \param ijk   mode indices before scaling by mapratio_
+    //! \return      ccomplex_t(0, k * component) with k = |kv| / mapratio_ / boxlen_
+    inline ccomplex_t gradient( const int idim, std::array<size_t,3> ijk ) const
+    {
+        // mode position after scaling by mapratio_
+        const real_t ix = ijk[0]*mapratio_;
+        const real_t iy = ijk[1]*mapratio_;
+        const real_t iz = ijk[2]*mapratio_;
+
+        auto kv = D_xx_.get_k<real_t>( ix, iy, iz );
+        auto kmod = kv.norm() / mapratio_ / boxlen_;
+
+        // sampled amplitudes along the radial, polar and azimuthal unit vectors
+        auto D_r = std::real(D_xx_.get_cic_kspace({ix,iy,iz}));
+        auto D_theta = std::real(D_yy_.get_cic_kspace({ix,iy,iz}));
+        auto D_phi = std::real(D_zz_.get_cic_kspace({ix,iy,iz}));
+
+        // spherical angles of the wave vector; both angles are set to zero for kv = 0
+        const real_t kr = kv.norm();
+        const real_t kphi = kr>0.0? std::atan2(kv.y,kv.x) : 0.0;
+        const real_t ktheta = kr>0.0? std::acos( kv.z / kr ) : 0.0;
+
+        const real_t st = std::sin(ktheta);
+        const real_t ct = std::cos(ktheta);
+        const real_t sp = std::sin(kphi);
+        const real_t cp = std::cos(kphi);
+
+        switch( idim ){
+            case 0:
+                return ccomplex_t(0.0, kmod*(D_r * st * cp + D_theta * ct * cp - D_phi * sp));
+            case 1:
+                return ccomplex_t(0.0, kmod*(D_r  * st * sp + D_theta * ct * sp + D_phi * cp));
+            default:
+                return ccomplex_t(0.0, kmod*(D_r  * ct - D_theta * st));
+        }
+    }
+
+    //! Return the correction factor stored in D_xy_ for mode ijk.
+    //!
+    //! The index triple is scaled by mapratio_ and D_xy_ is sampled there with
+    //! get_cic_kspace(); the real part of the sample is inverted to give alpha and
+    //! 1/alpha is returned.
+    //!
+    //! \param ijk  mode indices before scaling by mapratio_
+    inline real_t vfac_corr( std::array<size_t,3> ijk  ) const
+    {
+        const real_t ix = ijk[0]*mapratio_;
+        const real_t iy = ijk[1]*mapratio_;
+        const real_t iz = ijk[2]*mapratio_;
+
+        const auto d_xy = std::real(D_xy_.get_cic_kspace({ix,iy,iz}));
+        const real_t alpha = 1.0/d_xy;
+
+        return 1.0/alpha;
+        // The disabled expression below is the LCDM form; it is a tiny correction for
+        // typical starting redshifts. X = \Omega_\Lambda / \Omega_m
+        // return 1.0 / (alpha - (2*std::pow(aini_,3)*alpha*(2 + alpha)*XmL_*Hypergeometric2F1((3 + alpha)/3.,(5 + alpha)/3.,
+        //     (13 + 4*alpha)/6.,-(std::pow(aini_,3)*XmL_)))/
+        //     ((7 + 4*alpha)*Hypergeometric2F1(alpha/3.,(2 + alpha)/3.,(7 + 4*alpha)/6.,-(std::pow(aini_,3)*XmL_))));
+    }
+
+};
+
+}
--- a/include/physical_constants.hh
+++ b/include/physical_constants.hh
@ -0,0 +1,62 @@
+#pragma once
+/*******************************************************************************\
+ physical_constants.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
+ 
+ CHANGELOG (only majors, for details see repo):
+    06/2019 - Oliver Hahn - first implementation
+\*******************************************************************************/
+
+// physical constants for convenience, all values have been taken from
+// the 2018 edition of the Particle Data Group Booklet,
+// http://pdg.lbl.gov/2019/mobile/reviews/pdf/rpp2018-rev-phys-constants-m.pdf
+
+namespace phys_const
+{
+// pi, defined locally so that no other header has to be included just for this value
+constexpr double pi_ = 3.141592653589793115997963468544185161590576171875;
+
+//--- unit conversions ---------------------------------------------------
+
+// one megaparsec expressed in metres
+constexpr double Mpc_SI = 3.0857e22;
+
+// one gigayear expressed in seconds (equals 1e9 * 365 * 86400, i.e. 365-day years)
+constexpr double Gyr_SI = 3.1536e16;
+
+// one electronvolt expressed in joules
+// NOTE(review): the other constants here match the CODATA 2014 figures, while this one
+// looks like the CODATA 2006 figure (2014 lists 1.6021766208e-19) - confirm which is intended
+constexpr double eV_SI = 1.602176487e-19;
+
+// one erg expressed in joules
+constexpr double erg_SI = 1e-7;
+
+//--- physical constants ------------------------------------------------
+
+// speed of light c, in m/s
+constexpr double c_SI = 2.99792458e8;
+
+// gravitational constant G, in m^3/s^2/kg
+constexpr double G_SI = 6.6740800e-11;
+
+// Boltzmann constant k_B, in kg m^2/s^2/K
+constexpr double kB_SI = 1.38064852e-23;
+
+// reduced Planck constant \hbar, in kg m^2/s
+constexpr double hbar_SI = 1.054571800e-34;
+
+// Stefan-Boltzmann constant sigma = pi^2 k_B^4 / (60 \hbar^3 c^2), in J/m^2/s/K^4
+constexpr double sigma_SI = (pi_ * pi_) * (kB_SI * kB_SI * kB_SI * kB_SI) / 60. / (hbar_SI * hbar_SI * hbar_SI) / (c_SI * c_SI);
+
+// electron mass, in kg
+constexpr double me_SI = 9.10938356e-31;
+
+// proton mass, in kg
+constexpr double mp_SI = 1.672621898e-27;
+
+// unified atomic mass unit (u), in kg
+constexpr double u_SI = 1.660539040e-27;
+
+// critical density of the Universe, in h^2 kg/m^3: 3 H^2 / (8 pi G) evaluated with
+// H = 1e5 m/s per Mpc (100 km/s/Mpc)
+constexpr double rhocrit_h2_SI = 3 * 1e10 / (8 * pi_ * G_SI) / Mpc_SI / Mpc_SI;
+
+} // namespace phys_const
--- a/include/random_plugin.hh
+++ b/include/random_plugin.hh
@ -10,21 +10,21 @@
 class RNG_plugin
 {
  protected:
-    ConfigFile *pcf_; //!< pointer to config_file from which to read parameters
+    config_file *pcf_; //!< pointer to config_file from which to read parameters
  public:
-    explicit RNG_plugin(ConfigFile &cf)
+    explicit RNG_plugin(config_file &cf)
        : pcf_(&cf)
    {
    }
    virtual ~RNG_plugin() {}
    virtual bool isMultiscale() const = 0;
-    virtual void Fill_Grid( Grid_FFT<real_t>& g ) const = 0;
+    virtual void Fill_Grid( Grid_FFT<real_t>& g ) = 0;//const = 0;
    //virtual void FillGrid(int level, DensityGrid<real_t> &R) = 0;
 };

 struct RNG_plugin_creator
 {
-    virtual std::unique_ptr<RNG_plugin> Create(ConfigFile &cf) const = 0;
+    virtual std::unique_ptr<RNG_plugin> Create(config_file &cf) const = 0;
    virtual ~RNG_plugin_creator() {}
 };

@ -42,14 +42,14 @@ struct RNG_plugin_creator_concrete : public RNG_plugin_creator
    }

    //! create an instance of the plugin
-    std::unique_ptr<RNG_plugin> Create(ConfigFile &cf) const
+    std::unique_ptr<RNG_plugin> Create(config_file &cf) const
    {
        return std::make_unique<Derived>(cf);
    }
 };

 typedef RNG_plugin RNG_instance;
-std::unique_ptr<RNG_plugin> select_RNG_plugin( ConfigFile &cf);
+std::unique_ptr<RNG_plugin> select_RNG_plugin( config_file &cf);

 // /*!
 //  * @brief encapsulates all things for multi-scale white noise generation
@ -58,18 +58,18 @@ std::unique_ptr<RNG_plugin> select_RNG_plugin( ConfigFile &cf);
 // class random_number_generator
 // {
 //   protected:
-//     ConfigFile *pcf_;
+//     config_file *pcf_;
 //     //const refinement_hierarchy * prefh_;
 //     RNG_plugin *generator_;
 //     int levelmin_, levelmax_;

 //   public:
 //     //! constructor
-//     random_number_generator( ConfigFile &cf )
+//     random_number_generator( config_file &cf )
 //         : pcf_(&cf) //, prefh_( &refh )
 //     {
-//         levelmin_ = pcf_->GetValue<int>("setup", "levelmin");
-//         levelmax_ = pcf_->GetValue<int>("setup", "levelmax");
+//         levelmin_ = pcf_->get_value<int>("setup", "levelmin");
+//         levelmax_ = pcf_->get_value<int>("setup", "levelmax");
 //         generator_ = select_RNG_plugin(cf);
 //     }

--- a/include/system_stat.hh
+++ b/include/system_stat.hh
@ -1,3 +1,10 @@
+/*******************************************************************\
+ system_stat.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
+ 
+ CHANGELOG (only majors, for details see repo):
+    08/2019 - Oliver Hahn - first implementation
+\*******************************************************************/
 #pragma once

 #include <string>
--- a/include/testing.hh
+++ b/include/testing.hh
@ -1,13 +1,21 @@
+/*******************************************************************\
+ testing.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
+ 
+ CHANGELOG (only majors, for details see repo):
+    10/2019 - Michael Michaux & Oliver Hahn - first implementation
+\*******************************************************************/
 #pragma once

 #include <array>
 #include <general.hh>
 #include <config_file.hh>
 #include <grid_fft.hh>
+#include <cosmology_calculator.hh>

 namespace testing{
    void output_potentials_and_densities( 
-        ConfigFile& the_config,
+        config_file& the_config,
        size_t ngrid, real_t boxlen,
        Grid_FFT<real_t>& phi,
        Grid_FFT<real_t>& phi2,
@ -16,7 +24,7 @@ namespace testing{
        std::array< Grid_FFT<real_t>*,3 >& A3 );

    void output_velocity_displacement_symmetries(
-        ConfigFile &the_config,
+        config_file &the_config,
        size_t ngrid, real_t boxlen, real_t vfac, real_t dplus,
        Grid_FFT<real_t> &phi,
        Grid_FFT<real_t> &phi2,
@ -26,7 +34,8 @@ namespace testing{
        bool bwrite_out_fields=false);

    void output_convergence(
-        ConfigFile &the_config,
+        config_file &the_config,
+        cosmology::calculator* the_cosmo_calc,
        std::size_t ngrid, real_t boxlen, real_t vfac, real_t dplus,
        Grid_FFT<real_t> &phi,
        Grid_FFT<real_t> &phi2,
--- a/include/transfer_function_plugin.hh
+++ b/include/transfer_function_plugin.hh
@ -13,22 +13,29 @@ enum tf_type
    vtotal,
    vcdm,
    vbaryon,
-    total0
+    total0,
+    cdm0,
+    baryon0,
+    vtotal0,
+    vcdm0,
+    vbaryon0,
 };

 class TransferFunction_plugin
 {
  public:
    // Cosmology cosmo_;    //!< cosmological parameter, read from config_file
-    ConfigFile *pcf_;   //!< pointer to config_file from which to read parameters
+    config_file *pcf_;   //!< pointer to config_file from which to read parameters
    bool tf_distinct_;   //!< bool if density transfer function is distinct for baryons and DM
    bool tf_withvel_;    //!< bool if also have velocity transfer functions
    bool tf_withtotal0_; //!< have the z=0 spectrum for normalisation purposes
    bool tf_velunits_;   //!< velocities are in velocity units (km/s)
+    bool tf_isnormalised_; //!< assume that transfer functions come already correctly normalised and need be re-normalised to a specified value
+    
  public:
    //! constructor
-    TransferFunction_plugin(ConfigFile &cf)
-        : pcf_(&cf), tf_distinct_(false), tf_withvel_(false), tf_withtotal0_(false), tf_velunits_(false)
+    TransferFunction_plugin(config_file &cf)
+        : pcf_(&cf), tf_distinct_(false), tf_withvel_(false), tf_withtotal0_(false), tf_velunits_(false), tf_isnormalised_(false)
    { }

    //! destructor
@ -75,7 +82,7 @@ class TransferFunction_plugin
 struct TransferFunction_plugin_creator
 {
    //! create an instance of a transfer function plug-in
-    virtual std::unique_ptr<TransferFunction_plugin> create(ConfigFile &cf) const = 0;
+    virtual std::unique_ptr<TransferFunction_plugin> create(config_file &cf) const = 0;

    //! destroy an instance of a plug-in
    virtual ~TransferFunction_plugin_creator() {}
@ -96,7 +103,7 @@ struct TransferFunction_plugin_creator_concrete : public TransferFunction_plugin
    }

    //! create an instance of the plug-in
-    std::unique_ptr<TransferFunction_plugin> create(ConfigFile &cf) const
+    std::unique_ptr<TransferFunction_plugin> create(config_file &cf) const
    {
        return std::make_unique<Derived>(cf);
    }
@ -104,4 +111,4 @@ struct TransferFunction_plugin_creator_concrete : public TransferFunction_plugin

 // typedef TransferFunction_plugin TransferFunction;

-std::unique_ptr<TransferFunction_plugin> select_TransferFunction_plugin(ConfigFile &cf);
+std::unique_ptr<TransferFunction_plugin> select_TransferFunction_plugin(config_file &cf);
--- a/include/vec.hh
+++ b/include/vec.hh
@ -0,0 +1,144 @@
+#pragma once
+/*******************************************************************************\
+ vec.hh - This file is part of MUSIC2 -
+ a code to generate initial conditions for cosmological simulations 
+ 
+ CHANGELOG (only majors, for details see repo):
+    06/2019 - Oliver Hahn - first implementation
+\*******************************************************************************/
+
+#include <array>
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+
+//! implements general N-dim vectors of arbitrary primitive type with some arithmetic ops
+//!
+//! Components are stored in a std::array<T, N>; every arithmetic operation acts
+//! component by component.
+template <int N, typename T = double>
+struct vec_t
+{
+  std::array<T, N> data_;
+
+  //! default constructor; value-initialises (zeroes) all components so that a
+  //! default-constructed vector never holds indeterminate values
+  vec_t() : data_{} {}
+
+  //! copy constructor
+  vec_t(const vec_t<N, T> &v) = default;
+
+  //! move constructor
+  vec_t(vec_t<N, T> &&v) = default;
+
+  //! construct from exactly N component values
+  //!
+  //! Each argument is converted to T explicitly, so that e.g. integer literals can be
+  //! used to build a floating point vector without a narrowing-conversion diagnostic.
+  template <typename... E>
+  vec_t(E... e)
+      : data_{{static_cast<T>(e)...}}
+  {
+    static_assert(static_cast<int>(sizeof...(E)) == N, "Brace-enclosed initialiser list doesn't match vec_t length!");
+  }
+
+  //! bracket index access to vector components (no bounds checking)
+  T &operator[](size_t i) noexcept { return data_[i]; }
+
+  //! const bracket index access to vector components (no bounds checking)
+  const T &operator[](size_t i) const noexcept { return data_[i]; }
+
+  //! copy assignment operator
+  vec_t<N, T> &operator=(const vec_t<N, T> &v) = default;
+
+  //! implementation of summation of vec_t
+  vec_t<N, T> operator+(const vec_t<N, T> &v) const noexcept
+  {
+    vec_t<N, T> res;
+    for (int i = 0; i < N; ++i)
+      res[i] = data_[i] + v[i];
+    return res;
+  }
+
+  //! implementation of difference of vec_t
+  vec_t<N, T> operator-(const vec_t<N, T> &v) const noexcept
+  {
+    vec_t<N, T> res;
+    for (int i = 0; i < N; ++i)
+      res[i] = data_[i] - v[i];
+    return res;
+  }
+
+  //! implementation of unary negative
+  vec_t<N, T> operator-() const noexcept
+  {
+    vec_t<N, T> res;
+    for (int i = 0; i < N; ++i)
+      res[i] = -data_[i];
+    return res;
+  }
+
+  //! implementation of scalar multiplication; the scalar may be of any type T2 for
+  //! which T * T2 is defined, the product is stored back as T
+  template <typename T2>
+  vec_t<N, T> operator*(T2 s) const noexcept
+  {
+    vec_t<N, T> res;
+    for (int i = 0; i < N; ++i)
+      res[i] = data_[i] * s;
+    return res;
+  }
+
+  //! implementation of scalar division; the divisor is converted to T first
+  vec_t<N, T> operator/(T s) const noexcept
+  {
+    vec_t<N, T> res;
+    for (int i = 0; i < N; ++i)
+      res[i] = data_[i] / s;
+    return res;
+  }
+
+  //! takes the absolute value of each element
+  vec_t<N, T> abs(void) const noexcept
+  {
+    vec_t<N, T> res;
+    for (int i = 0; i < N; ++i)
+      res[i] = std::abs(data_[i]);
+    return res;
+  }
+
+  //! implementation of implicit summation of vec_t
+  vec_t<N, T> &operator+=(const vec_t<N, T> &v) noexcept
+  {
+    for (int i = 0; i < N; ++i)
+      data_[i] += v[i];
+    return *this;
+  }
+
+  //! implementation of implicit subtraction of vec_t
+  vec_t<N, T> &operator-=(const vec_t<N, T> &v) noexcept
+  {
+    for (int i = 0; i < N; ++i)
+      data_[i] -= v[i];
+    return *this;
+  }
+
+  //! implementation of implicit scalar multiplication of vec_t
+  vec_t<N, T> &operator*=(T s) noexcept
+  {
+    for (int i = 0; i < N; ++i)
+      data_[i] *= s;
+    return *this;
+  }
+
+  //! implementation of implicit scalar division of vec_t
+  vec_t<N, T> &operator/=(T s) noexcept
+  {
+    for (int i = 0; i < N; ++i)
+      data_[i] /= s;
+    return *this;
+  }
+
+  //! number of components, i.e. N
+  size_t size(void) const noexcept { return N; }
+};
+
+//! multiplication with scalar from the left (s * v); forwards to the member
+//! operator*, so each component is evaluated as v[i] * s
+template <typename T2, int N, typename T = double>
+inline vec_t<N, T> operator*(T2 s, const vec_t<N, T> &v)
+{
+  return v * s;
+}
--- a/include/vec3.hh
+++ b/include/vec3.hh
@ -1,41 +0,0 @@
-#pragma once
-
-template< typename T >
-class vec3{
-private:
-    std::array<T,3> data_;
-    T &x,&y,&z;
-public:    
-    vec3()
-    : x(data_[0]),y(data_[1]),z(data_[2]){}
-
-    vec3( const vec3<T> &v)
-    : data_(v.data_), x(data_[0]),y(data_[1]),z(data_[2]){}
-
-    vec3( std::array<T,3>&& d )
-    : data_(std::move(d)), x(data_[0]),y(data_[1]),z(data_[2]){}
-    
-    vec3( vec3<T> &&v)
-    : data_(std::move(v.data_)), x(data_[0]),y(data_[1]),z(data_[2]){}
-
-    T &operator[](size_t i){ return data_[i];}
-    
-    const T &operator[](size_t i) const { return data_[i]; }
-    
-    T dot(const vec3<T> &a) const 
-    {
-        return data_[0] * a.data_[0] + data_[1] * a.data_[1] + data_[2] * a.data_[2];
-    }
-    
-    T norm_squared(void) const
-    {
-        return this->dot(*this);
-    }
-
-    T norm(void) const
-    {
-        return std::sqrt( this->norm_squared() );
-    }
-
-    
-};
--- a/new/FindFFTW3.cmake
+++ b/new/FindFFTW3.cmake
@ -1,232 +0,0 @@
-# - Try to find FFTW
-#
-# By default, it will look only for the serial libraries with single, double,
-# and long double precision. Any combination of precision (SINGLE, DOUBLE,
-# LONGDOUBLE) and library type (SERIAL, [THREADS|OPENMP], MPI) is possible by
-# using the COMPONENTS keyword. For example,
-#
-# find_package(FFTW3 COMPONENTS SINGLE DOUBLE OPENMP MPI)
-#
-# Once done this will define
-#  FFTW3_FOUND - System has FFTW3
-#  FFTW3_INCLUDE_DIRS - The FFTW3 include directories
-#  FFTW3_LIBRARIES - The libraries needed to use FFTW3
-#  FFTW3_DEFINITIONS - Compiler switches required for using FFTW3
-#  FFTW3_$KIND_$PARALLEL_FOUND- Set if FFTW3 exists in KIND precision format for PARALLEL mode.
-#                             where KIND can be: SINGLE, DOUBLE, LONGDOUBLE
-#                             and PARALLEL: SERIAL, OPENMP, MPI, THREADS.
-#  FFTW3_$KIND_$PARALLEL_LIBRARY - The libraries needed to use.
-#  FFTW3_INCLUDE_DIR_PARALLEL - The FFTW3 include directories for parallels mode.
-
-cmake_policy(SET CMP0054 NEW)
-
-if(FFTW3_FOUND)
-  return()
-endif()
-
-if(FFTW3_INCLUDE_DIR AND FFTW3_LIBRARIES)
-  set(FFTW3_FOUND TRUE)
-  foreach(component ${FFTW3_FIND_COMPONENTS})
-    if("${FFTW3_${component}_LIBRARY}" STREQUAL "")
-        set(FFTW3_${component}_LIBRARY "${FFTW3_LIBRARIES}")
-    endif()
-  endforeach()
-  return()
-endif()
-
-macro(find_specific_libraries KIND PARALLEL)
-  list(APPEND FFTW3_FIND_COMPONENTS ${KIND}_${PARALLEL})
-  if(NOT (${PARALLEL} STREQUAL "SERIAL") AND NOT ${PARALLEL}_FOUND)
-    message(FATAL_ERROR "Please, find ${PARALLEL} libraries before FFTW")
-  endif()
-
-  find_library(FFTW3_${KIND}_${PARALLEL}_LIBRARY NAMES
-    fftw3${SUFFIX_${KIND}}${SUFFIX_${PARALLEL}}${SUFFIX_FINAL} HINTS ${HINT_DIRS})
-  if(FFTW3_${KIND}_${PARALLEL}_LIBRARY MATCHES fftw3)
-    list(APPEND FFTW3_LIBRARIES ${FFTW3_${KIND}_${PARALLEL}_LIBRARY})
-    set(FFTW3_${KIND}_${PARALLEL}_FOUND TRUE)
-
-    STRING(TOLOWER "${KIND}" kind)
-    STRING(TOLOWER "${PARALLEL}" parallel)
-    if(FFTW3_${kind}_${parallel}_LIBRARY MATCHES "\\.a$")
-      add_library(fftw3::${kind}::${parallel} STATIC IMPORTED GLOBAL)
-    else()
-      add_library(fftw3::${kind}::${parallel} SHARED IMPORTED GLOBAL)
-    endif()
-
-    # MPI Has a different included library than the others
-    # FFTW3_INCLUDE_DIR_PARALLEL will change depending of which on is used.
-    set(FFTW3_INCLUDE_DIR_PARALLEL ${FFTW3_INCLUDE_DIR} )
-    if(PARALLEL STREQUAL "MPI")
-      set(FFTW3_INCLUDE_DIR_PARALLEL ${FFTW3_${PARALLEL}_INCLUDE_DIR})
-    endif()
-
-    set_target_properties(fftw3::${kind}::${parallel} PROPERTIES
-      IMPORTED_LOCATION "${FFTW3_${KIND}_${PARALLEL}_LIBRARY}"
-      INTERFACE_INCLUDE_DIRECTORIES "${FFTW3_INCLUDE_DIR_PARALLEL}")
-
-    # adding target properties to the different cases
-    ##   MPI
-    if(PARALLEL STREQUAL "MPI")
-      if(MPI_C_LIBRARIES)
-        set_target_properties(fftw3::${kind}::mpi PROPERTIES
-          IMPORTED_LOCATION "${FFTW3_${KIND}_${PARALLEL}_LIBRARY}"
-          INTERFACE_INCLUDE_DIRECTORIES "${FFTW3_INCLUDE_DIR_PARALLEL}"
-          IMPORTED_LINK_INTERFACE_LIBRARIES ${MPI_C_LIBRARIES})
-      endif()
-    endif()
-    ##   OpenMP
-    if(PARALLEL STREQUAL "OPENMP")
-      if(OPENMP_C_FLAGS)
-        set_target_properties(fftw3::${kind}::${parallel} PROPERTIES
-           IMPORTED_LOCATION "${FFTW3_${KIND}_${PARALLEL}_LIBRARY}"
-           INTERFACE_INCLUDE_DIRECTORIES "${FFTW3_INCLUDE_DIR_PARALLEL}"
-           INTERFACE_COMPILE_OPTIONS "${OPENMP_C_FLAGS}")
-        endif()
-    endif()
-    ##  THREADS
-    if(PARALLEL STREQUAL "THREADS")
-      if(CMAKE_THREAD_LIBS_INIT) # TODO: this is not running
-        set_target_properties(fftw3::${kind}::${parallel} PROPERTIES
-          IMPORTED_LOCATION "${FFTW3_${KIND}_${PARALLEL}_LIBRARY}"
-          INTERFACE_INCLUDE_DIRECTORIES "${FFTW3_INCLUDE_DIR_PARALLEL}"
-          INTERFACE_COMPILE_OPTIONS "${CMAKE_THREAD_LIBS_INIT}")
-      endif()
-    endif()
-  endif()
-endmacro()
-
-
-
-
-if(NOT FFTW3_FIND_COMPONENTS)
-  set(FFTW3_FIND_COMPONENTS SINGLE DOUBLE LONGDOUBLE SERIAL)
-endif()
-
-string(TOUPPER "${FFTW3_FIND_COMPONENTS}" FFTW3_FIND_COMPONENTS)
-
-list(FIND FFTW3_FIND_COMPONENTS SINGLE LOOK_FOR_SINGLE)
-list(FIND FFTW3_FIND_COMPONENTS DOUBLE LOOK_FOR_DOUBLE)
-list(FIND FFTW3_FIND_COMPONENTS LONGDOUBLE LOOK_FOR_LONGDOUBLE)
-list(FIND FFTW3_FIND_COMPONENTS THREADS LOOK_FOR_THREADS)
-list(FIND FFTW3_FIND_COMPONENTS OPENMP LOOK_FOR_OPENMP)
-list(FIND FFTW3_FIND_COMPONENTS MPI LOOK_FOR_MPI)
-list(FIND FFTW3_FIND_COMPONENTS SERIAL LOOK_FOR_SERIAL)
-
-# FIXME - This may fail in computers wihtout serial
-# Default serial to obtain version number
-set(LOOK_FOR_SERIAL 1)
-
-# set serial as default if none parallel component has been set
-if((LOOK_FOR_THREADS LESS 0) AND (LOOK_FOR_MPI LESS 0) AND
-    (LOOK_FOR_OPENMP LESS 0))
-  set(LOOK_FOR_SERIAL 1)
-endif()
-
-if(MPI_C_FOUND)
-  set(MPI_FOUND ${MPI_C_FOUND})
-endif()
-unset(FFTW3_FIND_COMPONENTS)
-
-
-
-
-if(WIN32)
-  set(HINT_DIRS ${FFTW3_DIRECTORY} $ENV{FFTW3_DIRECTORY})
-else()
-  find_package(PkgConfig)
-  if(PKG_CONFIG_FOUND)
-    pkg_check_modules(PC_FFTW QUIET fftw3)
-    set(FFTW3_DEFINITIONS ${PC_FFTW3_CFLAGS_OTHER})
-  endif()
-  set(HINT_DIRS ${PC_FFTW3_INCLUDEDIR} ${PC_FFTW3_INCLUDE_DIRS}
-    ${FFTW3_INCLUDE_DIR} $ENV{FFTW3_INCLUDE_DIR} )
-endif()
-
-find_path(FFTW3_INCLUDE_DIR NAMES fftw3.h HINTS ${HINT_DIRS})
-if (LOOK_FOR_MPI)  # Probably is going to be the same as fftw3.h
-  find_path(FFTW3_MPI_INCLUDE_DIR NAMES fftw3-mpi.h HINTS ${HINT_DIRS})
-endif()
-
-function(find_version OUTVAR LIBRARY SUFFIX)
-    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/fftw${SUFFIX}/main.c
-      # TODO: do we need to add include for mpi headers?
-      "#include <fftw3.h>
-       #include <stdio.h>
-       int main(int nargs, char const *argv[]) {
-           printf(\"%s\", fftw${SUFFIX}_version);
-           return 0;
-       }"
-  )
-if(NOT CMAKE_CROSSCOMPILING)
-    try_run(RUN_RESULT COMPILE_RESULT
-        "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/fftw${SUFFIX}/"
-        "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/fftw${SUFFIX}/main.c"
-        CMAKE_FLAGS
-          -DLINK_LIBRARIES=${LIBRARY}
-          -DINCLUDE_DIRECTORIES=${FFTW3_INCLUDE_DIR}
-        RUN_OUTPUT_VARIABLE OUTPUT
-        COMPILE_OUTPUT_VARIABLE COUTPUT
-    )
-  endif()
-  if(RUN_RESULT EQUAL 0)
-    string(REGEX REPLACE
-        ".*([0-9]+\\.[0-9]+\\.[0-9]+).*"
-        "\\1" VERSION_STRING "${OUTPUT}"
-    )
-    set(${OUTVAR} ${VERSION_STRING} PARENT_SCOPE)
-  endif()
-endfunction()
-
-set(SUFFIX_DOUBLE "")
-set(SUFFIX_SINGLE "f")
-set(SUFFIX_LONGDOUBLE "l")
-set(SUFFIX_SERIAL "")
-set(SUFFIX_OPENMP "_omp")
-set(SUFFIX_MPI "_mpi")
-set(SUFFIX_THREADS "_threads")
-set(SUFFIX_FINAL "")
-
-if(WIN32)
-  set(SUFFIX_FINAL "-3")
-else()
-  set(HINT_DIRS ${PC_FFTW3_LIBDIR} ${PC_FFTW3_LIBRARY_DIRS}
-    $ENV{FFTW3_LIBRARY_DIR} ${FFTW3_LIBRARY_DIR} )
-endif(WIN32)
-
-unset(FFTW3_LIBRARIES)
-set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR} ) # TODO what's for?
-set(FFTW3_FLAGS_C "")
-foreach(KIND SINGLE DOUBLE LONGDOUBLE)
-  if(LOOK_FOR_${KIND} LESS 0)
-    continue()
-  endif()
-  foreach(PARALLEL SERIAL MPI OPENMP THREADS)
-    if(LOOK_FOR_${PARALLEL} LESS 0)
-      continue()
-    endif()
-    find_specific_libraries(${KIND} ${PARALLEL})
-  endforeach()
-endforeach()
-
-if(FFTW3_INCLUDE_DIR)
-  list(GET FFTW3_FIND_COMPONENTS 0 smallerrun)
-  string(REPLACE "_" ";" RUNLIST ${smallerrun})
-  list(GET RUNLIST 0 KIND)
-  list(GET RUNLIST 1 PARALLEL)
-  unset(smallerrun)
-  unset(RUNLIST)
-  # suffix is quoted so it pass empty in the case of double as it's empty
-  find_version(FFTW3_VERSION_STRING ${FFTW3_${KIND}_${PARALLEL}_LIBRARY}
-    "${SUFFIX_${KIND}}")
-endif()
-
-# FIXME: fails if use REQUIRED.
-include(FindPackageHandleStandardArgs)
-# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE
-# if all listed variables are TRUE
-find_package_handle_standard_args(FFTW3
-    REQUIRED_VARS FFTW3_LIBRARIES FFTW3_INCLUDE_DIR
-    VERSION_VAR FFTW3_VERSION_STRING
-    HANDLE_COMPONENTS
-)
--- a/src/grid_fft.cc
+++ b/src/grid_fft.cc
@ -2,192 +2,173 @@
 #include <grid_fft.hh>
 #include <thread>

-#include <gsl/gsl_rng.h>
-#include <gsl/gsl_randist.h>
-
-template <typename data_t>
-void Grid_FFT<data_t>::FillRandomReal(unsigned long int seed)
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::Setup(void)
 {
-    gsl_rng *RNG = gsl_rng_alloc(gsl_rng_mt19937);
-#if defined(USE_MPI)
-    seed += 17321 * CONFIG::MPI_task_rank;
-#endif
-    gsl_rng_set(RNG, seed);
-
-    for (size_t i = 0; i < sizes_[0]; ++i)
+    if (!bdistributed)
    {
-        for (size_t j = 0; j < sizes_[1]; ++j)
+        ntot_ = (n_[2] + 2) * n_[1] * n_[0];
+
+        music::dlog.Print("[FFT] Setting up a shared memory field %lux%lux%lu\n", n_[0], n_[1], n_[2]);
+        if (typeid(data_t) == typeid(real_t))
        {
-            for (size_t k = 0; k < sizes_[2]; ++k)
-            {
-                this->relem(i, j, k) = gsl_ran_ugaussian_ratio_method(RNG);
-            }
+            data_ = reinterpret_cast<data_t *>(fftw_malloc(ntot_ * sizeof(real_t)));
+            cdata_ = reinterpret_cast<ccomplex_t *>(data_);
+
+            plan_ = FFTW_API(plan_dft_r2c_3d)(n_[0], n_[1], n_[2], (real_t *)data_, (complex_t *)data_, FFTW_RUNMODE);
+            iplan_ = FFTW_API(plan_dft_c2r_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (real_t *)data_, FFTW_RUNMODE);
+        }
+        else if (typeid(data_t) == typeid(ccomplex_t))
+        {
+            data_ = reinterpret_cast<data_t *>(fftw_malloc(ntot_ * sizeof(ccomplex_t)));
+            cdata_ = reinterpret_cast<ccomplex_t *>(data_);
+
+            plan_ = FFTW_API(plan_dft_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (complex_t *)data_, FFTW_FORWARD, FFTW_RUNMODE);
+            iplan_ = FFTW_API(plan_dft_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (complex_t *)data_, FFTW_BACKWARD, FFTW_RUNMODE);
+        }
+        else
+        {
+            // Unsupported data_t: neither branch above has assigned data_, cdata_, plan_ or iplan_,
+            // so carrying on would hand an unconfigured grid to the caller. Stop here, exactly as the
+            // distributed (USE_MPI) branch of this function does for the same condition.
+            music::elog.Print("invalid data type in Grid_FFT<data_t>::setup_fft_interface\n");
+            abort();
+        }
+
+        fft_norm_fac_ = 1.0 / std::sqrt((real_t)((size_t)n_[0] * (real_t)n_[1] * (real_t)n_[2]));
+
+        if (typeid(data_t) == typeid(real_t))
+        {
+            npr_ = n_[2] + 2;
+            npc_ = n_[2] / 2 + 1;
+        }
+        else
+        {
+            npr_ = n_[2];
+            npc_ = n_[2];
+        }
+
+        for (int i = 0; i < 3; ++i)
+        {
+            nhalf_[i] = n_[i] / 2;
+            kfac_[i] = 2.0 * M_PI / length_[i];
+            kny_[i] = kfac_[i] * n_[i]/2;
+            dx_[i] = length_[i] / n_[i];
+
+            global_range_.x1_[i] = 0;
+            global_range_.x2_[i] = n_[i];
+        }
+
+        local_0_size_ = n_[0];
+        local_1_size_ = n_[1];
+        local_0_start_ = 0;
+        local_1_start_ = 0;
+
+        if (space_ == rspace_id)
+        {
+            sizes_[0] = n_[0];
+            sizes_[1] = n_[1];
+            sizes_[2] = n_[2];
+            sizes_[3] = npr_;
+        }
+        else
+        {
+            sizes_[0] = n_[1];
+            sizes_[1] = n_[0];
+            sizes_[2] = npc_;
+            sizes_[3] = npc_;
        }
    }
-
-    gsl_rng_free(RNG);
-}
-
-template <typename data_t>
-void Grid_FFT<data_t>::Setup(void)
-{
-#if !defined(USE_MPI) ////////////////////////////////////////////////////////////////////////////////////////////
-
-    ntot_ = (n_[2] + 2) * n_[1] * n_[0];
-
-    csoca::dlog.Print("[FFT] Setting up a shared memory field %lux%lux%lu\n", n_[0], n_[1], n_[2]);
-    if (typeid(data_t) == typeid(real_t))
-    {
-        data_ = reinterpret_cast<data_t *>(fftw_malloc(ntot_ * sizeof(real_t)));
-        cdata_ = reinterpret_cast<ccomplex_t *>(data_);
-
-        plan_ = FFTW_API(plan_dft_r2c_3d)(n_[0], n_[1], n_[2], (real_t *)data_, (complex_t *)data_, FFTW_RUNMODE);
-        iplan_ = FFTW_API(plan_dft_c2r_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (real_t *)data_, FFTW_RUNMODE);
-    }
-    else if (typeid(data_t) == typeid(ccomplex_t))
-    {
-        data_ = reinterpret_cast<data_t *>(fftw_malloc(ntot_ * sizeof(ccomplex_t)));
-        cdata_ = reinterpret_cast<ccomplex_t *>(data_);
-
-        plan_ = FFTW_API(plan_dft_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (complex_t *)data_, FFTW_FORWARD, FFTW_RUNMODE);
-        iplan_ = FFTW_API(plan_dft_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (complex_t *)data_, FFTW_BACKWARD, FFTW_RUNMODE);
-    }
    else
    {
-        csoca::elog.Print("invalid data type in Grid_FFT<data_t>::setup_fft_interface\n");
-    }
+#ifdef USE_MPI //// i.e. ifdef USE_MPI ////////////////////////////////////////////////////////////////////////////////////
+        size_t cmplxsz;

-    fft_norm_fac_ = 1.0 / std::sqrt((double)((size_t)n_[0] * (double)n_[1] * (double)n_[2]));
+        if (typeid(data_t) == typeid(real_t))
+        {
+            cmplxsz = FFTW_API(mpi_local_size_3d_transposed)(n_[0], n_[1], n_[2] / 2 + 1, MPI_COMM_WORLD,
+                                                             &local_0_size_, &local_0_start_, &local_1_size_, &local_1_start_);
+            ntot_ = 2 * cmplxsz;
+            data_ = (data_t *)fftw_malloc(ntot_ * sizeof(real_t));
+            cdata_ = reinterpret_cast<ccomplex_t *>(data_);
+            plan_ = FFTW_API(mpi_plan_dft_r2c_3d)(n_[0], n_[1], n_[2], (real_t *)data_, (complex_t *)data_,
+                                                  MPI_COMM_WORLD, FFTW_RUNMODE | FFTW_MPI_TRANSPOSED_OUT);
+            iplan_ = FFTW_API(mpi_plan_dft_c2r_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (real_t *)data_,
+                                                   MPI_COMM_WORLD, FFTW_RUNMODE | FFTW_MPI_TRANSPOSED_IN);
+        }
+        else if (typeid(data_t) == typeid(ccomplex_t))
+        {
+            cmplxsz = FFTW_API(mpi_local_size_3d_transposed)(n_[0], n_[1], n_[2], MPI_COMM_WORLD,
+                                                             &local_0_size_, &local_0_start_, &local_1_size_, &local_1_start_);
+            ntot_ = cmplxsz;
+            data_ = (data_t *)fftw_malloc(ntot_ * sizeof(ccomplex_t));
+            cdata_ = reinterpret_cast<ccomplex_t *>(data_);
+            plan_ = FFTW_API(mpi_plan_dft_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (complex_t *)data_,
+                                              MPI_COMM_WORLD, FFTW_FORWARD, FFTW_RUNMODE | FFTW_MPI_TRANSPOSED_OUT);
+            iplan_ = FFTW_API(mpi_plan_dft_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (complex_t *)data_,
+                                               MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_RUNMODE | FFTW_MPI_TRANSPOSED_IN);
+        }
+        else
+        {
+            music::elog.Print("unknown data type in Grid_FFT<data_t>::setup_fft_interface\n");
+            abort();
+        }

-    if (typeid(data_t) == typeid(real_t))
-    {
-        npr_ = n_[2] + 2;
-        npc_ = n_[2] / 2 + 1;
-    }
-    else
-    {
-        npr_ = n_[2];
-        npc_ = n_[2];
-    }
+        music::dlog.Print("[FFT] Setting up a distributed memory field %lux%lux%lu\n", n_[0], n_[1], n_[2]);

-    for (int i = 0; i < 3; ++i)
-    {
-        nhalf_[i] = n_[i] / 2;
-        kfac_[i] = 2.0 * M_PI / length_[i];
-        dx_[i] = length_[i] / n_[i];
+        fft_norm_fac_ = 1.0 / sqrt((real_t)n_[0] * (real_t)n_[1] * (real_t)n_[2]);

-        global_range_.x1_[i] = 0;
-        global_range_.x2_[i] = n_[i];
-    }
+        if (typeid(data_t) == typeid(real_t))
+        {
+            npr_ = n_[2] + 2;
+            npc_ = n_[2] / 2 + 1;
+        }
+        else
+        {
+            npr_ = n_[2];
+            npc_ = n_[2];
+        }

-    local_0_size_ = n_[0];
-    local_1_size_ = n_[1];
-    local_0_start_ = 0;
-    local_1_start_ = 0;
+        for (int i = 0; i < 3; ++i)
+        {
+            nhalf_[i] = n_[i] / 2;
+            kfac_[i] = 2.0 * M_PI / length_[i];
+            kny_[i] = kfac_[i] * n_[i]/2;
+            dx_[i] = length_[i] / n_[i];

-    if (space_ == rspace_id)
-    {
-        sizes_[0] = n_[0];
-        sizes_[1] = n_[1];
-        sizes_[2] = n_[2];
-        sizes_[3] = npr_;
-    }
-    else
-    {
-        sizes_[0] = n_[1];
-        sizes_[1] = n_[0];
-        sizes_[2] = npc_;
-        sizes_[3] = npc_;
-    }
-
-#else //// i.e. ifdef USE_MPI ////////////////////////////////////////////////////////////////////////////////////
-
-    size_t cmplxsz;
-
-    if (typeid(data_t) == typeid(real_t))
-    {
-        cmplxsz = FFTW_API(mpi_local_size_3d_transposed)(n_[0], n_[1], n_[2] / 2 + 1, MPI_COMM_WORLD,
-                                                         &local_0_size_, &local_0_start_, &local_1_size_, &local_1_start_);
-        ntot_ = 2 * cmplxsz;
-        data_ = (data_t *)fftw_malloc(ntot_ * sizeof(real_t));
-        cdata_ = reinterpret_cast<ccomplex_t *>(data_);
-        plan_ = FFTW_API(mpi_plan_dft_r2c_3d)(n_[0], n_[1], n_[2], (real_t *)data_, (complex_t *)data_,
-                                              MPI_COMM_WORLD, FFTW_RUNMODE | FFTW_MPI_TRANSPOSED_OUT);
-        iplan_ = FFTW_API(mpi_plan_dft_c2r_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (real_t *)data_,
-                                               MPI_COMM_WORLD, FFTW_RUNMODE | FFTW_MPI_TRANSPOSED_IN);
-    }
-    else if (typeid(data_t) == typeid(ccomplex_t))
-    {
-        cmplxsz = FFTW_API(mpi_local_size_3d_transposed)(n_[0], n_[1], n_[2], MPI_COMM_WORLD,
-                                                         &local_0_size_, &local_0_start_, &local_1_size_, &local_1_start_);
-        ntot_ = cmplxsz;
-        data_ = (data_t *)fftw_malloc(ntot_ * sizeof(ccomplex_t));
-        cdata_ = reinterpret_cast<ccomplex_t *>(data_);
-        plan_ = FFTW_API(mpi_plan_dft_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (complex_t *)data_,
-                                          MPI_COMM_WORLD, FFTW_FORWARD, FFTW_RUNMODE | FFTW_MPI_TRANSPOSED_OUT);
-        iplan_ = FFTW_API(mpi_plan_dft_3d)(n_[0], n_[1], n_[2], (complex_t *)data_, (complex_t *)data_,
-                                           MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_RUNMODE | FFTW_MPI_TRANSPOSED_IN);
-    }
-    else
-    {
-        csoca::elog.Print("unknown data type in Grid_FFT<data_t>::setup_fft_interface\n");
-        abort();
-    }
-
-    csoca::dlog.Print("[FFT] Setting up a distributed memory field %lux%lux%lu\n", n_[0], n_[1], n_[2]);
-    fft_norm_fac_ = 1.0 / sqrt((double)n_[0] * (double)n_[1] * (double)n_[2]);
-
-    if (typeid(data_t) == typeid(real_t))
-    {
-        npr_ = n_[2] + 2;
-        npc_ = n_[2] / 2 + 1;
-    }
-    else
-    {
-        npr_ = n_[2];
-        npc_ = n_[2];
-    }
-
-    for (int i = 0; i < 3; ++i)
-    {
-        nhalf_[i] = n_[i] / 2;
-        kfac_[i] = 2.0 * M_PI / length_[i];
-        dx_[i] = length_[i] / n_[i];
-
-        global_range_.x1_[i] = 0;
-        global_range_.x2_[i] = n_[i];
-    }
-    global_range_.x1_[0] = (int)local_0_start_;
-    global_range_.x2_[0] = (int)(local_0_start_ + local_0_size_);
-
-    if (space_ == rspace_id)
-    {
-        sizes_[0] = (int)local_0_size_;
-        sizes_[1] = n_[1];
-        sizes_[2] = n_[2];
-        sizes_[3] = npr_; // holds the physical memory size along the 3rd dimension
-    }
-    else
-    {
-        sizes_[0] = (int)local_1_size_;
-        sizes_[1] = n_[0];
-        sizes_[2] = npc_;
-        sizes_[3] = npc_; // holds the physical memory size along the 3rd dimension
-    }
+            global_range_.x1_[i] = 0;
+            global_range_.x2_[i] = n_[i];
+        }
+        global_range_.x1_[0] = (int)local_0_start_;
+        global_range_.x2_[0] = (int)(local_0_start_ + local_0_size_);

+        if (space_ == rspace_id)
+        {
+            sizes_[0] = (int)local_0_size_;
+            sizes_[1] = n_[1];
+            sizes_[2] = n_[2];
+            sizes_[3] = npr_; // holds the physical memory size along the 3rd dimension
+        }
+        else
+        {
+            sizes_[0] = (int)local_1_size_;
+            sizes_[1] = n_[0];
+            sizes_[2] = npc_;
+            sizes_[3] = npc_; // holds the physical memory size along the 3rd dimension
+        }
+#else
+        music::flog << "MPI is required for distributed FFT arrays!" << std::endl;
+        throw std::runtime_error("MPI is required for distributed FFT arrays!");
 #endif //// of #ifdef #else USE_MPI ////////////////////////////////////////////////////////////////////////////////////
+    }
 }

-template <typename data_t>
-void Grid_FFT<data_t>::ApplyNorm(void)
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::ApplyNorm(void)
 {
 #pragma omp parallel for
    for (size_t i = 0; i < ntot_; ++i)
        data_[i] *= fft_norm_fac_;
 }

-template <typename data_t>
-void Grid_FFT<data_t>::FourierTransformForward(bool do_transform)
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::FourierTransformForward(bool do_transform)
 {
 #if defined(USE_MPI)
    MPI_Barrier(MPI_COMM_WORLD);
@ -199,12 +180,13 @@ void Grid_FFT<data_t>::FourierTransformForward(bool do_transform)
        if (do_transform)
        {
            double wtime = get_wtime();
-            csoca::dlog.Print("[FFT] Calling Grid_FFT::to_kspace (%lux%lux%lu)", sizes_[0], sizes_[1], sizes_[2]);
-            FFTW_API(execute)(plan_);
+            music::dlog.Print("[FFT] Calling Grid_FFT::to_kspace (%lux%lux%lu)", sizes_[0], sizes_[1], sizes_[2]);
+            FFTW_API(execute)
+            (plan_);
            this->ApplyNorm();

            wtime = get_wtime() - wtime;
-            csoca::dlog.Print("[FFT] Completed Grid_FFT::to_kspace (%lux%lux%lu), took %f s", sizes_[0], sizes_[1], sizes_[2], wtime);
+            music::dlog.Print("[FFT] Completed Grid_FFT::to_kspace (%lux%lux%lu), took %f s", sizes_[0], sizes_[1], sizes_[2], wtime);
        }

        sizes_[0] = local_1_size_;
@ -217,8 +199,8 @@ void Grid_FFT<data_t>::FourierTransformForward(bool do_transform)
    }
 }

-template <typename data_t>
-void Grid_FFT<data_t>::FourierTransformBackward(bool do_transform)
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::FourierTransformBackward(bool do_transform)
 {
 #if defined(USE_MPI)
    MPI_Barrier(MPI_COMM_WORLD);
@ -229,14 +211,14 @@ void Grid_FFT<data_t>::FourierTransformBackward(bool do_transform)
        //.............................
        if (do_transform)
        {
-            csoca::dlog.Print("[FFT] Calling Grid_FFT::to_rspace (%dx%dx%d)\n", sizes_[0], sizes_[1], sizes_[2]);
+            music::dlog.Print("[FFT] Calling Grid_FFT::to_rspace (%dx%dx%d)\n", sizes_[0], sizes_[1], sizes_[2]);
            double wtime = get_wtime();

            FFTW_API(execute)(iplan_);
            this->ApplyNorm();

            wtime = get_wtime() - wtime;
-            csoca::dlog.Print("[FFT] Completed Grid_FFT::to_rspace (%dx%dx%d), took %f s\n", sizes_[0], sizes_[1], sizes_[2], wtime);
+            music::dlog.Print("[FFT] Completed Grid_FFT::to_rspace (%dx%dx%d), took %f s\n", sizes_[0], sizes_[1], sizes_[2], wtime);
        }
        sizes_[0] = local_0_size_;
        sizes_[1] = n_[1];
@ -269,9 +251,293 @@ void create_hdf5(std::string Filename)
    H5Fclose(HDF_FileID);
 }

-template <typename data_t>
-void Grid_FFT<data_t>::Write_to_HDF5(std::string fname, std::string datasetname) const
+//! Maps a C++ arithmetic type to the corresponding native HDF5 datatype identifier
+/*!
+ * @tparam T  C++ type to look up (int, unsigned, long, unsigned long, long long,
+ *            unsigned long long, size_t, float, double, long double)
+ * @return    the matching H5T_NATIVE_* identifier, or -1 after logging an error
+ *            when T is not one of the supported types (e.g. a complex type)
+ */
+template <typename T>
+hid_t hdf5_get_data_type(void)
+{
+    const std::type_info &tid = typeid(T);
+
+    //... signed / unsigned integer types
+    if (tid == typeid(int))
+        return H5T_NATIVE_INT;
+
+    if (tid == typeid(unsigned))
+        return H5T_NATIVE_UINT;
+
+    if (tid == typeid(long))
+        return H5T_NATIVE_LONG;
+
+    if (tid == typeid(unsigned long))
+        return H5T_NATIVE_ULONG;
+
+    if (tid == typeid(long long))
+        return H5T_NATIVE_LLONG;
+
+    if (tid == typeid(unsigned long long))
+        return H5T_NATIVE_ULLONG;
+
+    //... floating point types
+    if (tid == typeid(float))
+        return H5T_NATIVE_FLOAT;
+
+    if (tid == typeid(double))
+        return H5T_NATIVE_DOUBLE;
+
+    if (tid == typeid(long double))
+        return H5T_NATIVE_LDOUBLE;
+
+    //... kept from the previous mapping; size_t normally aliases one of the unsigned types handled above
+    if (tid == typeid(size_t))
+        return H5T_NATIVE_ULLONG;
+
+    //... terminate the log line like the other elog calls in this file do
+    music::elog << "[HDF_IO] trying to evaluate unsupported type in GetDataType" << std::endl;
+    return -1;
+}
+
+//! Reads a 3D dataset from an HDF5 file into this grid and rescales it by its standard deviation
+/*!
+ * Only available for non-distributed grids (bdistributed == false). The grid is resized to the
+ * dimensions of the dataset, switched to real space, re-initialised through Setup() and filled
+ * with the file data, which is finally divided by its standard deviation.
+ *
+ * Every failure (distributed grid, unsupported element type, file or dataset cannot be opened,
+ * dataset is not three-dimensional, read error) is logged and terminates the program via abort().
+ *
+ * @param Filename  path of the HDF5 file to read
+ * @param ObjName   name of the dataset inside the file
+ */
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::Read_from_HDF5(const std::string Filename, const std::string ObjName)
+{
+    if (bdistributed)
+    {
+        music::elog << "Attempt to read from HDF5 into MPI-distributed array. This is not supported yet!" << std::endl;
+        abort();
+    }
+
+    //... hdf5_get_data_type returns a negative id for types it cannot map; do not hand that to H5Dread
+    hid_t HDF_Type = hdf5_get_data_type<data_t>();
+    if (HDF_Type < 0)
+    {
+        music::elog << "Element type of this grid cannot be read from HDF5 dataset \'" << ObjName.c_str() << "\'." << std::endl;
+        abort();
+    }
+
+    hid_t HDF_FileID = H5Fopen(Filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
+    if (HDF_FileID < 0)
+    {
+        music::elog << "Could not open HDF5 file \'" << Filename.c_str() << "\' for reading." << std::endl;
+        abort();
+    }
+
+    //... save old error handler
+    herr_t (*old_func)(void *);
+    void *old_client_data;
+
+    H5Eget_auto(&old_func, &old_client_data);
+
+    //... turn off error handling by hdf5 library
+    H5Eset_auto(NULL, NULL);
+
+    //... probe dataset opening
+    hid_t HDF_DatasetID = H5Dopen(HDF_FileID, ObjName.c_str());
+
+    //... restore previous error handler
+    H5Eset_auto(old_func, old_client_data);
+
+    //... dataset did not exist or was empty
+    if (HDF_DatasetID < 0)
+    {
+        music::elog << "Dataset \'" << ObjName.c_str() << "\' does not exist or is empty." << std::endl;
+        H5Fclose(HDF_FileID);
+        abort();
+    }
+
+    //... get space associated with dataset and its extensions
+    hid_t HDF_DataspaceID = H5Dget_space(HDF_DatasetID);
+
+    const int ndims = H5Sget_simple_extent_ndims(HDF_DataspaceID);
+
+    //... dimsize below holds exactly three extents: more dimensions would overflow it,
+    //... fewer would leave entries uninitialised that are used for the grid size further down
+    if (ndims != 3)
+    {
+        music::elog << "Dataset \'" << ObjName.c_str() << "\' has " << ndims << " dimensions, but a 3D dataset is required." << std::endl;
+        H5Sclose(HDF_DataspaceID);
+        H5Dclose(HDF_DatasetID);
+        H5Fclose(HDF_FileID);
+        abort();
+    }
+
+    hsize_t dimsize[3];
+
+    H5Sget_simple_extent_dims(HDF_DataspaceID, dimsize, NULL);
+
+    hsize_t HDF_StorageSize = 1;
+    for (int i = 0; i < ndims; ++i)
+        HDF_StorageSize *= dimsize[i];
+
+    //... temporary buffer for the whole dataset (std::vector throws if the allocation fails)
+    std::vector<data_t> Data(HDF_StorageSize, (data_t)0);
+
+    //... read the dataset
+    if (H5Dread(HDF_DatasetID, HDF_Type, H5S_ALL, H5S_ALL, H5P_DEFAULT, Data.data()) < 0)
+    {
+        music::elog << "Something went wrong while reading!" << std::endl;
+        H5Sclose(HDF_DataspaceID);
+        H5Dclose(HDF_DatasetID);
+        H5Fclose(HDF_FileID);
+        abort();
+    }
+
+    H5Sclose(HDF_DataspaceID);
+    H5Dclose(HDF_DatasetID);
+    H5Fclose(HDF_FileID);
+
+    assert(dimsize[0] == dimsize[1] && dimsize[0] == dimsize[2]);
+    music::ilog << "Read external constraint data of dimensions " << dimsize[0] << "**3." << std::endl;
+
+    //... resize this grid to the dataset and rebuild buffers/plans in real space
+    for (size_t i = 0; i < 3; ++i)
+        this->n_[i] = dimsize[i];
+    this->space_ = rspace_id;
+
+    // NOTE(review): only the data buffer is released here; whether previously created FFTW plans
+    // need to be destroyed before Setup() creates new ones should be confirmed against the class.
+    if (data_ != nullptr)
+    {
+        fftw_free(data_);
+    }
+    this->Setup();
+
+    //... copy data to internal array and accumulate first and second moments ...
+    real_t sum1{0.0}, sum2{0.0};
+    #pragma omp parallel for reduction(+ : sum1, sum2)
+    for (size_t i = 0; i < size(0); ++i)
+    {
+        for (size_t j = 0; j < size(1); ++j)
+        {
+            for (size_t k = 0; k < size(2); ++k)
+            {
+                this->relem(i, j, k) = Data[(i * size(1) + j) * size(2) + k];
+                sum2 += std::real(this->relem(i, j, k) * this->relem(i, j, k));
+                sum1 += std::real(this->relem(i, j, k));
+            }
+        }
+    }
+    sum1 /= Data.size();
+    sum2 /= Data.size();
+
+    //... stdw is the standard deviation (square root of the variance), so label it as such
+    // NOTE(review): a constant input field gives stdw == 0 and the division below is then not guarded.
+    auto stdw = std::sqrt(sum2 - sum1 * sum1);
+    music::ilog << "Constraint field has <W>=" << sum1 << ", sqrt(<W^2>-<W>^2)=" << stdw << std::endl;
+
+    //... normalise to unit standard deviation (pure element-wise update, no reduction needed)
+    #pragma omp parallel for
+    for (size_t i = 0; i < size(0); ++i)
+    {
+        for (size_t j = 0; j < size(1); ++j)
+        {
+            for (size_t k = 0; k < size(2); ++k)
+            {
+                this->relem(i, j, k) /= stdw;
+            }
+        }
+    }
+}
+
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::Write_to_HDF5(std::string fname, std::string datasetname) const
+{
+    // FIXME: cleanup duplicate code in this function!
+    if (!bdistributed && CONFIG::MPI_task_rank == 0)
+    {
+
+        hid_t file_id, dset_id;    /* file and dataset identifiers */
+        hid_t filespace, memspace; /* file and memory dataspace identifiers */
+        hsize_t offset[3], count[3];
+        hid_t dtype_id = H5T_NATIVE_FLOAT;
+        hid_t plist_id = H5P_DEFAULT;
+
+        if (!file_exists(fname))
+            create_hdf5(fname);
+
+        file_id = H5Fopen(fname.c_str(), H5F_ACC_RDWR, plist_id);
+
+        for (int i = 0; i < 3; ++i)
+            count[i] = size(i);
+
+        if (typeid(data_t) == typeid(float))
+            dtype_id = H5T_NATIVE_FLOAT;
+        else if (typeid(data_t) == typeid(double))
+            dtype_id = H5T_NATIVE_DOUBLE;
+        else if (typeid(data_t) == typeid(long double))
+            dtype_id = H5T_NATIVE_LDOUBLE;    
+        else if (typeid(data_t) == typeid(std::complex<float>))
+            dtype_id = H5T_NATIVE_FLOAT;
+        else if (typeid(data_t) == typeid(std::complex<double>))
+            dtype_id = H5T_NATIVE_DOUBLE;
+        else if (typeid(data_t) == typeid(std::complex<long double>))
+            dtype_id = H5T_NATIVE_LDOUBLE;
+
+        filespace = H5Screate_simple(3, count, NULL);
+        dset_id = H5Dcreate2(file_id, datasetname.c_str(), dtype_id, filespace,
+                             H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+        H5Sclose(filespace);
+
+        hsize_t slice_sz = size(1) * size(2);
+
+        real_t *buf = new real_t[slice_sz];
+
+        count[0] = 1;
+        count[1] = size(1);
+        count[2] = size(2);
+
+        offset[1] = 0;
+        offset[2] = 0;
+
+        memspace = H5Screate_simple(3, count, NULL);
+        filespace = H5Dget_space(dset_id);
+
+        for (size_t i = 0; i < size(0); ++i)
+        {
+            offset[0] = i;
+            for (size_t j = 0; j < size(1); ++j)
+            {
+                for (size_t k = 0; k < size(2); ++k)
+                {
+                    if (this->space_ == rspace_id)
+                        buf[j * size(2) + k] = std::real(relem(i, j, k));
+                    else
+                        buf[j * size(2) + k] = std::real(kelem(i, j, k));
+                }
+            }
+
+            H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
+            H5Dwrite(dset_id, dtype_id, memspace, filespace, H5P_DEFAULT, buf);
+        }
+
+        H5Sclose(filespace);
+        H5Sclose(memspace);
+
+        // H5Sclose(filespace);
+        H5Dclose(dset_id);
+
+        if (typeid(data_t) == typeid(std::complex<float>) ||
+            typeid(data_t) == typeid(std::complex<double>) ||
+            typeid(data_t) == typeid(std::complex<long double>) ||
+            this->space_ == kspace_id)
+        {
+            datasetname += std::string(".im");
+
+            for (int i = 0; i < 3; ++i)
+                count[i] = size(i);
+
+            filespace = H5Screate_simple(3, count, NULL);
+            dset_id = H5Dcreate2(file_id, datasetname.c_str(), dtype_id, filespace,
+                                 H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+            H5Sclose(filespace);
+
+            count[0] = 1;
+
+            for (size_t i = 0; i < size(0); ++i)
+            {
+                offset[0] = i;
+
+                for (size_t j = 0; j < size(1); ++j)
+                    for (size_t k = 0; k < size(2); ++k)
+                    {
+                        if (this->space_ == rspace_id)
+                            buf[j * size(2) + k] = std::imag(relem(i, j, k));
+                        else
+                            buf[j * size(2) + k] = std::imag(kelem(i, j, k));
+                    }
+
+                memspace = H5Screate_simple(3, count, NULL);
+                filespace = H5Dget_space(dset_id);
+
+                H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count,
+                                    NULL);
+
+                H5Dwrite(dset_id, dtype_id, memspace, filespace, H5P_DEFAULT, buf);
+
+                H5Sclose(memspace);
+                H5Sclose(filespace);
+            }
+
+            H5Dclose(dset_id);
+        }
+
+        // buf is allocated unconditionally earlier in this branch, so it must be released on every
+        // path, not only when the imaginary-part dataset was written
+        delete[] buf;
+
+        H5Fclose(file_id);
+        return;
+    }
+
+    if (!bdistributed && CONFIG::MPI_task_rank != 0)
+        return;
+
    hid_t file_id, dset_id;    /* file and dataset identifiers */
    hid_t filespace, memspace; /* file and memory dataspace identifiers */
    hsize_t offset[3], count[3];
@ -282,8 +548,8 @@ void Grid_FFT<data_t>::Write_to_HDF5(std::string fname, std::string datasetname)

    int mpi_size, mpi_rank;

-    mpi_size = MPI_Get_size();
-    mpi_rank = MPI_Get_rank();
+    mpi_size = MPI::get_size();
+    mpi_rank = MPI::get_rank();

    if (!file_exists(fname) && mpi_rank == 0)
        create_hdf5(fname);
@ -329,14 +595,14 @@ void Grid_FFT<data_t>::Write_to_HDF5(std::string fname, std::string datasetname)
            dtype_id = H5T_NATIVE_FLOAT;
        else if (typeid(data_t) == typeid(double))
            dtype_id = H5T_NATIVE_DOUBLE;
+        else if (typeid(data_t) == typeid(long double))
+            dtype_id = H5T_NATIVE_LDOUBLE;
        else if (typeid(data_t) == typeid(std::complex<float>))
-        {
            dtype_id = H5T_NATIVE_FLOAT;
-        }
        else if (typeid(data_t) == typeid(std::complex<double>))
-        {
            dtype_id = H5T_NATIVE_DOUBLE;
-        }
+        else if (typeid(data_t) == typeid(std::complex<long double>))
+            dtype_id = H5T_NATIVE_LDOUBLE;

 #if defined(USE_MPI) && !defined(USE_MPI_IO)
        if (itask == 0)
@ -391,7 +657,10 @@ void Grid_FFT<data_t>::Write_to_HDF5(std::string fname, std::string datasetname)
            {
                for (size_t k = 0; k < size(2); ++k)
                {
-                    buf[j * size(2) + k] = std::real(relem(i, j, k));
+                    if (this->space_ == rspace_id)
+                        buf[j * size(2) + k] = std::real(relem(i, j, k));
+                    else
+                        buf[j * size(2) + k] = std::real(kelem(i, j, k));
                }
            }

@ -410,7 +679,9 @@ void Grid_FFT<data_t>::Write_to_HDF5(std::string fname, std::string datasetname)
        H5Dclose(dset_id);

        if (typeid(data_t) == typeid(std::complex<float>) ||
-            typeid(data_t) == typeid(std::complex<double>))
+            typeid(data_t) == typeid(std::complex<double>) ||
+            typeid(data_t) == typeid(std::complex<long double>) ||
+            this->space_ == kspace_id)
        {
            datasetname += std::string(".im");

@ -460,7 +731,10 @@ void Grid_FFT<data_t>::Write_to_HDF5(std::string fname, std::string datasetname)
                for (size_t j = 0; j < size(1); ++j)
                    for (size_t k = 0; k < size(2); ++k)
                    {
-                        buf[j * size(2) + k] = std::imag(relem(i, j, k));
+                        if (this->space_ == rspace_id)
+                            buf[j * size(2) + k] = std::imag(relem(i, j, k));
+                        else
+                            buf[j * size(2) + k] = std::imag(kelem(i, j, k));
                    }

                memspace = H5Screate_simple(3, count, NULL);
@ -493,8 +767,8 @@ void Grid_FFT<data_t>::Write_to_HDF5(std::string fname, std::string datasetname)

 #include <iomanip>

-template <typename data_t>
-void Grid_FFT<data_t>::Write_PDF(std::string ofname, int nbins, double scale, double vmin, double vmax)
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::Write_PDF(std::string ofname, int nbins, double scale, double vmin, double vmax)
 {
    double logvmin = std::log10(vmin);
    double logvmax = std::log10(vmax);
@ -545,13 +819,12 @@ void Grid_FFT<data_t>::Write_PDF(std::string ofname, int nbins, double scale, do
 #endif
 }

-template <typename data_t>
-void Grid_FFT<data_t>::Write_PowerSpectrum(std::string ofname)
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::Write_PowerSpectrum(std::string ofname)
 {
    std::vector<double> bin_k, bin_P, bin_eP;
    std::vector<size_t> bin_count;
-    int nbins = 4 * std::max(nhalf_[0], std::max(nhalf_[1], nhalf_[2]));
-    this->Compute_PowerSpectrum(bin_k, bin_P, bin_eP, bin_count );
+    this->Compute_PowerSpectrum(bin_k, bin_P, bin_eP, bin_count);
 #if defined(USE_MPI)
    if (CONFIG::MPI_task_rank == 0)
    {
@ -576,8 +849,8 @@ void Grid_FFT<data_t>::Write_PowerSpectrum(std::string ofname)
 #endif
 }

-template <typename data_t>
-void Grid_FFT<data_t>::Compute_PowerSpectrum(std::vector<double> &bin_k, std::vector<double> &bin_P, std::vector<double> &bin_eP, std::vector<size_t> &bin_count )
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::Compute_PowerSpectrum(std::vector<double> &bin_k, std::vector<double> &bin_P, std::vector<double> &bin_eP, std::vector<size_t> &bin_count)
 {
    this->FourierTransformForward();

@ -597,7 +870,7 @@ void Grid_FFT<data_t>::Compute_PowerSpectrum(std::vector<double> &bin_k, std::ve
        for (size_t iy = 0; iy < size(1); iy++)
            for (size_t iz = 0; iz < size(2); iz++)
            {
-                vec3<double> k3 = get_k<double>(ix, iy, iz);
+                vec3_t<double> k3 = get_k<double>(ix, iy, iz);
                double k = k3.norm();
                int idx2 = k / dk; //int((1.0f / dklog * std::log10(k / kmin)));
                auto z = this->kelem(ix, iy, iz);
@ -657,5 +930,7 @@ void Grid_FFT<data_t>::Compute_PowerSpectrum(std::vector<double> &bin_k, std::ve

 /********************************************************************************************/

-template class Grid_FFT<real_t>;
-template class Grid_FFT<ccomplex_t>;
+template class Grid_FFT<real_t, true>; // explicit instantiations: both element types, each with bdistributed = true and false
+template class Grid_FFT<real_t, false>;
+template class Grid_FFT<ccomplex_t, true>;
+template class Grid_FFT<ccomplex_t, false>;
--- a/src/ic_generator.cc
+++ b/src/ic_generator.cc
@ -7,6 +7,7 @@

 #include <ic_generator.hh>
 #include <particle_generator.hh>
+#include <particle_plt.hh>

 #include <unistd.h> // for unlink

@ -21,18 +22,18 @@ namespace ic_generator{

 std::unique_ptr<RNG_plugin> the_random_number_generator;
 std::unique_ptr<output_plugin> the_output_plugin;
-std::unique_ptr<CosmologyCalculator>  the_cosmo_calc;
+std::unique_ptr<cosmology::calculator>  the_cosmo_calc;

-int Initialise( ConfigFile& the_config )
+int Initialise( config_file& the_config ) // sets up the three module-level objects (RNG plugin, output plugin, cosmology calculator) from the_config
 {
    the_random_number_generator = std::move(select_RNG_plugin(the_config));
    the_output_plugin           = std::move(select_output_plugin(the_config));
-    the_cosmo_calc              = std::make_unique<CosmologyCalculator>(the_config);
+    the_cosmo_calc              = std::make_unique<cosmology::calculator>(the_config);

    return 0; // no failure path inside this function: it always returns 0
 }

-int Run( ConfigFile& the_config )
+int Run( config_file& the_config )
 {
    //--------------------------------------------------------------------------------------------------------
    // Read run parameters
@ -40,56 +41,75 @@ int Run( ConfigFile& the_config )

    //--------------------------------------------------------------------------------------------------------
    //! number of resolution elements per dimension
-    const size_t ngrid = the_config.GetValue<size_t>("setup", "GridRes");
+    const size_t ngrid = the_config.get_value<size_t>("setup", "GridRes");

    //--------------------------------------------------------------------------------------------------------
    //! box side length in h-1 Mpc
-    const real_t boxlen = the_config.GetValue<double>("setup", "BoxLength");
+    const real_t boxlen = the_config.get_value<double>("setup", "BoxLength");

    //--------------------------------------------------------------------------------------------------------
    //! starting redshift
-    const real_t zstart = the_config.GetValue<double>("setup", "zstart");
+    const real_t zstart = the_config.get_value<double>("setup", "zstart");

    //--------------------------------------------------------------------------------------------------------
    //! order of the LPT approximation 
-    int LPTorder = the_config.GetValueSafe<double>("setup","LPTorder",100);
+    int LPTorder = the_config.get_value_safe<double>("setup","LPTorder",100);

    //--------------------------------------------------------------------------------------------------------
    //! initialise particles on a bcc, fcc, rsc or glass lattice instead of the standard sc lattice (bcc doubles and fcc quadruples the number of particles) 
-    std::string lattice_str = the_config.GetValueSafe<std::string>("setup","ParticleLoad","sc");
-    const particle::lattice lattice_type = (lattice_str=="bcc")? particle::lattice_bcc 
-        : ((lattice_str=="fcc")? particle::lattice_fcc : particle::lattice_sc);
+    std::string lattice_str = the_config.get_value_safe<std::string>("setup","ParticleLoad","sc");
+    const particle::lattice lattice_type = 
+          ((lattice_str=="bcc")? particle::lattice_bcc 
+        : ((lattice_str=="fcc")? particle::lattice_fcc 
+        : ((lattice_str=="rsc")? particle::lattice_rsc 
+        : ((lattice_str=="glass")? particle::lattice_glass
+        : particle::lattice_sc))));

    //--------------------------------------------------------------------------------------------------------
    //! apply fixing of the complex mode amplitude following Angulo & Pontzen (2016) [https://arxiv.org/abs/1603.05253]
-    const bool bDoFixing = the_config.GetValueSafe<bool>("setup", "DoFixing", false);
+    const bool bDoFixing = the_config.get_value_safe<bool>("setup", "DoFixing", false);

    //--------------------------------------------------------------------------------------------------------
    //! do baryon ICs?
-    const bool bDoBaryons = the_config.GetValueSafe<bool>("setup", "DoBaryons", false );
+    const bool bDoBaryons = the_config.get_value_safe<bool>("setup", "DoBaryons", false );
+    std::map< cosmo_species, double > Omega;
+    if( bDoBaryons ){
+        double Om = the_config.get_value<double>("cosmology", "Omega_m");
+        double Ob = the_config.get_value<double>("cosmology", "Omega_b");
+        Omega[cosmo_species::dm] = Om-Ob;
+        Omega[cosmo_species::baryon] = Ob;
+    }else{
+        double Om = the_config.get_value<double>("cosmology", "Omega_m");
+        Omega[cosmo_species::dm] = Om;
+        Omega[cosmo_species::baryon] = 0.0;
+    }
+
+    //--------------------------------------------------------------------------------------------------------
+    //! do constrained ICs?
+    const bool bAddConstrainedModes =  the_config.contains_key("setup", "ConstraintFieldFile" );

    //--------------------------------------------------------------------------------------------------------
    //! add beyond box tidal field modes following Schmidt et al. (2018) [https://arxiv.org/abs/1803.03274]
-    bool bAddExternalTides = the_config.ContainsKey("cosmology", "LSS_aniso_lx") 
-                           & the_config.ContainsKey("cosmology", "LSS_aniso_ly") 
-                           & the_config.ContainsKey("cosmology", "LSS_aniso_lz");
+    bool bAddExternalTides = the_config.contains_key("cosmology", "LSS_aniso_lx") // enabled only if all three components are present
+                          && the_config.contains_key("cosmology", "LSS_aniso_ly")
+                          && the_config.contains_key("cosmology", "LSS_aniso_lz");

-    if( bAddExternalTides && !(  the_config.ContainsKey("cosmology", "LSS_aniso_lx") 
-                               | the_config.ContainsKey("cosmology", "LSS_aniso_ly") 
-                               | the_config.ContainsKey("cosmology", "LSS_aniso_lz") ))
+    if( !bAddExternalTides && (  the_config.contains_key("cosmology", "LSS_aniso_lx") // some, but not all, components were given: report it
+                              || the_config.contains_key("cosmology", "LSS_aniso_ly")
+                              || the_config.contains_key("cosmology", "LSS_aniso_lz") ))
    {
-        csoca::elog << "Not all dimensions of LSS_aniso_l{x,y,z} specified! Will ignore external tidal field!" << std::endl;
+        music::elog << "Not all dimensions of LSS_aniso_l{x,y,z} specified! Will ignore external tidal field!" << std::endl;
        bAddExternalTides = false;
    }
    // Anisotropy parameters for beyond box tidal field 
    std::array<real_t,3> lss_aniso_lambda = {
-        the_config.GetValueSafe<double>("cosmology", "LSS_aniso_lx", 0.0),
-        the_config.GetValueSafe<double>("cosmology", "LSS_aniso_ly", 0.0),
-        the_config.GetValueSafe<double>("cosmology", "LSS_aniso_lz", 0.0),
+        the_config.get_value_safe<double>("cosmology", "LSS_aniso_lx", 0.0),
+        the_config.get_value_safe<double>("cosmology", "LSS_aniso_ly", 0.0),
+        the_config.get_value_safe<double>("cosmology", "LSS_aniso_lz", 0.0),
    };  
    
    if( std::abs(lss_aniso_lambda[0]+lss_aniso_lambda[1]+lss_aniso_lambda[2]) > 1e-10 ){
-        csoca::elog << "External tidal field is not trace-free! Will subtract trace!" << std::endl;
+        music::elog << "External tidal field is not trace-free! Will subtract trace!" << std::endl;
        auto tr_l_3 = (lss_aniso_lambda[0]+lss_aniso_lambda[1]+lss_aniso_lambda[2])/3.0;
        lss_aniso_lambda[0] -= tr_l_3;
        lss_aniso_lambda[1] -= tr_l_3;
@ -101,20 +121,20 @@ int Run( ConfigFile& the_config )
    const real_t astart = 1.0/(1.0+zstart);
    const real_t volfac(std::pow(boxlen / ngrid / 2.0 / M_PI, 1.5));

-    the_cosmo_calc->WritePowerspectrum(astart, "input_powerspec.txt" );
+    the_cosmo_calc->write_powerspectrum(astart, "input_powerspec.txt" );

-    //csoca::ilog << "-----------------------------------------------------------------------------" << std::endl;
+    //music::ilog << "-----------------------------------------------------------------------------" << std::endl;

    // if( bSymplecticPT && LPTorder!=2 ){
-    //     csoca::wlog << "SymplecticPT has been selected and will overwrite chosen order of LPT to 2" << std::endl;
+    //     music::wlog << "SymplecticPT has been selected and will overwrite chosen order of LPT to 2" << std::endl;
    //     LPTorder = 2;
    // }

    //--------------------------------------------------------------------
    // Compute LPT time coefficients
    //--------------------------------------------------------------------
-    const real_t Dplus0 = the_cosmo_calc->CalcGrowthFactor(astart) / the_cosmo_calc->CalcGrowthFactor(1.0);
-    const real_t vfac   = the_cosmo_calc->CalcVFact(astart);
+    const real_t Dplus0 = the_cosmo_calc->get_growth_factor(astart);
+    const real_t vfac   = the_cosmo_calc->get_vfact(astart);

    const double g1  = -Dplus0;
    const double g2  = ((LPTorder>1)? -3.0/7.0*Dplus0*Dplus0 : 0.0);
@ -132,7 +152,7 @@ int Run( ConfigFile& the_config )
    // coefficients needed for anisotropic external tides
    const double ai3 = std::pow(astart,-3);
    const double Omega_m_of_a = the_cosmo_calc->cosmo_param_.Omega_m * ai3 / (the_cosmo_calc->cosmo_param_.Omega_m * ai3 + the_cosmo_calc->cosmo_param_.Omega_DE);
-    const double f1 = the_cosmo_calc->CalcGrowthRate(astart);
+    const double f1 = the_cosmo_calc->get_f(astart);
    const double f_aniso = -4.0/3.0 * f1 * f1 / Omega_m_of_a;

    const std::array<real_t,3> lss_aniso_alpha = {
@ -151,200 +171,300 @@ int Run( ConfigFile& the_config )
    Grid_FFT<real_t> A3x({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
    Grid_FFT<real_t> A3y({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
    Grid_FFT<real_t> A3z({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
+
    //... array [.] access to components of A3:
-    std::array< Grid_FFT<real_t>*,3 > A3({&A3x,&A3y,&A3z});
+    std::array<Grid_FFT<real_t> *, 3> A3({&A3x, &A3y, &A3z});
+
+    // white noise field 
+    Grid_FFT<real_t> wnoise({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
+    
+    //--------------------------------------------------------------------
+    // Fill the grid with a Gaussian white noise field
+    //--------------------------------------------------------------------
+    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+    music::ilog << "Generating white noise field...." << std::endl;
+
+    the_random_number_generator->Fill_Grid(wnoise);
+    
+    wnoise.FourierTransformForward();
+
+    //--------------------------------------------------------------------
+    // Use externally specified large scale modes from constraints in case
+    //--------------------------------------------------------------------
+    if( bAddConstrainedModes ){
+        Grid_FFT<real_t,false> cwnoise({8,8,8}, {boxlen,boxlen,boxlen}); // bdistributed = false grid; NOTE(review): 8^3 is hard-coded, presumably Read_from_HDF5 resizes to the dataset - confirm
+        cwnoise.Read_from_HDF5( the_config.get_value<std::string>("setup", "ConstraintFieldFile"), 
+                the_config.get_value<std::string>("setup", "ConstraintFieldName") );
+        cwnoise.FourierTransformForward();
+
+        size_t ngrid_c = cwnoise.size(0), ngrid_c_2 = ngrid_c/2; // constraint-grid extent along axis 0 and its half
+
+        // TODO: copy over modes
+        double rs1{0.0},rs2{0.0},is1{0.0},is2{0.0}; // sums and sums of squares of re/im parts of the wnoise modes before they are overwritten
+        double nrs1{0.0},nrs2{0.0},nis1{0.0},nis2{0.0}; // same sums for the cwnoise modes; all eight feed only the commented-out diagnostics below
+        size_t count{0};
+
+        #pragma omp parallel for reduction(+:rs1,rs2,is1,is2,nrs1,nrs2,nis1,nis2,count)
+        for( size_t i=0; i<ngrid_c; ++i ){
+            size_t il = size_t(-1); // size_t(-1) marks "no matching index in wnoise"
+            if( i<ngrid_c_2 && i<ngrid/2 ) il = i; // lower half of the constraint index range keeps its index
+            if( i>ngrid_c_2 && i+ngrid-ngrid_c>ngrid/2) il = ngrid-ngrid_c+i; // upper half maps to the top end of the wnoise index range; i == ngrid_c_2 matches neither test
+            if( il == size_t(-1) ) continue;
+            if( il<size_t(wnoise.local_1_start_) || il>=size_t(wnoise.local_1_start_+wnoise.local_1_size_)) continue; // skip indices outside [local_1_start_, local_1_start_+local_1_size_)
+            il -= wnoise.local_1_start_; // convert to an index relative to local_1_start_
+            for( size_t j=0; j<ngrid_c; ++j ){
+                size_t jl = size_t(-1);
+                if( j<ngrid_c_2 && j<ngrid/2 ) jl = j; // same mapping as for i, but without the local-range test
+                if( j>ngrid_c_2 && j+ngrid-ngrid_c>ngrid/2 ) jl = ngrid-ngrid_c+j;
+                if( jl == size_t(-1) ) continue;
+                for( size_t k=0; k<ngrid_c/2+1; ++k ){ // last axis runs over ngrid_c/2+1 entries only
+                    if( k>ngrid/2 ) continue;
+                    size_t kl = k;
+                    
+                    ++count;
+
+                    nrs1 += std::real(cwnoise.kelem(i,j,k));
+                    nrs2 += std::real(cwnoise.kelem(i,j,k))*std::real(cwnoise.kelem(i,j,k));
+                    nis1 += std::imag(cwnoise.kelem(i,j,k));
+                    nis2 += std::imag(cwnoise.kelem(i,j,k))*std::imag(cwnoise.kelem(i,j,k));
+
+                    rs1 += std::real(wnoise.kelem(il,jl,kl));
+                    rs2 += std::real(wnoise.kelem(il,jl,kl))*std::real(wnoise.kelem(il,jl,kl));
+                    is1 += std::imag(wnoise.kelem(il,jl,kl));
+                    is2 += std::imag(wnoise.kelem(il,jl,kl))*std::imag(wnoise.kelem(il,jl,kl));
+                    
+                #if defined(USE_MPI)
+                    wnoise.kelem(il,jl,kl) = cwnoise.kelem(j,i,k); // NOTE(review): first two indices swapped in MPI builds, presumably because the distributed k-space layout is transposed - confirm; the statistics above still read (i,j,k)
+                #else
+                    wnoise.kelem(il,jl,kl) = cwnoise.kelem(i,j,k);
+                #endif
+                }
+            }
+        }
+
+        // music::ilog << "  ... old field: re <w>=" << rs1/count << " <w^2>-<w>^2=" << rs2/count-rs1*rs1/count/count << std::endl;
+        // music::ilog << "  ... old field: im <w>=" << is1/count << " <w^2>-<w>^2=" << is2/count-is1*is1/count/count << std::endl;
+        // music::ilog << "  ... new field: re <w>=" << nrs1/count << " <w^2>-<w>^2=" << nrs2/count-nrs1*nrs1/count/count << std::endl;
+        // music::ilog << "  ... new field: im <w>=" << nis1/count << " <w^2>-<w>^2=" << nis2/count-nis1*nis1/count/count << std::endl;
+        music::ilog << "White noise field large-scale modes overwritten with external field." << std::endl;
+    }
+
+    //--------------------------------------------------------------------
+    // Apply Normalisation factor and Angulo&Pontzen fixing or not
+    //--------------------------------------------------------------------
+
+    wnoise.apply_function_k( [&](auto wn){
+        if (bDoFixing)
+            wn = (std::abs(wn) != 0.0) ? wn / std::abs(wn) : wn;
+        return wn / volfac;
+    });
+
+
+    //--------------------------------------------------------------------
+    // Compute the LPT terms....
+    //--------------------------------------------------------------------

    //--------------------------------------------------------------------
    // Create convolution class instance for non-linear terms
    //--------------------------------------------------------------------
+#if defined(USE_CONVOLVER_ORSZAG)
    OrszagConvolver<real_t> Conv({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
-    // NaiveConvolver<real_t> Conv({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
+#elif defined(USE_CONVOLVER_NAIVE)
+    NaiveConvolver<real_t> Conv({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
+#endif
    //--------------------------------------------------------------------

-    std::vector<cosmo_species> species_list;
-    species_list.push_back( cosmo_species::dm );
-    if( bDoBaryons ) species_list.push_back( cosmo_species::baryon );
+    //--------------------------------------------------------------------
+    // Create PLT gradient operator
+    //--------------------------------------------------------------------
+#if defined(ENABLE_PLT)
+    particle::lattice_gradient lg( the_config );
+#else
+    op::fourier_gradient lg( the_config );
+#endif

-    csoca::ilog << "-------------------------------------------------------------------------------" << std::endl;
+    //--------------------------------------------------------------------
+    std::vector<cosmo_species> species_list;
+    species_list.push_back(cosmo_species::dm);
+    if (bDoBaryons)
+        species_list.push_back(cosmo_species::baryon);
+
+    //======================================================================
+    //... compute 1LPT displacement potential ....
+    //======================================================================
+    // phi = - delta / k^2
+
+    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+    music::ilog << "Generating LPT fields...." << std::endl; // the white noise banner was already printed before Fill_Grid; this section computes the LPT potentials
+
+    double wtime = get_wtime(); // reused by the later timing outputs of this function
+    music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(1) term" << std::flush;
+
+    phi.FourierTransformForward(false);
+    phi.assign_function_of_grids_kdep([&](auto k, auto wn) {
+        real_t kmod = k.norm();
+        ccomplex_t delta = wn * the_cosmo_calc->get_amplitude(kmod, total); // white noise mode times the amplitude at |k|
+        return -delta / (kmod * kmod); // divides by zero where kmod == 0; presumably that entry is the one reset by zero_DC_mode() below
+    }, wnoise);
+
+    phi.zero_DC_mode();
+
+    music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+
+    //======================================================================
+    //... compute 2LPT displacement potential ....
+    //======================================================================
+    if (LPTorder > 1)
+    {
+        wtime = get_wtime();
+        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(2) term" << std::flush;
+        phi2.FourierTransformForward(false);
+        Conv.convolve_SumOfHessians(phi, {0, 0}, phi, {1, 1}, {2, 2}, op::assign_to(phi2));
+        Conv.convolve_Hessians(phi, {1, 1}, phi, {2, 2}, op::add_to(phi2));
+        Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 1}, op::subtract_from(phi2));
+        Conv.convolve_Hessians(phi, {0, 2}, phi, {0, 2}, op::subtract_from(phi2));
+        Conv.convolve_Hessians(phi, {1, 2}, phi, {1, 2}, op::subtract_from(phi2));
+
+        if (bAddExternalTides)
+        {
+            phi2.assign_function_of_grids_kdep([&](vec3_t<real_t> kvec, ccomplex_t pphi, ccomplex_t pphi2) {
+                // sign in front of f_aniso is reversed since phi1 = -phi
+                return pphi2 + f_aniso * (kvec[0] * kvec[0] * lss_aniso_lambda[0] + kvec[1] * kvec[1] * lss_aniso_lambda[1] + kvec[2] * kvec[2] * lss_aniso_lambda[2]) * pphi;
+            },
+                                               phi, phi2);
+        }
+
+        phi2.apply_InverseLaplacian();
+        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+
+        if (bAddExternalTides)
+        {
+            music::wlog << "Added external tide contribution to phi(2)... Make sure your N-body code supports this!" << std::endl;
+            music::wlog << " lss_aniso = (" << lss_aniso_lambda[0] << ", " << lss_aniso_lambda[1] << ", " << lss_aniso_lambda[2] << ")" << std::endl;
+        }
+    }
+
+    //======================================================================
+    //... compute 3LPT displacement potential
+    //======================================================================
+    if (LPTorder > 2)
+    {
+        //... 3a term ...
+        wtime = get_wtime();
+        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(3a) term" << std::flush;
+        phi3a.FourierTransformForward(false);
+        Conv.convolve_Hessians(phi, {0, 0}, phi, {1, 1}, phi, {2, 2}, op::assign_to(phi3a));
+        Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 2}, phi, {1, 2}, op::multiply_add_to(phi3a,2.0));
+        Conv.convolve_Hessians(phi, {1, 2}, phi, {1, 2}, phi, {0, 0}, op::subtract_from(phi3a));
+        Conv.convolve_Hessians(phi, {0, 2}, phi, {0, 2}, phi, {1, 1}, op::subtract_from(phi3a));
+        Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 1}, phi, {2, 2}, op::subtract_from(phi3a));
+        phi3a.apply_InverseLaplacian();
+        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+
+        //... 3b term ...
+        wtime = get_wtime();
+        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(3b) term" << std::flush;
+        phi3b.FourierTransformForward(false);
+        Conv.convolve_SumOfHessians(phi, {0, 0}, phi2, {1, 1}, {2, 2}, op::assign_to(phi3b));
+        Conv.convolve_SumOfHessians(phi, {1, 1}, phi2, {2, 2}, {0, 0}, op::add_to(phi3b));
+        Conv.convolve_SumOfHessians(phi, {2, 2}, phi2, {0, 0}, {1, 1}, op::add_to(phi3b));
+        Conv.convolve_Hessians(phi, {0, 1}, phi2, {0, 1}, op::multiply_add_to(phi3b,-2.0));
+        Conv.convolve_Hessians(phi, {0, 2}, phi2, {0, 2}, op::multiply_add_to(phi3b,-2.0));
+        Conv.convolve_Hessians(phi, {1, 2}, phi2, {1, 2}, op::multiply_add_to(phi3b,-2.0));
+        phi3b.apply_InverseLaplacian();
+        phi3b *= 0.5; // factor 1/2 from definition of phi(3b)!
+        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+
+        //... transversal term ...
+        wtime = get_wtime();
+        music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing A(3) term" << std::flush;
+        for (int idim = 0; idim < 3; ++idim)
+        {
+            // cyclic rotations of indices
+            int idimp = (idim + 1) % 3, idimpp = (idim + 2) % 3;
+            A3[idim]->FourierTransformForward(false);
+            Conv.convolve_Hessians(phi2, {idim, idimp}, phi, {idim, idimpp}, op::assign_to(*A3[idim]));
+            Conv.convolve_Hessians(phi2, {idim, idimpp}, phi, {idim, idimp}, op::subtract_from(*A3[idim]));
+            Conv.convolve_DifferenceOfHessians(phi, {idimp, idimpp}, phi2, {idimp, idimp}, {idimpp, idimpp}, op::add_to(*A3[idim]));
+            Conv.convolve_DifferenceOfHessians(phi2, {idimp, idimpp}, phi, {idimp, idimp}, {idimpp, idimpp}, op::subtract_from(*A3[idim]));
+            A3[idim]->apply_InverseLaplacian();
+        }
+        music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+    }
+
+    // if( bSymplecticPT ){
+    //     //... transversal term ...
+    //     wtime = get_wtime();
+    //     music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing vNLO(3) term" << std::flush;
+    //     for( int idim=0; idim<3; ++idim ){
+    //         // cyclic rotations of indices
+    //         A3[idim]->FourierTransformForward(false);
+    //         Conv.convolve_Gradient_and_Hessian( phi, {0},  phi2, {idim,0}, assign_to(*A3[idim]) );
+    //         Conv.convolve_Gradient_and_Hessian( phi, {1},  phi2, {idim,1}, add_to(*A3[idim]) );
+    //         Conv.convolve_Gradient_and_Hessian( phi, {2},  phi2, {idim,2}, add_to(*A3[idim]) );
+    //     }
+    //     music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime()-wtime << "s" << std::endl;
+
+    // }
+
+    ///... scale all potentials with respective growth factors
+    phi *= g1;
+    phi2 *= g2;
+    phi3a *= g3a;
+    phi3b *= g3b;
+    (*A3[0]) *= g3c;
+    (*A3[1]) *= g3c;
+    (*A3[2]) *= g3c;
+
+    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+
+    ///////////////////////////////////////////////////////////////////////
+    // we store the densities here if we compute them
+    //======================================================================
+
+    // Testing
+    const std::string testing = the_config.get_value_safe<std::string>("testing", "test", "none");
+
+    if (testing != "none")
+    {
+        music::wlog << "you are running in testing mode. No ICs, only diagnostic output will be written out!" << std::endl;
+        if (testing == "potentials_and_densities"){
+            testing::output_potentials_and_densities(the_config, ngrid, boxlen, phi, phi2, phi3a, phi3b, A3);
+        }
+        else if (testing == "velocity_displacement_symmetries"){
+            testing::output_velocity_displacement_symmetries(the_config, ngrid, boxlen, vfac, Dplus0, phi, phi2, phi3a, phi3b, A3);
+        }
+        else if (testing == "convergence"){
+            testing::output_convergence(the_config, the_cosmo_calc.get(), ngrid, boxlen, vfac, Dplus0, phi, phi2, phi3a, phi3b, A3);
+        }
+        else{
+            music::flog << "unknown test '" << testing << "'" << std::endl;
+            std::abort();
+        }
+    }

    for( auto& this_species : species_list )
    {
-        csoca::ilog << std::endl
+        music::ilog << std::endl
                    << ">>> Computing ICs for species \'" << cosmo_species_name[this_species] << "\' <<<\n" << std::endl;

-        //======================================================================
-        //... compute 1LPT displacement potential ....
-        //======================================================================
-        // phi = - delta / k^2
-        double wtime = get_wtime();
-        csoca::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(1) term" << std::flush;
-
-        #if 1 //  random ICs
-        //--------------------------------------------------------------------
-        // Fill the grid with a Gaussian white noise field
-        //--------------------------------------------------------------------
-        the_random_number_generator->Fill_Grid( phi );
-
-        phi.FourierTransformForward();
-
-        phi.apply_function_k_dep([&](auto x, auto k) -> ccomplex_t {
-            real_t kmod = k.norm();
-            if( bDoFixing ) x = (std::abs(x)!=0.0)? x / std::abs(x) : x; 
-            ccomplex_t delta = x * the_cosmo_calc->GetAmplitude(kmod, total);
-            return -delta / (kmod * kmod) / volfac;
-        });
-
-        phi.zero_DC_mode();
-        #else // ICs with a given phi(1) potential function
-        constexpr real_t twopi{2.0*M_PI};
-        constexpr real_t epsilon_q1d{0.25};
-
-        constexpr real_t epsy{0.25};
-        constexpr real_t epsz{0.0};//epsz{0.25};
-        
-        phi.FourierTransformBackward(false);
-
-        phi.apply_function_r_dep([&](auto v, auto r) -> real_t {
-            real_t q1 = r[0]-0.5*boxlen;//r[0]/boxlen * twopi - M_PI;
-            real_t q2 = r[1]-0.5*boxlen;//r[1]/boxlen * twopi - M_PI;
-            real_t q3 = r[2]-0.5*boxlen;//r[1]/boxlen * twopi - M_PI;
-
-            // std::cerr << q1  << " " << q2 << std::endl;
-            
-            return -2.0*std::cos(q1+std::cos(q2));
-            // return (-std::cos(q1) + epsilon_q1d * std::sin(q2));
-            // return (-std::cos(q1) + epsy * std::sin(q2) + epsz * std::cos(q1) * std::sin(q3));
-        });
-        phi.FourierTransformForward();
-
-
-        #endif
-        csoca::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime()-wtime << "s" << std::endl;
-
-        //======================================================================
-        //... compute 2LPT displacement potential ....
-        //======================================================================
-        if( LPTorder > 1 ){
-            wtime = get_wtime();
-            csoca::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(2) term" << std::flush;
-            phi2.FourierTransformForward(false);
-            Conv.convolve_SumOfHessians( phi, {0,0}, phi, {1,1}, {2,2}, op::assign_to( phi2 ) );
-            Conv.convolve_Hessians( phi, {1,1}, phi, {2,2}, op::add_to(phi2) );
-            Conv.convolve_Hessians( phi, {0,1}, phi, {0,1}, op::subtract_from(phi2) );
-            Conv.convolve_Hessians( phi, {0,2}, phi, {0,2}, op::subtract_from(phi2) );
-            Conv.convolve_Hessians( phi, {1,2}, phi, {1,2}, op::subtract_from(phi2) );
-
-            if( bAddExternalTides ){
-                phi2.assign_function_of_grids_kdep([&]( vec3<real_t> kvec, ccomplex_t pphi, ccomplex_t pphi2 ){
-                    // sign in front of f_aniso is reversed since phi1 = -phi
-                    return pphi2 + f_aniso * (kvec[0]*kvec[0]*lss_aniso_lambda[0]+kvec[1]*kvec[1]*lss_aniso_lambda[1]+kvec[2]*kvec[2]*lss_aniso_lambda[2])*pphi;
-                }, phi, phi2 );
-            }
-
-            phi2.apply_InverseLaplacian();
-            csoca::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime()-wtime << "s" << std::endl;
-
-            if( bAddExternalTides ){
-                csoca::wlog << "Added external tide contribution to phi(2)... Make sure your N-body code supports this!" << std::endl;
-                csoca::wlog << " lss_aniso = (" << lss_aniso_lambda[0] << ", " << lss_aniso_lambda[1] << ", " << lss_aniso_lambda[2] << ")" << std::endl;
-            }
-        }
-
-        //======================================================================
-        //... compute 3LPT displacement potential
-        //======================================================================
-        if( LPTorder > 2 ){
-            //... 3a term ...
-            wtime = get_wtime();
-            csoca::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(3a) term" << std::flush;
-            phi3a.FourierTransformForward(false);
-            Conv.convolve_Hessians( phi, {0,0}, phi, {1,1}, phi, {2,2}, op::assign_to(phi3a) );
-            Conv.convolve_Hessians( phi, {0,1}, phi, {0,2}, phi, {1,2}, op::add_twice_to(phi3a) );
-            Conv.convolve_Hessians( phi, {1,2}, phi, {1,2}, phi, {0,0}, op::subtract_from(phi3a) );
-            Conv.convolve_Hessians( phi, {0,2}, phi, {0,2}, phi, {1,1}, op::subtract_from(phi3a) );
-            Conv.convolve_Hessians( phi, {0,1}, phi, {0,1}, phi, {2,2}, op::subtract_from(phi3a) );
-            phi3a.apply_InverseLaplacian();
-            csoca::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime()-wtime << "s" << std::endl;
-
-            //... 3b term ...
-            wtime = get_wtime();
-            csoca::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(3b) term" << std::flush;
-            phi3b.FourierTransformForward(false);
-            Conv.convolve_SumOfHessians( phi, {0,0}, phi2, {1,1}, {2,2}, op::assign_to(phi3b) );
-            Conv.convolve_SumOfHessians( phi, {1,1}, phi2, {2,2}, {0,0}, op::add_to(phi3b) );
-            Conv.convolve_SumOfHessians( phi, {2,2}, phi2, {0,0}, {1,1}, op::add_to(phi3b) );
-            Conv.convolve_Hessians( phi, {0,1}, phi2, {0,1}, op::subtract_twice_from(phi3b) );
-            Conv.convolve_Hessians( phi, {0,2}, phi2, {0,2}, op::subtract_twice_from(phi3b) );
-            Conv.convolve_Hessians( phi, {1,2}, phi2, {1,2}, op::subtract_twice_from(phi3b) );
-            phi3b.apply_InverseLaplacian();
-            phi3b *= 0.5; // factor 1/2 from definition of phi(3b)!
-            csoca::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime()-wtime << "s" << std::endl;
-
-            //... transversal term ...
-            wtime = get_wtime();
-            csoca::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing A(3) term" << std::flush;
-            for( int idim=0; idim<3; ++idim ){
-                // cyclic rotations of indices
-                int idimp = (idim+1)%3, idimpp = (idim+2)%3;
-                A3[idim]->FourierTransformForward(false);
-                Conv.convolve_Hessians( phi2, {idim,idimp},  phi, {idim,idimpp}, op::assign_to(*A3[idim]) );
-                Conv.convolve_Hessians( phi2, {idim,idimpp}, phi, {idim,idimp},  op::subtract_from(*A3[idim]) );
-                Conv.convolve_DifferenceOfHessians( phi, {idimp,idimpp}, phi2,{idimp,idimp}, {idimpp,idimpp}, op::add_to(*A3[idim]) );
-                Conv.convolve_DifferenceOfHessians( phi2,{idimp,idimpp}, phi, {idimp,idimp}, {idimpp,idimpp}, op::subtract_from(*A3[idim]) );
-                A3[idim]->apply_InverseLaplacian();
-            }
-            csoca::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime()-wtime << "s" << std::endl;
-        }
-
-        // if( bSymplecticPT ){
-        //     //... transversal term ...
-        //     wtime = get_wtime();
-        //     csoca::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing vNLO(3) term" << std::flush;
-        //     for( int idim=0; idim<3; ++idim ){
-        //         // cyclic rotations of indices
-        //         A3[idim]->FourierTransformForward(false);
-        //         Conv.convolve_Gradient_and_Hessian( phi, {0},  phi2, {idim,0}, assign_to(*A3[idim]) );
-        //         Conv.convolve_Gradient_and_Hessian( phi, {1},  phi2, {idim,1}, add_to(*A3[idim]) );
-        //         Conv.convolve_Gradient_and_Hessian( phi, {2},  phi2, {idim,2}, add_to(*A3[idim]) );
-        //     }
-        //     csoca::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime()-wtime << "s" << std::endl;
-
-        // }
-
-        ///... scale all potentials with respective growth factors
-        phi *= g1;
-        phi2 *= g2;
-        phi3a *= g3a;
-        phi3b *= g3b;
-        (*A3[0]) *= g3c;
-        (*A3[1]) *= g3c;
-        (*A3[2]) *= g3c;
-
-        csoca::ilog << "-------------------------------------------------------------------------------" << std::endl;
-
-        ///////////////////////////////////////////////////////////////////////
-        // we store the densities here if we compute them
-        //======================================================================
-
-        // Testing
-        const std::string testing = the_config.GetValueSafe<std::string>("testing", "test", "none");
-
-        if(testing != "none") {
-            csoca::wlog << "you are running in testing mode. No ICs, only diagnostic output will be written out!" << std::endl;
-            if(testing == "potentials_and_densities") {
-                testing::output_potentials_and_densities(the_config, ngrid, boxlen, phi, phi2, phi3a, phi3b, A3);
-            } else if(testing == "velocity_displacement_symmetries") {
-                testing::output_velocity_displacement_symmetries(the_config, ngrid, boxlen, vfac, Dplus0, phi, phi2, phi3a, phi3b, A3);
-            } else if(testing == "convergence") {
-                testing::output_convergence(the_config, ngrid, boxlen, vfac, Dplus0, phi, phi2, phi3a, phi3b, A3);
-            } else {
-                csoca::flog << "unknown test '" << testing << "'" << std::endl;
-                std::abort();
-        }
-        } else {
+        {
            // temporary storage of data
            Grid_FFT<real_t> tmp({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});

+            std::unique_ptr<particle::lattice_generator<Grid_FFT<real_t>>> particle_lattice_generator_ptr;
+
+            // if output plugin wants particles, then we need to store them, along with their IDs
+            if( the_output_plugin->write_species_as( this_species ) == output_type::particles )
+            {
+                // somewhat arbitrarily, start baryon particle IDs from 2**31 if we have 32bit and from 2**56 if we have 64 bits
+                size_t IDoffset = (this_species == cosmo_species::baryon)? ((the_output_plugin->has_64bit_ids())? 1ul<<56 : 1ul<<31): 0 ;
+
+                // allocate particle structure and generate particle IDs
+                particle_lattice_generator_ptr = 
+                std::make_unique<particle::lattice_generator<Grid_FFT<real_t>>>( lattice_type, the_output_plugin->has_64bit_reals(), the_output_plugin->has_64bit_ids(), IDoffset, tmp, the_config );
+            }
+

            //if( the_output_plugin->write_species_as( cosmo_species::dm ) == output_type::field_eulerian ){
            if( the_output_plugin->write_species_as(this_species) == output_type::field_eulerian )
@ -362,7 +482,7 @@ int Run( ConfigFile& the_config )
                real_t std_phi1 = phi.std();

                const real_t hbar = 2.0 * M_PI/ngrid * (2*std_phi1/Dplus0); //3sigma, but this might rather depend on gradients of phi...
-                csoca::ilog << "Semiclassical PT : hbar = " << hbar << " from sigma(phi1) = " << std_phi1 << std::endl;
+                music::ilog << "Semiclassical PT : hbar = " << hbar << " from sigma(phi1) = " << std_phi1 << std::endl;
                
                if( LPTorder == 1 ){
                    psi.assign_function_of_grids_r([hbar,Dplus0]( real_t pphi ){
@ -435,14 +555,21 @@ int Run( ConfigFile& the_config )
                //===================================================================================
                // we store displacements and velocities here if we compute them
                //===================================================================================
-                particle::container particles;
+                
+
+                bool shifted_lattice = (this_species == cosmo_species::baryon &&
+                                        the_output_plugin->write_species_as(this_species) == output_type::particles) ? true : false;
+
+                
+
+                grid_interpolate<1,Grid_FFT<real_t>> interp( tmp );

                // if output plugin wants particles, then we need to store them, along with their IDs
-                if( the_output_plugin->write_species_as( this_species ) == output_type::particles )
-                {
-                    // allocate particle structure and generate particle IDs
-                    particle::initialize_lattice( particles, lattice_type, tmp );
-                }
+                // if( the_output_plugin->write_species_as( this_species ) == output_type::particles )
+                // {
+                //     // allocate particle structure and generate particle IDs
+                //     particle::initialize_lattice( particles, lattice_type, the_output_plugin->has_64bit_reals(), the_output_plugin->has_64bit_ids(), IDoffset, tmp, the_config );
+                // }
            
                // write out positions
                for( int idim=0; idim<3; ++idim ){
@ -459,17 +586,37 @@ int Run( ConfigFile& the_config )
                                size_t idx = phi.get_idx(i,j,k);
                                auto phitot = phi.kelem(idx) + phi2.kelem(idx) + phi3a.kelem(idx) + phi3b.kelem(idx);
                                // divide by Lbox, because displacement is in box units for output plugin
-                                tmp.kelem(idx) = lunit / boxlen * ( phi.gradient(idim,{i,j,k}) * phitot 
-                                    + phi.gradient(idimp,{i,j,k}) * A3[idimpp]->kelem(idx) - phi.gradient(idimpp,{i,j,k}) * A3[idimp]->kelem(idx) );
+                                tmp.kelem(idx) = lunit / boxlen * ( lg.gradient(idim,tmp.get_k3(i,j,k)) * phitot 
+                                    + lg.gradient(idimp,tmp.get_k3(i,j,k)) * A3[idimpp]->kelem(idx) - lg.gradient(idimpp,tmp.get_k3(i,j,k)) * A3[idimp]->kelem(idx) );
+
+                                if( the_output_plugin->write_species_as( this_species ) == output_type::particles && lattice_type == particle::lattice_glass){
+                                    tmp.kelem(idx) *= interp.compensation_kernel( tmp.get_k<real_t>(i,j,k) );
+                                }
+
+                                if( bDoBaryons ){
+                                    vec3_t<real_t> kvec = phi.get_k<real_t>(i,j,k);
+                                    real_t k2 = kvec.norm_squared(), kmod = std::sqrt(k2);
+                                    // double ampldiff = ((this_species == cosmo_species::dm)? the_cosmo_calc->get_amplitude(kmod, cdm) :
+                                    //  (this_species == cosmo_species::baryon)? the_cosmo_calc->get_amplitude(kmod, baryon) : 
+                                    // //   the_cosmo_calc->get_amplitude(kmod, total)) - the_cosmo_calc->get_amplitude(kmod, total);
+                                    //  the_cosmo_calc->get_amplitude(kmod, total)*(-g1)) - the_cosmo_calc->get_amplitude(kmod, total)*(-g1);
+
+                                    real_t ampldiff = (((this_species == cosmo_species::dm)? the_cosmo_calc->get_amplitude(kmod, cdm) 
+                                        : (this_species == cosmo_species::baryon)? the_cosmo_calc->get_amplitude(kmod, baryon) : 
+                                           the_cosmo_calc->get_amplitude(kmod, total)) - the_cosmo_calc->get_amplitude(kmod, total)) * (-g1);
+
+                                    tmp.kelem(idx) += lg.gradient(idim, tmp.get_k3(i,j,k)) * wnoise.kelem(idx) * lunit * ampldiff / k2 / boxlen;
+                                }
                            }
                        }
                    }
+                    tmp.zero_DC_mode();
                    tmp.FourierTransformBackward();

                    // if we write particle data, store particle data in particle structure
                    if( the_output_plugin->write_species_as( this_species ) == output_type::particles )
                    {
-                        particle::set_positions( particles, lattice_type, idim, lunit, tmp );
+                        particle_lattice_generator_ptr->set_positions( lattice_type, shifted_lattice, idim, lunit, the_output_plugin->has_64bit_reals(), tmp, the_config );
                    } 
                    // otherwise write out the grid data directly to the output plugin
                    // else if( the_output_plugin->write_species_as( cosmo_species::dm ) == output_type::field_lagrangian )
@ -496,8 +643,29 @@ int Run( ConfigFile& the_config )
                                // divide by Lbox, because displacement is in box units for output plugin
                                auto phitot_v = vfac1 * phi.kelem(idx) + vfac2 * phi2.kelem(idx) + vfac3 * (phi3a.kelem(idx) + phi3b.kelem(idx));

-                                tmp.kelem(idx) = vunit / boxlen * ( phi.gradient(idim,{i,j,k}) * phitot_v 
-                                        + vfac3 * (phi.gradient(idimp,{i,j,k}) * A3[idimpp]->kelem(idx) - phi.gradient(idimpp,{i,j,k}) * A3[idimp]->kelem(idx)) );
+                                tmp.kelem(idx) = vunit / boxlen * ( lg.gradient(idim,tmp.get_k3(i,j,k)) * phitot_v 
+                                        + vfac3 * (lg.gradient(idimp,tmp.get_k3(i,j,k)) * A3[idimpp]->kelem(idx) - lg.gradient(idimpp,tmp.get_k3(i,j,k)) * A3[idimp]->kelem(idx)) );
+
+                                if( the_output_plugin->write_species_as( this_species ) == output_type::particles && lattice_type == particle::lattice_glass){
+                                    tmp.kelem(idx) *= interp.compensation_kernel( tmp.get_k<real_t>(i,j,k) );
+                                }
+
+                                if( bDoBaryons ){
+                                    vec3_t<real_t> kvec = phi.get_k<real_t>(i,j,k);
+                                    real_t k2 = kvec.norm_squared(), kmod = std::sqrt(k2);
+                                    // double ampldiff = ((this_species == cosmo_species::dm)? the_cosmo_calc->get_amplitude(kmod, vcdm0) :
+                                    //  (this_species == cosmo_species::baryon)? the_cosmo_calc->get_amplitude(kmod, vbaryon0) : 
+                                    //      the_cosmo_calc->get_amplitude(kmod, vtotal0)) - the_cosmo_calc->get_amplitude(kmod, vtotal0);
+                                    // // the_cosmo_calc->get_amplitude(kmod, total)*(-g1)) - the_cosmo_calc->get_amplitude(kmod, total)*(-g1);
+                                    real_t ampldiff = (((this_species == cosmo_species::dm)? the_cosmo_calc->get_amplitude(kmod, vcdm) 
+                                        : (this_species == cosmo_species::baryon)? the_cosmo_calc->get_amplitude(kmod, vbaryon) : 
+                                           the_cosmo_calc->get_amplitude(kmod, vtotal)) - the_cosmo_calc->get_amplitude(kmod, vtotal)) * (-g1);
+                                    tmp.kelem(idx) += lg.gradient(idim, tmp.get_k3(i,j,k)) * wnoise.kelem(idx) * vfac1 * vunit / boxlen * ampldiff / k2 ;
+                                }
+
+                                // correct velocity with PLT mode growth rate
+                                tmp.kelem(idx) *= lg.vfac_corr(tmp.get_k3(i,j,k));
+

                                if( bAddExternalTides ){
                                    // modify velocities with anisotropic expansion factor**2
@ -510,12 +678,13 @@ int Run( ConfigFile& the_config )
                            }
                        }
                    }
+                    tmp.zero_DC_mode();
                    tmp.FourierTransformBackward();

                    // if we write particle data, store particle data in particle structure
                    if( the_output_plugin->write_species_as( this_species ) == output_type::particles )
                    {
-                        particle::set_velocities( particles, lattice_type, idim, tmp );
+                        particle_lattice_generator_ptr->set_velocities( lattice_type, shifted_lattice, idim, the_output_plugin->has_64bit_reals(), tmp, the_config );
                    }
                    // otherwise write out the grid data directly to the output plugin
                    else if( the_output_plugin->write_species_as( this_species ) == output_type::field_lagrangian )
@ -527,7 +696,7 @@ int Run( ConfigFile& the_config )

                if( the_output_plugin->write_species_as( this_species ) == output_type::particles )
                {
-                    the_output_plugin->write_particle_data( particles, this_species );
+                    the_output_plugin->write_particle_data( particle_lattice_generator_ptr->get_particles(), this_species, Omega[this_species] );
                }
                
                if( the_output_plugin->write_species_as( this_species ) == output_type::field_lagrangian )
--- a/src/logger.cc
+++ b/src/logger.cc
@ -1,19 +1,19 @@
 #include <logger.hh>

-namespace csoca {
+namespace music {

-std::ofstream Logger::output_file_;
-LogLevel Logger::log_level_ = LogLevel::Off;
+std::ofstream logger::output_file_;
+log_level logger::log_level_ = log_level::off;

-void Logger::SetLevel(const LogLevel &level) {
+// Store `level` in the static logger::log_level_ member.
+void logger::set_level(const log_level &level) {
  log_level_ = level;
 }

-LogLevel Logger::GetLevel() {
+// Return the current value of the static logger::log_level_ member.
+log_level logger::get_level() {
  return log_level_;
 }

-void Logger::SetOutput(const std::string filename) {
+void logger::set_output(const std::string filename) {
  if (output_file_.is_open()) {
    output_file_.close();
  }
@ -21,22 +21,22 @@ void Logger::SetOutput(const std::string filename) {
  assert(output_file_.is_open());
 }

-void Logger::UnsetOutput() {
+// Close the log file stream if it is currently open; otherwise do nothing.
+void logger::unset_output() {
  if (output_file_.is_open()) {
    output_file_.close();
  }
 }

-std::ofstream &Logger::GetOutput() {
+// Return a reference to the static output file stream, whether or not it is open.
+std::ofstream &logger::get_output() {
  return output_file_;
 }

 // global instantiations for different levels
-Logger glogger;
-LogStream flog(glogger, LogLevel::Fatal);
-LogStream elog(glogger, LogLevel::Error);
-LogStream wlog(glogger, LogLevel::Warning);
-LogStream ilog(glogger, LogLevel::Info);
-LogStream dlog(glogger, LogLevel::Debug);
+logger the_logger;
+log_stream flog(the_logger, log_level::fatal);
+log_stream elog(the_logger, log_level::error);
+log_stream wlog(the_logger, log_level::warning);
+log_stream ilog(the_logger, log_level::info);
+log_stream dlog(the_logger, log_level::debug);

-} // namespace csoca
+} // namespace music
--- a/src/main.cc
+++ b/src/main.cc
@ -3,6 +3,7 @@
 #include <iostream>
 #include <fstream>
 #include <thread>
+#include <cfenv>

 #if defined(_OPENMP)
 #include <omp.h>
@ -10,6 +11,7 @@

 #include <general.hh>
 #include <ic_generator.hh>
+#include <particle_plt.hh>


 // initialise with "default" values
@ -26,10 +28,28 @@ int  num_threads = 1;

 #include "system_stat.hh"

+#include <exception>
+#include <stdexcept>
+ 
+// Report the exception held by `eptr` (if any) on the error log.
+//
+// A null `eptr` is a no-op. Exceptions derived from std::exception are logged
+// with their what() message; any other exception type is logged generically,
+// so that nothing propagates out of this function into the handler calling it.
+//
+// eptr : exception to report, e.g. obtained from std::current_exception()
+void handle_eptr(std::exception_ptr eptr) // passing by value is ok
+{
+    try {
+        if (eptr) {
+            std::rethrow_exception(eptr);
+        }
+    } catch(const std::exception& e) {
+        music::elog << "This happened: \"" << e.what() << "\"" << std::endl;
+    } catch(...) {
+        // not derived from std::exception: there is no what() to print, but the
+        // exception must not escape, or the caller's clean-up would be skipped
+        music::elog << "This happened: unknown (non-standard) exception" << std::endl;
+    }
+}
+
 int main( int argc, char** argv )
 {
-    csoca::Logger::SetLevel(csoca::LogLevel::Info);
-    // csoca::Logger::SetLevel(csoca::LogLevel::Debug);
+
+#if defined(NDEBUG)
+    music::logger::set_level(music::log_level::info);
+#else
+    music::logger::set_level(music::log_level::debug);
+#endif

    //------------------------------------------------------------------------------
    // initialise MPI 
@ -45,19 +65,38 @@ int main( int argc, char** argv )
    // set up lower logging levels for other tasks
    if( CONFIG::MPI_task_rank!=0 )
    {
-        csoca::Logger::SetLevel(csoca::LogLevel::Error);
+        music::logger::set_level(music::log_level::error);
    }
 #endif

-    csoca::ilog << "\n"
-                << " unigrid MUSIC                          .8888b                   dP  a88888b. \n"
+    // ASCII-art logo, generated via http://patorjk.com/software/taag/#p=display&f=Nancyj&t=monofonIC
+    music::ilog << "\n"
+                << " The unigrid version of MUSIC-2         .8888b                   dP  a88888b. \n"
                << "                                        88   \"                   88 d8\'   `88 \n"
                << "  88d8b.d8b. .d8888b. 88d888b. .d8888b. 88aaa  .d8888b. 88d888b. 88 88        \n"
                << "  88\'`88\'`88 88\'  `88 88\'  `88 88\'  `88 88     88\'  `88 88\'  `88 88 88        \n"
                << "  88  88  88 88.  .88 88    88 88.  .88 88     88.  .88 88    88 88 Y8.   .88 \n"
-                << "  dP  dP  dP `88888P\' dP    dP `88888P\' dP     `88888P\' dP    dP dP  Y88888P\' \n" << std::endl
-                << "version  : v0.1a, git rev. : " << GIT_REV << ", tag: " << GIT_TAG << ", branch: " << GIT_BRANCH << std::endl
-                << "-------------------------------------------------------------------------------" << std::endl;
+                << "  dP  dP  dP `88888P\' dP    dP `88888P\' dP     `88888P\' dP    dP dP  Y88888P\' \n" << std::endl;
+
+    // git and versioning info:
+    music::ilog << "Version: v0.1a, git rev.: " << GIT_REV << ", tag: " << GIT_TAG << ", branch: " << GIT_BRANCH << std::endl;
+    
+    // Compilation CMake configuration, time etc info:
+    music::ilog << "This " << CMAKE_BUILDTYPE_STR << " build was compiled at " << __TIME__ << " on " <<  __DATE__ << std::endl;
+
+    // Report the compiler used for this build. __VERSION__ is a GCC-style
+    // predefined macro, so it is only referenced when it is actually defined.
+#if defined(__GNUC__)
+    music::ilog << "Compiled with GNU C++ version " << __VERSION__ <<std::endl;
+#elif defined(__VERSION__)
+    music::ilog << "Compiled with " << __VERSION__ << std::endl;
+#else
+    music::ilog << "Compiled with an unidentified compiler" << std::endl;
+#endif
+
+    
+    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+    music::ilog << "Compile time options : " << std::endl;
+    music::ilog << "                       Precision : " << CMAKE_PRECISION_STR << std::endl;
+    music::ilog << "                    Convolutions : " << CMAKE_CONVOLVER_STR << std::endl;
+    music::ilog << "                             PLT : " << CMAKE_PLT_STR << std::endl;
+    music::ilog << "-------------------------------------------------------------------------------" << std::endl;


    //------------------------------------------------------------------------------
@ -71,12 +110,12 @@ int main( int argc, char** argv )
        print_RNG_plugins();
        print_output_plugins();

-        csoca::elog << "In order to run, you need to specify a parameter file!" << std::endl;
+        music::elog << "In order to run, you need to specify a parameter file!\n" << std::endl;
        exit(0);
    }

    // open the configuration file 
-    ConfigFile the_config(argv[1]);
+    config_file the_config(argv[1]);

    //------------------------------------------------------------------------------
    // Set up FFTW
@ -95,7 +134,7 @@ int main( int argc, char** argv )
    FFTW_API(mpi_init)();
 #endif

-    CONFIG::num_threads = the_config.GetValueSafe<unsigned>("execution", "NumThreads",std::thread::hardware_concurrency());
+    CONFIG::num_threads = the_config.get_value_safe<unsigned>("execution", "NumThreads",std::thread::hardware_concurrency());
    
 #if defined(USE_FFTW_THREADS)
    if (CONFIG::FFTW_threads_ok)
@ -110,14 +149,16 @@ int main( int argc, char** argv )
    omp_set_num_threads(CONFIG::num_threads);
 #endif

+    // std::feclearexcept(FE_ALL_EXCEPT);
+
    //------------------------------------------------------------------------------
    // Write code configuration to screen
    //------------------------------------------------------------------------------
    // hardware related infos
-    csoca::ilog << std::setw(32) << std::left << "CPU vendor string" << " : " << SystemStat::Cpu().get_CPUstring() << std::endl;
+    music::ilog << std::setw(32) << std::left << "CPU vendor string" << " : " << SystemStat::Cpu().get_CPUstring() << std::endl;
    
    // multi-threading related infos
-    csoca::ilog << std::setw(32) << std::left << "Available HW threads / task" << " : " << std::thread::hardware_concurrency() << " (" << CONFIG::num_threads << " used)" << std::endl;
+    music::ilog << std::setw(32) << std::left << "Available HW threads / task" << " : " << std::thread::hardware_concurrency() << " (" << CONFIG::num_threads << " used)" << std::endl;

    // memory related infos
    SystemStat::Memory mem;
@ -134,34 +175,34 @@ int main( int argc, char** argv )
    MPI_Allreduce(&minupmem,&temp,1,MPI_UNSIGNED,MPI_MIN,MPI_COMM_WORLD); minupmem = temp;
    MPI_Allreduce(&maxupmem,&temp,1,MPI_UNSIGNED,MPI_MAX,MPI_COMM_WORLD); maxupmem = temp;
 #endif
-    csoca::ilog << std::setw(32) << std::left << "Total system memory (phys)" << " : " << mem.get_TotalMem()/1024/1024 << " Mb" << std::endl;
-    csoca::ilog << std::setw(32) << std::left << "Used system memory (phys)" << " : " << "Max: " << maxupmem << " Mb, Min: " << minupmem << " Mb" << std::endl;
-    csoca::ilog << std::setw(32) << std::left << "Available system memory (phys)" << " : " <<  "Max: " << maxpmem << " Mb, Min: " << minpmem << " Mb" << std::endl;
+    music::ilog << std::setw(32) << std::left << "Total system memory (phys)" << " : " << mem.get_TotalMem()/1024/1024 << " Mb" << std::endl;
+    music::ilog << std::setw(32) << std::left << "Used system memory (phys)" << " : " << "Max: " << maxupmem << " Mb, Min: " << minupmem << " Mb" << std::endl;
+    music::ilog << std::setw(32) << std::left << "Available system memory (phys)" << " : " <<  "Max: " << maxpmem << " Mb, Min: " << minpmem << " Mb" << std::endl;
    
    // MPI related infos
 #if defined(USE_MPI)
-    csoca::ilog << std::setw(32) << std::left << "MPI is enabled" << " : " << "yes (" << CONFIG::MPI_task_size << " tasks)" << std::endl;
-    csoca::dlog << std::setw(32) << std::left << "MPI version" << " : " << GetMPIversion() << std::endl;
+    music::ilog << std::setw(32) << std::left << "MPI is enabled" << " : " << "yes (" << CONFIG::MPI_task_size << " tasks)" << std::endl;
+    music::dlog << std::setw(32) << std::left << "MPI version" << " : " << MPI::get_version() << std::endl;
 #else
-    csoca::ilog << std::setw(32) << std::left << "MPI is enabled" << " : " << "no" << std::endl;
+    music::ilog << std::setw(32) << std::left << "MPI is enabled" << " : " << "no" << std::endl;
 #endif
-    csoca::ilog << std::setw(32) << std::left << "MPI supports multi-threading" << " : " << (CONFIG::MPI_threads_ok? "yes" : "no") << std::endl;
+    music::ilog << std::setw(32) << std::left << "MPI supports multi-threading" << " : " << (CONFIG::MPI_threads_ok? "yes" : "no") << std::endl;
    
    // Kernel related infos
    SystemStat::Kernel kern;
    auto kinfo = kern.get_kernel_info();
-    csoca::ilog << std::setw(32) << std::left << "OS/Kernel version" << " : " << kinfo.kernel << " version " << kinfo.major << "." << kinfo.minor << " build " << kinfo.build_number << std::endl;
+    music::ilog << std::setw(32) << std::left << "OS/Kernel version" << " : " << kinfo.kernel << " version " << kinfo.major << "." << kinfo.minor << " build " << kinfo.build_number << std::endl;

    // FFTW related infos
-    csoca::ilog << std::setw(32) << std::left << "FFTW version" << " : " << fftw_version << std::endl;
-    csoca::ilog << std::setw(32) << std::left << "FFTW supports multi-threading" << " : " << (CONFIG::FFTW_threads_ok? "yes" : "no") << std::endl;
-    csoca::ilog << std::setw(32) << std::left << "FFTW mode" << " : ";
+    music::ilog << std::setw(32) << std::left << "FFTW version" << " : " << fftw_version << std::endl;
+    music::ilog << std::setw(32) << std::left << "FFTW supports multi-threading" << " : " << (CONFIG::FFTW_threads_ok? "yes" : "no") << std::endl;
+    music::ilog << std::setw(32) << std::left << "FFTW mode" << " : ";
 #if defined(FFTW_MODE_PATIENT)
-	csoca::ilog << "FFTW_PATIENT" << std::endl;
+	music::ilog << "FFTW_PATIENT" << std::endl;
 #elif defined(FFTW_MODE_MEASURE)
-    csoca::ilog << "FFTW_MEASURE" << std::endl;
+    music::ilog << "FFTW_MEASURE" << std::endl;
 #else
-	csoca::ilog << "FFTW_ESTIMATE" << std::endl;
+	music::ilog << "FFTW_ESTIMATE" << std::endl;
 #endif
    //--------------------------------------------------------------------
    // Initialise plug-ins
@ -170,7 +211,8 @@ int main( int argc, char** argv )
    {
        ic_generator::Initialise( the_config );
    }catch(...){
-        csoca::elog << "Problem during initialisation. See error(s) above. Exiting..." << std::endl;
+        handle_eptr( std::current_exception() );
+        music::elog << "Problem during initialisation. See error(s) above. Exiting..." << std::endl;
        #if defined(USE_MPI) 
        MPI_Finalize();
        #endif
@ -181,6 +223,8 @@ int main( int argc, char** argv )
    // do the job...
    ///////////////////////////////////////////////////////////////////////
    ic_generator::Run( the_config );
+
+    // particle::test_plt();
    ///////////////////////////////////////////////////////////////////////

 #if defined(USE_MPI)
@ -188,8 +232,8 @@ int main( int argc, char** argv )
    MPI_Finalize();
 #endif

-    csoca::ilog << "-------------------------------------------------------------------------------" << std::endl;
-    csoca::ilog << "Done." << std::endl;
+    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+    music::ilog << "Done. Have a nice day!\n" << std::endl;

    return 0;
 }
--- a/src/output_plugin.cc
+++ b/src/output_plugin.cc
@ -23,31 +23,32 @@ void print_output_plugins()
 	
 	std::map< std::string, output_plugin_creator *>::iterator it;
 	it = m.begin();
-	csoca::ilog << "Available output plug-ins:\n";
+	music::ilog << "Available output plug-ins:\n";
 	while( it!=m.end() )
 	{
 		if( it->second )
-			csoca::ilog << "\t\'" << it->first << "\'\n";
+			music::ilog << "\t\'" << it->first << "\'\n";
 		++it;
 	}
+	music::ilog << std::endl;
 }

-std::unique_ptr<output_plugin> select_output_plugin( ConfigFile& cf )
+std::unique_ptr<output_plugin> select_output_plugin( config_file& cf )
 {
-	std::string formatname = cf.GetValue<std::string>( "output", "format" );
+	std::string formatname = cf.get_value<std::string>( "output", "format" );
 	
 	output_plugin_creator *the_output_plugin_creator 
 	= get_output_plugin_map()[ formatname ];
 	
 	if( !the_output_plugin_creator )
 	{	
-		csoca::elog << "Error: output plug-in \'" << formatname << "\' not found." << std::endl;
+		music::elog << "Output plug-in \'" << formatname << "\' not found." << std::endl;
 		print_output_plugins();
 		throw std::runtime_error("Unknown output plug-in");
 		
 	}else{
-		csoca::ilog << "-------------------------------------------------------------------------------" << std::endl;
-        csoca::ilog << std::setw(32) << std::left << "Output plugin" << " : " << formatname << std::endl;
+		music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+		music::ilog << std::setw(32) << std::left << "Output plugin" << " : " << formatname << std::endl;
 	}
 	
 	return std::move(the_output_plugin_creator->create( cf ));
--- a/src/plugins/output_arepo.cc
+++ b/src/plugins/output_arepo.cc
@ -0,0 +1,241 @@
+
+#ifdef USE_HDF5
+#include <unistd.h> // for unlink
+#include <output_plugin.hh>
+#include "HDF_IO.hh"
+
+//! Copy the six consecutive elements starting at \p a into a newly created vector.
+/*!
+ *  \param a  pointer to at least six readable elements of type T
+ *  \return   vector holding a[0] ... a[5] in that order
+ */
+template <typename T>
+std::vector<T> from_6array(const T *a)
+{
+  // iterator-range constructor: copies [a, a+6)
+  return std::vector<T>(a, a + 6);
+}
+
+//! Wrap a single value in a vector of length one.
+/*!
+ *  \param a  value to store
+ *  \return   vector whose only element is a copy of \p a
+ */
+template <typename T>
+std::vector<T> from_value(const T a)
+{
+  // fill constructor: one element, initialised with a
+  return std::vector<T>(1, a);
+}
+
+//! Output plug-in that writes particle data and a snapshot header to an HDF5 file for AREPO.
+/*!
+ *  The constructor creates an (empty) HDF5 file, every call to write_particle_data()
+ *  adds one "PartType<N>" group and records the per-species particle counts and masses,
+ *  and the destructor finally writes the "Header" group with all accumulated values.
+ *
+ *  The class deliberately has its own name: output_gadget_hdf5.cc defines a class template
+ *  called gadget_hdf5_output_plugin with a different set of members. Two different
+ *  definitions of the same global class template in one program violate the
+ *  One-Definition-Rule, so the linker would be free to merge their member functions.
+ *
+ *  \tparam write_real_t  floating point type; `double` selects the 64 bit position and
+ *                        velocity arrays, anything else the 32 bit ones
+ */
+template <typename write_real_t>
+class arepo_output_plugin : public output_plugin
+{
+  //! values collected for the "Header" group (not all members are written to file)
+  struct header_t
+  {
+    unsigned npart[6];
+    double mass[6];
+    double time;
+    double redshift;
+    int flag_sfr;
+    int flag_feedback;
+    unsigned int npartTotal[6];
+    int flag_cooling;
+    int num_files;
+    double BoxSize;
+    double Omega0;
+    double OmegaLambda;
+    double HubbleParam;
+    int flag_stellarage;
+    int flag_metals;
+    unsigned int npartTotalHighWord[6];
+    int flag_entropy_instead_u;
+    int flag_doubleprecision;
+  };
+
+protected:
+  int num_files_, num_simultaneous_writers_; //!< number of output files / value of output/NumSimWriters
+  header_t header_;                          //!< header values, filled by constructor and write_particle_data()
+  real_t lunit_, vunit_;                     //!< values returned by position_unit() / velocity_unit()
+  bool blongids_;                            //!< value of output/UseLongids (default false)
+  std::string this_fname_;                   //!< file written by this task (rank suffix appended if several files)
+  double Tini_;                              //!< initial gas temperature stored as "suggested_gas_Tinit"
+  unsigned pmgrid_;                          //!< stored as "suggested_pmgrid" (twice setup/GridRes)
+  unsigned gridboost_;                       //!< stored as "suggested_gridboost" (always 1)
+  int doublePrec_;                           //!< 1 unless USE_SINGLEPRECISION is defined
+  int doBaryons_;                            //!< value of setup/DoBaryons
+  double softening_;                         //!< stored as "suggested_highressoft"
+
+public:
+  //! constructor: reads all required configuration values and creates the output file
+  /*!
+   *  \param cf  configuration file object, forwarded to the output_plugin base class
+   */
+  // NOTE(review): the interface name is still "GADGET-HDF5", the same label the Gadget HDF5
+  // plug-in passes -- confirm whether a distinct label is wanted for AREPO.
+  explicit arepo_output_plugin(config_file &cf)
+      : output_plugin(cf, "GADGET-HDF5")
+  {
+    num_files_ = 1;
+#ifdef USE_MPI
+    // use as many output files as we have MPI tasks
+    MPI_Comm_size(MPI_COMM_WORLD, &num_files_);
+#endif
+    real_t astart = 1.0 / (1.0 + cf_.get_value<double>("setup", "zstart"));
+    lunit_ = cf_.get_value<double>("setup", "BoxLength");
+    vunit_ = lunit_ / std::sqrt(astart);
+    blongids_ = cf_.get_value_safe<bool>("output", "UseLongids", false);
+    num_simultaneous_writers_ = cf_.get_value_safe<int>("output", "NumSimWriters", num_files_);
+
+    // per-type counters and masses start at zero; write_particle_data() fills them in
+    for (int i = 0; i < 6; ++i)
+    {
+      header_.npart[i] = 0;
+      header_.npartTotal[i] = 0;
+      header_.npartTotalHighWord[i] = 0;
+      header_.mass[i] = 0.0;
+    }
+
+    header_.time = astart;
+    header_.redshift = 1.0 / astart - 1.0;
+    header_.flag_sfr = 0;
+    header_.flag_feedback = 0;
+    header_.flag_cooling = 0;
+    header_.num_files = num_files_;
+    header_.BoxSize = lunit_;
+    header_.Omega0 = cf_.get_value<double>("cosmology", "Omega_m");
+    header_.OmegaLambda = cf_.get_value<double>("cosmology", "Omega_L");
+    header_.HubbleParam = cf_.get_value<double>("cosmology", "H0") / 100.0;
+    header_.flag_stellarage = 0;
+    header_.flag_metals = 0;
+    header_.flag_entropy_instead_u = 0;
+    header_.flag_doubleprecision = (typeid(write_real_t) == typeid(double)) ? true : false;
+
+    // initial gas temperature
+    double Tcmb0 = 2.726;
+    double Omegab = cf_.get_value<double>("cosmology", "Omega_b");
+    double h = cf_.get_value<double>("cosmology", "H0") / 100.0, h2 = h*h;
+    double adec = 1.0 / (160.0 * pow(Omegab * h2 / 0.022, 2.0 / 5.0));
+    Tini_ = astart < adec ? Tcmb0 / astart : Tcmb0 / astart / astart * adec;
+
+    // suggested PM res
+    pmgrid_ = 2*cf_.get_value<double>("setup", "GridRes");
+    gridboost_ = 1;
+    softening_ = cf_.get_value<double>("setup", "BoxLength")/pmgrid_/20;
+    doBaryons_ = cf_.get_value<bool>("setup", "DoBaryons");
+#if !defined(USE_SINGLEPRECISION)
+    doublePrec_ = 1;
+#else
+    doublePrec_ = 0;
+#endif
+
+    this_fname_ = fname_;
+#ifdef USE_MPI
+    int thisrank = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &thisrank);
+    if (num_files_ > 1)
+      this_fname_ += "." + std::to_string(thisrank);
+#endif
+
+    // remove a file of the same name (if any) before creating the new one
+    unlink(this_fname_.c_str());
+    HDFCreateFile(this_fname_);
+  }
+
+  // use destructor to write header post factum
+  ~arepo_output_plugin()
+  {
+    // Do not touch the file while an exception is propagating (same guard as the Gadget
+    // HDF5 plug-in): the particle data is most likely incomplete in that case.
+    if (!std::uncaught_exception())
+    {
+      HDFCreateGroup(this_fname_, "Header");
+      HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_ThisFile", from_6array<unsigned>(header_.npart));
+      HDFWriteGroupAttribute(this_fname_, "Header", "MassTable", from_6array<double>(header_.mass));
+      HDFWriteGroupAttribute(this_fname_, "Header", "Time", from_value<double>(header_.time));
+      HDFWriteGroupAttribute(this_fname_, "Header", "Redshift", from_value<double>(header_.redshift));
+      HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total", from_6array<unsigned>(header_.npartTotal));
+      HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total_HighWord", from_6array<unsigned>(header_.npartTotalHighWord));
+      HDFWriteGroupAttribute(this_fname_, "Header", "NumFilesPerSnapshot", from_value<int>(header_.num_files));
+      HDFWriteGroupAttribute(this_fname_, "Header", "BoxSize", from_value<double>(header_.BoxSize));
+      HDFWriteGroupAttribute(this_fname_, "Header", "Omega0", from_value<double>(header_.Omega0));
+      HDFWriteGroupAttribute(this_fname_, "Header", "OmegaLambda", from_value<double>(header_.OmegaLambda));
+      HDFWriteGroupAttribute(this_fname_, "Header", "HubbleParam", from_value<double>(header_.HubbleParam));
+      // the physics flags are written as literal zeros, not taken from header_
+      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Sfr", from_value<int>(0));
+      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Cooling", from_value<int>(0));
+      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_StellarAge", from_value<int>(0));
+      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Metals", from_value<int>(0));
+      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Feedback", from_value<int>(0));
+      // passed as a plain int (not wrapped by from_value) exactly as before
+      HDFWriteGroupAttribute(this_fname_, "Header", "Flag_DoublePrecision", (int)doublePrec_);
+      // HDFWriteGroupAttribute(this_fname_, "Header", "Music_levelmin", levelmin_);
+      // HDFWriteGroupAttribute(this_fname_, "Header", "Music_levelmax", levelmax_);
+      // HDFWriteGroupAttribute(this_fname_, "Header", "Music_levelcounts", levelcounts);
+      HDFWriteGroupAttribute(this_fname_, "Header", "haveBaryons", from_value<int>((int)doBaryons_));
+      HDFWriteGroupAttribute(this_fname_, "Header", "longIDs", from_value<int>((int)blongids_));
+      HDFWriteGroupAttribute(this_fname_, "Header", "suggested_pmgrid", from_value<int>(pmgrid_));
+      HDFWriteGroupAttribute(this_fname_, "Header", "suggested_gridboost", from_value<int>(gridboost_));
+      HDFWriteGroupAttribute(this_fname_, "Header", "suggested_highressoft", from_value<double>(softening_));
+      HDFWriteGroupAttribute(this_fname_, "Header", "suggested_gas_Tinit", from_value<double>(Tini_));
+
+      music::ilog << "Wrote AREPO HDF5 file(s) to " << this_fname_ << std::endl;
+    }
+  }
+
+  //! every species is written as particles
+  output_type write_species_as(const cosmo_species &) const { return output_type::particles; }
+
+  //! length unit: the value of setup/BoxLength
+  real_t position_unit() const { return lunit_; }
+
+  //! velocity unit: BoxLength / sqrt(a_start)
+  real_t velocity_unit() const { return vunit_; }
+
+  //! true if positions and velocities are written in double precision
+  bool has_64bit_reals() const
+  {
+    if (typeid(write_real_t) == typeid(double))
+      return true;
+    return false;
+  }
+
+  //! true if output/UseLongids was switched on
+  bool has_64bit_ids() const
+  {
+    if (blongids_)
+      return true;
+    return false;
+  }
+
+  //! map a species to the index N of its "PartType<N>" group; -1 if the species is not handled
+  int get_species_idx(const cosmo_species &s) const
+  {
+    switch (s)
+    {
+    case cosmo_species::dm:
+      return 1;
+    case cosmo_species::baryon:
+      return 0;
+    case cosmo_species::neutrino:
+      return 3;
+    }
+    return -1;
+  }
+
+  //! write positions, velocities and IDs of one species and update the header bookkeeping
+  /*!
+   *  \param pc             particle container providing the data arrays and particle counts
+   *  \param s              species the particles belong to (selects the PartType group)
+   *  \param Omega_species  density parameter of this species, used for the particle mass
+   */
+  void write_particle_data(const particle::container &pc, const cosmo_species &s, double Omega_species)
+  {
+    int sid = get_species_idx(s);
+
+    // sid indexes the six-element header arrays below, so it must be a valid type index
+    assert(sid != -1);
+
+    header_.npart[sid] = (pc.get_local_num_particles());
+    // the global particle count is split into a low and a high 32 bit word
+    header_.npartTotal[sid] = (uint32_t)(pc.get_global_num_particles());
+    header_.npartTotalHighWord[sid] = (uint32_t)((pc.get_global_num_particles()) >> 32);
+
+    double rhoc = 27.7519737; // in h^2 1e10 M_sol / Mpc^3
+    double boxmass = Omega_species * rhoc * std::pow(header_.BoxSize, 3);
+    header_.mass[sid] = boxmass / pc.get_global_num_particles();
+
+    HDFCreateGroup(this_fname_, std::string("PartType") + std::to_string(sid));
+
+    //... write positions and velocities.....
+    if (this->has_64bit_reals())
+    {
+      HDFWriteDatasetVector(this_fname_, std::string("PartType") + std::to_string(sid) + std::string("/Coordinates"), pc.positions64_);
+      HDFWriteDatasetVector(this_fname_, std::string("PartType") + std::to_string(sid) + std::string("/Velocities"), pc.velocities64_);
+    }
+    else
+    {
+      HDFWriteDatasetVector(this_fname_, std::string("PartType") + std::to_string(sid) + std::string("/Coordinates"), pc.positions32_);
+      HDFWriteDatasetVector(this_fname_, std::string("PartType") + std::to_string(sid) + std::string("/Velocities"), pc.velocities32_);
+    }
+
+    //... write ids.....
+    if (this->has_64bit_ids())
+      HDFWriteDataset(this_fname_, std::string("PartType") + std::to_string(sid) + std::string("/ParticleIDs"), pc.ids64_);
+    else
+      HDFWriteDataset(this_fname_, std::string("PartType") + std::to_string(sid) + std::string("/ParticleIDs"), pc.ids32_);
+  }
+};
+
+namespace
+{
+// register the plug-in under the format name "AREPO"; the precision follows USE_SINGLEPRECISION
+#if !defined(USE_SINGLEPRECISION)
+output_plugin_creator_concrete<arepo_output_plugin<double>> creator1("AREPO");
+#else
+output_plugin_creator_concrete<arepo_output_plugin<float>> creator1("AREPO");
+#endif
+} // namespace
+
+#endif
--- a/src/plugins/output_gadget2.cc
+++ b/src/plugins/output_gadget2.cc
@ -3,6 +3,7 @@

 constexpr int empty_fill_bytes{56};

+template <typename write_real_t>
 class gadget2_output_plugin : public output_plugin
 {
 public:
@ -33,32 +34,48 @@ protected:
 	int num_files_;
 	header this_header_;
 	real_t lunit_, vunit_;
+	bool blongids_;

 public:
 	//! constructor
-	explicit gadget2_output_plugin(ConfigFile &cf )
-	: output_plugin(cf, "GADGET-2")
+	explicit gadget2_output_plugin(config_file &cf)
+			: output_plugin(cf, "GADGET-2")
 	{
 		num_files_ = 1;
 #ifdef USE_MPI
 		// use as many output files as we have MPI tasks
 		MPI_Comm_size(MPI_COMM_WORLD, &num_files_);
 #endif
-		real_t astart = 1.0/(1.0+cf_.GetValue<double>("setup", "zstart"));
-		lunit_ = cf_.GetValue<double>("setup", "BoxLength");
+		real_t astart = 1.0 / (1.0 + cf_.get_value<double>("setup", "zstart"));
+		lunit_ = cf_.get_value<double>("setup", "BoxLength");
 		vunit_ = lunit_ / std::sqrt(astart);
+		blongids_ = cf_.get_value_safe<bool>("output", "UseLongids", false);
 	}

-    output_type write_species_as( const cosmo_species & ) const { return output_type::particles; }
+	output_type write_species_as(const cosmo_species &) const { return output_type::particles; }

 	real_t position_unit() const { return lunit_; }

 	real_t velocity_unit() const { return vunit_; }

-	void write_particle_data(const particle::container &pc, const cosmo_species &s )
+	bool has_64bit_reals() const
 	{
-			// fill the Gadget-2 header
-		memset(reinterpret_cast<void*>(&this_header_),0,sizeof(header));
+		if (typeid(write_real_t) == typeid(double))
+			return true;
+		return false;
+	}
+
+	bool has_64bit_ids() const
+	{
+		if (blongids_)
+			return true;
+		return false;
+	}
+
+	void write_particle_data(const particle::container &pc, const cosmo_species &s, double Omega_species)
+	{
+		// fill the Gadget-2 header
+		memset(reinterpret_cast<void *>(&this_header_), 0, sizeof(header));

 		for (int i = 0; i < 6; ++i)
 		{
@ -73,7 +90,7 @@ public:

 		/////
 		//... set time ......................................................
-		this_header_.redshift = cf_.GetValue<double>("setup", "zstart");
+		this_header_.redshift = cf_.get_value<double>("setup", "zstart");
 		this_header_.time = 1.0 / (1.0 + this_header_.redshift);

 		//... SF flags
@ -83,10 +100,10 @@ public:

 		//...
 		this_header_.num_files = num_files_; //1;
-		this_header_.BoxSize = cf_.GetValue<double>("setup", "BoxLength");
-		this_header_.Omega0 = cf_.GetValue<double>("cosmology", "Omega_m");
-		this_header_.OmegaLambda = cf_.GetValue<double>("cosmology", "Omega_L");
-		this_header_.HubbleParam = cf_.GetValue<double>("cosmology", "H0") / 100.0;
+		this_header_.BoxSize = cf_.get_value<double>("setup", "BoxLength");
+		this_header_.Omega0 = cf_.get_value<double>("cosmology", "Omega_m");
+		this_header_.OmegaLambda = cf_.get_value<double>("cosmology", "Omega_L");
+		this_header_.HubbleParam = cf_.get_value<double>("cosmology", "H0") / 100.0;

 		this_header_.flag_stellarage = 0;
 		this_header_.flag_metals = 0;
@ -100,50 +117,73 @@ public:

 		//... set masses
 		double rhoc = 27.7519737; // in h^2 1e10 M_sol / Mpc^3
-		double boxmass = this_header_.Omega0 * rhoc * std::pow(this_header_.BoxSize,3);
+		double boxmass = Omega_species * rhoc * std::pow(this_header_.BoxSize, 3);
 		this_header_.mass[1] = boxmass / pc.get_global_num_particles();

 		std::string fname = fname_;
 		int thisrank = 0;

 #ifdef USE_MPI
-		MPI_Comm_rank(MPI_COMM_WORLD,&thisrank);
-		if( num_files_ > 1 )
+		MPI_Comm_rank(MPI_COMM_WORLD, &thisrank);
+		if (num_files_ > 1)
 			fname += "." + std::to_string(thisrank);
 #endif
 		uint32_t blocksz;
 		std::ofstream ofs(fname.c_str(), std::ios::binary);

-		csoca::ilog << "Writer \'" << this->interface_name_ << "\' : Writing data for " << pc.get_global_num_particles() << " particles." << std::endl;
+		music::ilog << "Writer \'" << this->interface_name_ << "\' : Writing data for " << pc.get_global_num_particles() << " particles." << std::endl;

 		blocksz = sizeof(header);
-		ofs.write( reinterpret_cast<char*>(&blocksz), sizeof(uint32_t) );
-		ofs.write( reinterpret_cast<char*>(&this_header_), sizeof(header) );
-		ofs.write( reinterpret_cast<char*>(&blocksz), sizeof(uint32_t) );
+		ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+		ofs.write(reinterpret_cast<char *>(&this_header_), sizeof(header));
+		ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));

-		blocksz = 3 * sizeof(float) * pc.get_local_num_particles();
-		ofs.write( reinterpret_cast<char*>(&blocksz), sizeof(uint32_t) );
-		ofs.write( reinterpret_cast<const char*>(pc.get_pos_ptr()), blocksz );
-		ofs.write( reinterpret_cast<char*>(&blocksz), sizeof(uint32_t) );
+		// we write double precision
+		if (this->has_64bit_reals())
+		{
+			blocksz = 3 * sizeof(double) * pc.get_local_num_particles();
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+			ofs.write(reinterpret_cast<const char *>(pc.get_pos64_ptr()), blocksz);
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));

-		ofs.write( reinterpret_cast<char*>(&blocksz), sizeof(uint32_t) );
-		ofs.write( reinterpret_cast<const char*>(pc.get_vel_ptr()), blocksz );
-		ofs.write( reinterpret_cast<char*>(&blocksz), sizeof(uint32_t) );
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+			ofs.write(reinterpret_cast<const char *>(pc.get_vel64_ptr()), blocksz);
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+		}
+		else
+		{
+			blocksz = 3 * sizeof(float) * pc.get_local_num_particles();
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+			ofs.write(reinterpret_cast<const char *>(pc.get_pos32_ptr()), blocksz);
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));

-		blocksz = sizeof(float) * pc.get_local_num_particles();
-		ofs.write( reinterpret_cast<char*>(&blocksz), sizeof(uint32_t) );
-		ofs.write( reinterpret_cast<const char*>(pc.get_ids_ptr()), blocksz );
-		ofs.write( reinterpret_cast<char*>(&blocksz), sizeof(uint32_t) );
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+			ofs.write(reinterpret_cast<const char *>(pc.get_vel32_ptr()), blocksz);
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+		}

+		// we write long IDs
+		if (this->has_64bit_ids())
+		{
+			blocksz = sizeof(uint64_t) * pc.get_local_num_particles();
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+			ofs.write(reinterpret_cast<const char *>(pc.get_ids64_ptr()), blocksz);
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+		}
+		else
+		{
+			blocksz = sizeof(uint32_t) * pc.get_local_num_particles();
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+			ofs.write(reinterpret_cast<const char *>(pc.get_ids32_ptr()), blocksz);
+			ofs.write(reinterpret_cast<char *>(&blocksz), sizeof(uint32_t));
+		}
 	}
 };

-
 namespace
 {
-   output_plugin_creator_concrete<gadget2_output_plugin> creator1("gadget2"); 
-// output_plugin_creator_concrete<gadget2_output_plugin<float>> creator1("gadget2");
-// #ifndef SINGLE_PRECISION
-// output_plugin_creator_concrete<gadget2_output_plugin<double>> creator2("gadget2_double");
-// #endif
+output_plugin_creator_concrete<gadget2_output_plugin<float>> creator1("gadget2");
+#if !defined(USE_SINGLEPRECISION)
+output_plugin_creator_concrete<gadget2_output_plugin<double>> creator3("gadget2_double");
+#endif
 } // namespace
--- a/src/plugins/output_gadget_hdf5.cc
+++ b/src/plugins/output_gadget_hdf5.cc
@ -0,0 +1,210 @@
+
+#ifdef USE_HDF5
+#include <unistd.h> // for unlink
+#include <output_plugin.hh>
+#include "HDF_IO.hh"
+
+//! Build a vector from the six consecutive elements that \p a points to.
+/*!
+ *  \param a  pointer to at least six readable elements of type T
+ *  \return   vector containing a[0] ... a[5], order preserved
+ */
+template <typename T>
+std::vector<T> from_6array(const T *a)
+{
+  const T *const first = a;
+  const T *const last = a + 6;
+  return std::vector<T>(first, last);
+}
+
+//! Turn a scalar into a one-element vector.
+/*!
+ *  \param a  value to store
+ *  \return   vector of size one containing a copy of \p a
+ */
+template <typename T>
+std::vector<T> from_value(const T a)
+{
+  std::vector<T> single;
+  single.push_back(a);
+  return single;
+}
+
+//! Output plug-in that stores particle data in a Gadget-style HDF5 snapshot file.
+/*!
+ *  The constructor creates an empty HDF5 file; each call to write_particle_data()
+ *  adds one "PartType<N>" group and records the particle counts and mass of that
+ *  species; the destructor writes the "Header" group unless an exception is in flight.
+ *
+ *  \tparam write_real_t  floating point type; `double` selects the 64 bit position and
+ *                        velocity arrays, anything else the 32 bit ones
+ */
+template <typename write_real_t>
+class gadget_hdf5_output_plugin : public output_plugin
+{
+  //! values that end up as attributes of the "Header" group
+  struct header_t
+  {
+    unsigned npart[6];
+    double mass[6];
+    double time;
+    double redshift;
+    int flag_sfr;
+    int flag_feedback;
+    unsigned int npartTotal[6];
+    int flag_cooling;
+    int num_files;
+    double BoxSize;
+    double Omega0;
+    double OmegaLambda;
+    double HubbleParam;
+    int flag_stellarage;
+    int flag_metals;
+    unsigned int npartTotalHighWord[6];
+    int flag_entropy_instead_u;
+    int flag_doubleprecision;
+  };
+
+protected:
+  int num_files_, num_simultaneous_writers_; //!< number of output files / value of output/NumSimWriters
+  header_t header_;                          //!< header values accumulated until destruction
+  real_t lunit_, vunit_;                     //!< values returned by position_unit() / velocity_unit()
+  bool blongids_;                            //!< value of output/UseLongids (default false)
+  std::string this_fname_;                   //!< file written by this task
+
+public:
+  //! constructor: reads the configuration, initialises the header and creates the file
+  /*!
+   *  \param cf  configuration file object, forwarded to the output_plugin base class
+   */
+  explicit gadget_hdf5_output_plugin(config_file &cf)
+      : output_plugin(cf, "GADGET-HDF5")
+  {
+    num_files_ = 1;
+#ifdef USE_MPI
+    // use as many output files as we have MPI tasks
+    MPI_Comm_size(MPI_COMM_WORLD, &num_files_);
+#endif
+    // expansion factor corresponding to the starting redshift
+    real_t a_start = 1.0 / (1.0 + cf_.get_value<double>("setup", "zstart"));
+    lunit_ = cf_.get_value<double>("setup", "BoxLength");
+    vunit_ = lunit_ / std::sqrt(a_start);
+    blongids_ = cf_.get_value_safe<bool>("output", "UseLongids", false);
+    num_simultaneous_writers_ = cf_.get_value_safe<int>("output", "NumSimWriters", num_files_);
+
+    // nothing has been written yet: zero all per-type counters and masses
+    for (int ptype = 0; ptype < 6; ++ptype)
+    {
+      header_.npart[ptype] = 0;
+      header_.npartTotal[ptype] = 0;
+      header_.npartTotalHighWord[ptype] = 0;
+      header_.mass[ptype] = 0.0;
+    }
+
+    header_.time = a_start;
+    header_.redshift = 1.0 / a_start - 1.0;
+    header_.flag_sfr = 0;
+    header_.flag_feedback = 0;
+    header_.flag_cooling = 0;
+    header_.num_files = num_files_;
+    header_.BoxSize = lunit_;
+    header_.Omega0 = cf_.get_value<double>("cosmology", "Omega_m");
+    header_.OmegaLambda = cf_.get_value<double>("cosmology", "Omega_L");
+    header_.HubbleParam = cf_.get_value<double>("cosmology", "H0") / 100.0;
+    header_.flag_stellarage = 0;
+    header_.flag_metals = 0;
+    header_.flag_entropy_instead_u = 0;
+    header_.flag_doubleprecision = (typeid(write_real_t) == typeid(double));
+
+    this_fname_ = fname_;
+#ifdef USE_MPI
+    // with several files, each task appends ".<rank>" to the file name
+    int rank = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (num_files_ > 1)
+    {
+      this_fname_ += "." + std::to_string(rank);
+    }
+#endif
+
+    // remove a file of the same name (if any) before creating the new one
+    unlink(this_fname_.c_str());
+    HDFCreateFile(this_fname_);
+  }
+
+  // use destructor to write header post factum
+  ~gadget_hdf5_output_plugin()
+  {
+    // leave the file alone while an exception is propagating
+    if (std::uncaught_exception())
+      return;
+
+    HDFCreateGroup(this_fname_, "Header");
+    HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_ThisFile", from_6array<unsigned>(header_.npart));
+    HDFWriteGroupAttribute(this_fname_, "Header", "MassTable", from_6array<double>(header_.mass));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Time", from_value<double>(header_.time));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Redshift", from_value<double>(header_.redshift));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Sfr", from_value<int>(header_.flag_sfr));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Feedback", from_value<int>(header_.flag_feedback));
+    HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total", from_6array<unsigned>(header_.npartTotal));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Cooling", from_value<int>(header_.flag_cooling));
+    HDFWriteGroupAttribute(this_fname_, "Header", "NumFilesPerSnapshot", from_value<int>(header_.num_files));
+    HDFWriteGroupAttribute(this_fname_, "Header", "BoxSize", from_value<double>(header_.BoxSize));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Omega0", from_value<double>(header_.Omega0));
+    HDFWriteGroupAttribute(this_fname_, "Header", "OmegaLambda", from_value<double>(header_.OmegaLambda));
+    HDFWriteGroupAttribute(this_fname_, "Header", "HubbleParam", from_value<double>(header_.HubbleParam));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Flag_StellarAge", from_value<int>(header_.flag_stellarage));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Metals", from_value<int>(header_.flag_metals));
+    HDFWriteGroupAttribute(this_fname_, "Header", "NumPart_Total_HighWord", from_6array<unsigned>(header_.npartTotalHighWord));
+    HDFWriteGroupAttribute(this_fname_, "Header", "Flag_Entropy_ICs", from_value<int>(header_.flag_entropy_instead_u));
+
+    music::ilog << "Wrote Gadget-HDF5 file(s) to " << this_fname_ << std::endl;
+  }
+
+  //! every species is written as particles
+  output_type write_species_as(const cosmo_species &) const { return output_type::particles; }
+
+  //! length unit: the value of setup/BoxLength
+  real_t position_unit() const { return lunit_; }
+
+  //! velocity unit: BoxLength / sqrt(a_start)
+  real_t velocity_unit() const { return vunit_; }
+
+  //! true if positions and velocities are written in double precision
+  bool has_64bit_reals() const
+  {
+    return typeid(write_real_t) == typeid(double);
+  }
+
+  //! true if output/UseLongids was switched on
+  bool has_64bit_ids() const
+  {
+    return blongids_;
+  }
+
+  //! map a species to the index N of its "PartType<N>" group; -1 if the species is not handled
+  int get_species_idx(const cosmo_species &s) const
+  {
+    if (s == cosmo_species::dm)
+      return 1;
+    if (s == cosmo_species::baryon)
+      return 0;
+    if (s == cosmo_species::neutrino)
+      return 3;
+    return -1;
+  }
+
+  //! write positions, velocities and IDs of one species and update the header bookkeeping
+  /*!
+   *  \param pc             particle container providing the data arrays and particle counts
+   *  \param s              species the particles belong to (selects the PartType group)
+   *  \param Omega_species  density parameter of this species, used for the particle mass
+   */
+  void write_particle_data(const particle::container &pc, const cosmo_species &s, double Omega_species)
+  {
+    const int sid = get_species_idx(s);
+
+    // sid indexes the six-element header arrays below, so it must be a valid type index
+    assert(sid != -1);
+
+    header_.npart[sid] = (pc.get_local_num_particles());
+    // the global particle count is split into a low and a high 32 bit word
+    header_.npartTotal[sid] = (uint32_t)(pc.get_global_num_particles());
+    header_.npartTotalHighWord[sid] = (uint32_t)((pc.get_global_num_particles()) >> 32);
+
+    const double rhoc = 27.7519737; // in h^2 1e10 M_sol / Mpc^3
+    const double boxmass = Omega_species * rhoc * std::pow(header_.BoxSize, 3);
+    header_.mass[sid] = boxmass / pc.get_global_num_particles();
+
+    // all datasets of this species live below "PartType<sid>"
+    const std::string group = std::string("PartType") + std::to_string(sid);
+    HDFCreateGroup(this_fname_, group);
+
+    //... write positions and velocities.....
+    if (this->has_64bit_reals())
+    {
+      HDFWriteDatasetVector(this_fname_, group + std::string("/Coordinates"), pc.positions64_);
+      HDFWriteDatasetVector(this_fname_, group + std::string("/Velocities"), pc.velocities64_);
+    }
+    else
+    {
+      HDFWriteDatasetVector(this_fname_, group + std::string("/Coordinates"), pc.positions32_);
+      HDFWriteDatasetVector(this_fname_, group + std::string("/Velocities"), pc.velocities32_);
+    }
+
+    //... write ids.....
+    if (this->has_64bit_ids())
+    {
+      HDFWriteDataset(this_fname_, group + std::string("/ParticleIDs"), pc.ids64_);
+    }
+    else
+    {
+      HDFWriteDataset(this_fname_, group + std::string("/ParticleIDs"), pc.ids32_);
+    }
+  }
+};
+
+namespace
+{
+// Static creator objects that register this plug-in under its format names:
+// "gadget_hdf5" always uses the float instantiation; "gadget_hdf5_double" uses the
+// double instantiation and is only registered when USE_SINGLEPRECISION is not defined.
+output_plugin_creator_concrete<gadget_hdf5_output_plugin<float>> creator1("gadget_hdf5");
+#if !defined(USE_SINGLEPRECISION)
+output_plugin_creator_concrete<gadget_hdf5_output_plugin<double>> creator3("gadget_hdf5_double");
+#endif
+} // namespace
+
+#endif
--- a/src/plugins/output_generic.cc
+++ b/src/plugins/output_generic.cc
@ -21,13 +21,13 @@ protected:
 	bool out_eulerian_;
 public:
 	//! constructor
-	explicit generic_output_plugin(ConfigFile &cf )
+	explicit generic_output_plugin(config_file &cf )
 	: output_plugin(cf, "Generic HDF5")
 	{
-		real_t astart   = 1.0/(1.0+cf_.GetValue<double>("setup", "zstart"));
-		real_t boxsize  = cf_.GetValue<double>("setup", "BoxLength");
+		real_t astart   = 1.0/(1.0+cf_.get_value<double>("setup", "zstart"));
+		real_t boxsize  = cf_.get_value<double>("setup", "BoxLength");

-		out_eulerian_   = cf_.GetValueSafe<bool>("output", "generic_out_eulerian",false);
+		out_eulerian_   = cf_.get_value_safe<bool>("output", "generic_out_eulerian",false);

 		if( CONFIG::MPI_task_rank == 0 )
 		{
@ -50,6 +50,10 @@ public:
 		return output_type::field_lagrangian;
 	}

+	bool has_64bit_reals() const{ return true; }
+
+	bool has_64bit_ids() const{ return true; }
+
 	real_t position_unit() const { return 1.0; }
 	
 	real_t velocity_unit() const { return 1.0; }
@ -95,7 +99,7 @@ void generic_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos
 {
 	std::string field_name = this->get_field_name( s, c );
 	g.Write_to_HDF5(fname_, field_name);
-	csoca::ilog << interface_name_ << " : Wrote field \'" << field_name << "\' to file \'" << fname_ << "\'" << std::endl;
+	music::ilog << interface_name_ << " : Wrote field \'" << field_name << "\' to file \'" << fname_ << "\'" << std::endl;
 }

 namespace
--- a/src/plugins/output_grafic2.cc
+++ b/src/plugins/output_grafic2.cc
@ -40,31 +40,31 @@ protected:

 public:
    //! constructor
-    explicit grafic2_output_plugin(ConfigFile &cf)
+    explicit grafic2_output_plugin(config_file &cf)
        : output_plugin(cf, "GRAFIC2/RAMSES")
    {
        lunit_ = 1.0;
        vunit_ = 1.0;

        double
-            boxlength = cf_.GetValue<double>("setup", "BoxLength"),
-            H0 = cf_.GetValue<double>("cosmology", "H0"),
-            zstart = cf_.GetValue<double>("setup", "zstart"),
+            boxlength = cf_.get_value<double>("setup", "BoxLength"),
+            H0 = cf_.get_value<double>("cosmology", "H0"),
+            zstart = cf_.get_value<double>("setup", "zstart"),
            astart = 1.0 / (1.0 + zstart),
-            omegam = cf_.GetValue<double>("cosmology", "Omega_m"),
-            omegaL = cf_.GetValue<double>("cosmology", "Omega_L");
-        uint32_t ngrid = cf_.GetValue<int>("setup", "GridRes");
+            omegam = cf_.get_value<double>("cosmology", "Omega_m"),
+            omegaL = cf_.get_value<double>("cosmology", "Omega_L");
+        uint32_t ngrid = cf_.get_value<int>("setup", "GridRes");

-        bUseSPT_ = cf_.GetValueSafe<bool>("output", "grafic_use_SPT", false);
+        bUseSPT_ = cf_.get_value_safe<bool>("output", "grafic_use_SPT", false);
        levelmin_ = uint32_t(std::log2(double(ngrid)) + 1e-6);

        if (std::abs(std::pow(2.0, levelmin_) - double(ngrid)) > 1e-4)
        {
-            csoca::elog << interface_name_ << " plugin requires setup/GridRes to be power of 2!" << std::endl;
+            music::elog << interface_name_ << " plugin requires setup/GridRes to be power of 2!" << std::endl;
            abort();
        }

-        bhavebaryons_ = cf_.GetValueSafe<bool>("setup", "baryons", false);
+        bhavebaryons_ = cf_.get_value_safe<bool>("setup", "baryons", false);

        header_.n1 = ngrid;
        header_.n2 = ngrid;
@ -89,7 +89,7 @@ public:
        mkdir(dirname_.c_str(), 0777);

        // write RAMSES namelist file? if so only with one task
-        if (cf_.GetValueSafe<bool>("output", "ramses_nml", true) && CONFIG::MPI_task_rank==0 )
+        if (cf_.get_value_safe<bool>("output", "ramses_nml", true) && CONFIG::MPI_task_rank==0 )
        {
            write_ramses_namelist();
        }
@ -102,6 +102,10 @@ public:
        return output_type::field_lagrangian;
    }

+    bool has_64bit_reals() const{ return false; }
+
+	bool has_64bit_ids() const{ return false; }
+
    real_t position_unit() const { return lunit_; }

    real_t velocity_unit() const { return vunit_; }
@ -192,7 +196,7 @@ void grafic2_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos
            }

            // check field size against buffer size...
-            uint32_t ngrid = cf_.GetValue<int>("setup", "GridRes");
+            uint32_t ngrid = cf_.get_value<int>("setup", "GridRes");
            assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid);
            assert( g.size(1) == ngrid && g.size(2) == ngrid);
            // write actual field slice by slice
@ -219,7 +223,7 @@ void grafic2_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos

    } // end loop over write_rank

-    csoca::ilog << interface_name_ << " : Wrote field to file \'" << file_name << "\'" << std::endl;
+    music::ilog << interface_name_ << " : Wrote field to file \'" << file_name << "\'" << std::endl;
 }

 void grafic2_output_plugin::write_ramses_namelist(void) const
@ -275,7 +279,7 @@ void grafic2_output_plugin::write_ramses_namelist(void) const
         << "m_refine=" << 1 + naddref << "*8.,\n"
         << "/\n";

-    csoca::ilog << interface_name_ << " wrote partial RAMSES namelist file \'" << fname_ << "\'" << std::endl;
+    music::ilog << interface_name_ << " wrote partial RAMSES namelist file \'" << fname_ << "\'" << std::endl;
 }

 namespace
--- a/src/plugins/random_music.cc
+++ b/src/plugins/random_music.cc
@ -34,29 +34,29 @@ protected:
  //void store_rnd(int ilevel, rng *prng);

 public:
-  explicit RNG_music(ConfigFile &cf) : RNG_plugin(cf), initialized_(false) {}
+  explicit RNG_music(config_file &cf) : RNG_plugin(cf), initialized_(false) {}

  ~RNG_music() {}

  bool isMultiscale() const { return true; }

-  void Fill_Grid( Grid_FFT<real_t>& g ) const { }
+  void Fill_Grid( Grid_FFT<real_t>& g ) {} //const { }

  void initialize_for_grid_structure()//const refinement_hierarchy &refh)
  {
    //prefh_ = &refh;
-    levelmin_ = pcf_->GetValue<unsigned>("setup", "levelmin");
-    levelmax_ = pcf_->GetValue<unsigned>("setup", "levelmax");
+    levelmin_ = pcf_->get_value<unsigned>("setup", "levelmin");
+    levelmax_ = pcf_->get_value<unsigned>("setup", "levelmax");

-    ran_cube_size_ = pcf_->GetValueSafe<unsigned>("random", "cubesize", DEF_RAN_CUBE_SIZE);
-    disk_cached_ = pcf_->GetValueSafe<bool>("random", "disk_cached", true);
-    restart_ = pcf_->GetValueSafe<bool>("random", "restart", false);
+    ran_cube_size_ = pcf_->get_value_safe<unsigned>("random", "cubesize", DEF_RAN_CUBE_SIZE);
+    disk_cached_ = pcf_->get_value_safe<bool>("random", "disk_cached", true);
+    restart_ = pcf_->get_value_safe<bool>("random", "restart", false);

    mem_cache_.assign(levelmax_ - levelmin_ + 1, (std::vector<real_t> *)NULL);

    if (restart_ && !disk_cached_)
    {
-      csoca::elog.Print("Cannot restart from mem cached random numbers.");
+      music::elog.Print("Cannot restart from mem cached random numbers.");
      throw std::runtime_error("Cannot restart from mem cached random numbers.");
    }

@ -93,8 +93,8 @@ void RNG_music::parse_random_parameters(void)
    std::string tempstr;
    bool noseed = false;
    sprintf(seedstr, "seed[%d]", i);
-    if (pcf_->ContainsKey("random", seedstr))
-      tempstr = pcf_->GetValue<std::string>("random", seedstr);
+    if (pcf_->contains_key("random", seedstr))
+      tempstr = pcf_->get_value<std::string>("random", seedstr);
    else
    {
      // "-2" means that no seed entry was found for that level
@ -105,7 +105,7 @@ void RNG_music::parse_random_parameters(void)
    if (is_number(tempstr))
    {
      long ltemp;
-      pcf_->Convert(tempstr, ltemp);
+      pcf_->convert(tempstr, ltemp);
      rngfnames_.push_back("");
      if (noseed) // ltemp < 0 )
        //... generate some dummy seed which only depends on the level, negative so we know it's not
@ -116,7 +116,7 @@ void RNG_music::parse_random_parameters(void)
      {
        if (ltemp <= 0)
        {
-          csoca::elog.Print("Specified seed [random]/%s needs to be a number >0!", seedstr);
+          music::elog.Print("Specified seed [random]/%s needs to be a number >0!", seedstr);
          throw std::runtime_error("Seed values need to be >0");
        }
        rngseeds_.push_back(ltemp);
@ -126,7 +126,7 @@ void RNG_music::parse_random_parameters(void)
    {
      rngfnames_.push_back(tempstr);
      rngseeds_.push_back(-1);
-      csoca::ilog.Print("Random numbers for level %3d will be read from file.", i);
+      music::ilog.Print("Random numbers for level %3d will be read from file.", i);
    }
  }

@ -141,7 +141,7 @@ void RNG_music::parse_random_parameters(void)

 void RNG_music::compute_random_numbers(void)
 {
-  bool rndsign = pcf_->GetValueSafe<bool>("random", "grafic_sign", false);
+  bool rndsign = pcf_->get_value_safe<bool>("random", "grafic_sign", false);

  std::vector<rng *> randc(std::max(levelmax_, levelmin_seed_) + 1, (rng *)NULL);

@ -160,7 +160,7 @@ void RNG_music::compute_random_numbers(void)
      //#warning add possibility to read noise from file also here!

      if (rngfnames_[i].size() > 0)
-        csoca::ilog.Print("Warning: Cannot use filenames for higher levels currently! Ignoring!");
+        music::ilog.Print("Warning: Cannot use filenames for higher levels currently! Ignoring!");

      randc[i] = new rng(*randc[i - 1], ran_cube_size_, rngseeds_[i], true);
      delete randc[i - 1];
@ -180,7 +180,7 @@ void RNG_music::compute_random_numbers(void)
    for (int ilevel = levelmin_seed_ - 1; ilevel >= (int)levelmin_; --ilevel)
    {
      if (rngseeds_[ilevel - levelmin_] > 0)
-        csoca::ilog.Print("Warning: random seed for level %d will be ignored.\n"
+        music::ilog.Print("Warning: random seed for level %d will be ignored.\n"
                "            consistency requires that it is obtained by restriction from level %d",
                ilevel, levelmin_seed_);

@ -227,11 +227,11 @@ void RNG_music::compute_random_numbers(void)
  // {
  //   int lx[3], x0[3];
  //   int shift[3], levelmin_poisson;
-  //   shift[0] = pcf_->GetValue<int>("setup", "shift_x");
-  //   shift[1] = pcf_->GetValue<int>("setup", "shift_y");
-  //   shift[2] = pcf_->GetValue<int>("setup", "shift_z");
+  //   shift[0] = pcf_->get_value<int>("setup", "shift_x");
+  //   shift[1] = pcf_->get_value<int>("setup", "shift_y");
+  //   shift[2] = pcf_->get_value<int>("setup", "shift_z");

-  //   levelmin_poisson = pcf_->GetValue<unsigned>("setup", "levelmin");
+  //   levelmin_poisson = pcf_->get_value<unsigned>("setup", "levelmin");

  //   int lfac = 1 << (ilevel - levelmin_poisson);

--- a/src/plugins/random_music_wnoise_generator.cc
+++ b/src/plugins/random_music_wnoise_generator.cc
@ -11,7 +11,7 @@ template <typename T>
 music_wnoise_generator<T>::music_wnoise_generator(unsigned res, unsigned cubesize, long baseseed, int *x0, int *lx)
    : res_(res), cubesize_(cubesize), ncubes_(1), baseseed_(baseseed)
 {
-  csoca::ilog.Print("Generating random numbers (1) with seed %ld", baseseed);
+  music::ilog.Print("Generating random numbers (1) with seed %ld", baseseed);

  initialize();
  fill_subvolume(x0, lx);
@ -21,7 +21,7 @@ template <typename T>
 music_wnoise_generator<T>::music_wnoise_generator(unsigned res, unsigned cubesize, long baseseed, bool zeromean)
    : res_(res), cubesize_(cubesize), ncubes_(1), baseseed_(baseseed)
 {
-  csoca::ilog.Print("Generating random numbers (2) with seed %ld", baseseed);
+  music::ilog.Print("Generating random numbers (2) with seed %ld", baseseed);

  double mean = 0.0;
  size_t res_l = res;
@ -31,7 +31,7 @@ music_wnoise_generator<T>::music_wnoise_generator(unsigned res, unsigned cubesiz
    cubesize_ = res_;

  if (!musicnoise)
-    csoca::elog.Print("This currently breaks compatibility. Need to disable by hand! Make sure to not check into repo");
+    music::elog.Print("This currently breaks compatibility. Need to disable by hand! Make sure to not check into repo");

  initialize();

@ -90,7 +90,7 @@ music_wnoise_generator<T>::music_wnoise_generator(unsigned res, std::string rand
  std::ifstream ifs(randfname.c_str(), std::ios::binary);
  if (!ifs)
  {
-    csoca::elog.Print("Could not open random number file \'%s\'!", randfname.c_str());
+    music::elog.Print("Could not open random number file \'%s\'!", randfname.c_str());
    throw std::runtime_error(std::string("Could not open random number file \'") + randfname + std::string("\'!"));
  }

@ -186,7 +186,7 @@ music_wnoise_generator<T>::music_wnoise_generator(unsigned res, std::string rand
  std::vector<float> in_float;
  std::vector<double> in_double;

-  csoca::ilog.Print("Random number file \'%s\'\n   contains %ld numbers. Reading...", randfname.c_str(), nx * ny * nz);
+  music::ilog.Print("Random number file \'%s\'\n   contains %ld numbers. Reading...", randfname.c_str(), nx * ny * nz);

  long double sum = 0.0, sum2 = 0.0;
  size_t count = 0;
@ -285,7 +285,7 @@ music_wnoise_generator<T>::music_wnoise_generator(unsigned res, std::string rand
  mean = sum / count;
  var = sum2 / count - mean * mean;

-  csoca::ilog.Print("Random numbers in file have \n     mean = %f and var = %f", mean, var);
+  music::ilog.Print("Random numbers in file have \n     mean = %f and var = %f", mean, var);
 }

 //... copy construct by averaging down
@ -298,7 +298,7 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
  long double sum = 0.0, sum2 = 0.0;
  size_t count = 0;

-  csoca::ilog.Print("Generating a coarse white noise field by k-space degrading");
+  music::ilog.Print("Generating a coarse white noise field by k-space degrading");
  //... initialize properties of container
  res_ = rc.res_ / 2;
  cubesize_ = res_;
@ -307,7 +307,7 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat

  if (sizeof(real_t) != sizeof(T))
  {
-    csoca::elog.Print("type mismatch with real_t in k-space averaging");
+    music::elog.Print("type mismatch with real_t in k-space averaging");
    throw std::runtime_error("type mismatch with real_t in k-space averaging");
  }

@ -405,7 +405,7 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
  rmean = sum / count;
  rvar = sum2 / count - rmean * rmean;

-  csoca::ilog.Print("Restricted random numbers have\n       mean = %f, var = %f", rmean, rvar);
+  music::ilog.Print("Restricted random numbers have\n       mean = %f, var = %f", rmean, rvar);
 }

 template <typename T>
@ -438,7 +438,7 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
  if (kspace)
  {

-    csoca::ilog.Print("Generating a constrained random number set with seed %ld\n    using coarse mode replacement...", baseseed);
+    music::ilog.Print("Generating a constrained random number set with seed %ld\n    using coarse mode replacement...", baseseed);
    assert(lx[0] % 2 == 0 && lx[1] % 2 == 0 && lx[2] % 2 == 0);
    size_t nx = lx[0], ny = lx[1], nz = lx[2],
           nxc = lx[0] / 2, nyc = lx[1] / 2, nzc = lx[2] / 2;
@ -573,7 +573,7 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
  }
  else
  {
-    csoca::ilog.Print("Generating a constrained random number set with seed %ld\n    using Hoffman-Ribak constraints...", baseseed);
+    music::ilog.Print("Generating a constrained random number set with seed %ld\n    using Hoffman-Ribak constraints...", baseseed);

    double fac = 1.0 / sqrt(8.0); //1./sqrt(8.0);

@ -613,7 +613,7 @@ void music_wnoise_generator<T>::register_cube(int i, int j, int k)
    rnums_.push_back(NULL);
    cubemap_[icube] = rnums_.size() - 1;
 #ifdef DEBUG
-    LOGDEBUG("registering new cube %d,%d,%d . ID = %ld, memloc = %ld", i, j, k, icube, cubemap_[icube]);
+    music::dlog.Print("registering new cube %d,%d,%d . ID = %ld, memloc = %ld", i, j, k, icube, cubemap_[icube]);
 #endif
  }
 }
@ -637,7 +637,7 @@ double music_wnoise_generator<T>::fill_cube(int i, int j, int k)

  if (it == cubemap_.end())
  {
-    csoca::elog.Print("Attempt to access non-registered random number cube!");
+    music::elog.Print("Attempt to access non-registered random number cube!");
    throw std::runtime_error("Attempt to access non-registered random number cube!");
  }

@ -674,7 +674,7 @@ void music_wnoise_generator<T>::subtract_from_cube(int i, int j, int k, double v

  if (it == cubemap_.end())
  {
-    csoca::elog.Print("Attempt to access unallocated RND cube %d,%d,%d in music_wnoise_generator::subtract_from_cube", i, j, k);
+    music::elog.Print("Attempt to access unallocated RND cube %d,%d,%d in music_wnoise_generator::subtract_from_cube", i, j, k);
    throw std::runtime_error("Attempt to access unallocated RND cube in music_wnoise_generator::subtract_from_cube");
  }

@ -700,7 +700,7 @@ void music_wnoise_generator<T>::free_cube(int i, int j, int k)

  if (it == cubemap_.end())
  {
-    csoca::elog.Print("Attempt to access unallocated RND cube %d,%d,%d in music_wnoise_generator::free_cube", i, j, k);
+    music::elog.Print("Attempt to access unallocated RND cube %d,%d,%d in music_wnoise_generator::free_cube", i, j, k);
    throw std::runtime_error("Attempt to access unallocated RND cube in music_wnoise_generator::free_cube");
  }

@ -724,7 +724,7 @@ void music_wnoise_generator<T>::initialize(void)
    cubesize_ = res_;
  }

-  csoca::ilog.Print("Generating random numbers w/ sample cube size of %d", cubesize_);
+  music::ilog.Print("Generating random numbers w/ sample cube size of %d", cubesize_);
 }

 template <typename T>
@ -741,8 +741,8 @@ double music_wnoise_generator<T>::fill_subvolume(int *i0, int *n)
  ncube[2] = (int)(n[2] / cubesize_) + 2;

 #ifdef DEBUG
-  LOGDEBUG("random numbers needed for region %d,%d,%d ..+ %d,%d,%d", i0[0], i0[1], i0[2], n[0], n[1], n[2]);
-  LOGDEBUG("filling cubes %d,%d,%d ..+ %d,%d,%d", i0cube[0], i0cube[1], i0cube[2], ncube[0], ncube[1], ncube[2]);
+  music::dlog.Print("random numbers needed for region %d,%d,%d ..+ %d,%d,%d", i0[0], i0[1], i0[2], n[0], n[1], n[2]);
+  music::dlog.Print("filling cubes %d,%d,%d ..+ %d,%d,%d", i0cube[0], i0cube[1], i0cube[2], ncube[0], ncube[1], ncube[2]);
 #endif

  double mean = 0.0;
@ -836,7 +836,7 @@ void music_wnoise_generator<T>::print_allocated(void)
    if (rnums_[i] != NULL)
      ncount++;

-  csoca::ilog.Print(" -> %d of %d random number cubes currently allocated", ncount, ntot);
+  music::ilog.Print(" -> %d of %d random number cubes currently allocated", ncount, ntot);
 }

 template class music_wnoise_generator<float>;
--- a/src/plugins/random_music_wnoise_generator.hh
+++ b/src/plugins/random_music_wnoise_generator.hh
@ -80,7 +80,7 @@ protected:

    if (it == cubemap_.end())
    {
-      csoca::elog.Print("attempting to copy data from non-existing RND cube %d,%d,%d", i, j, k);
+      music::elog.Print("attempting to copy data from non-existing RND cube %d,%d,%d", i, j, k);
      throw std::runtime_error("attempting to copy data from non-existing RND cube");
    }

@ -186,7 +186,7 @@ public:

    if (it == cubemap_.end())
    {
-      csoca::elog.Print("Attempting to copy data from non-existing RND cube %d,%d,%d @ %d,%d,%d", ic, jc, kc, i, j, k);
+      music::elog.Print("Attempting to copy data from non-existing RND cube %d,%d,%d @ %d,%d,%d", ic, jc, kc, i, j, k);
      throw std::runtime_error("attempting to copy data from non-existing RND cube");
    }

@ -194,7 +194,7 @@ public:

    if (rnums_[cubeidx] == NULL)
    {
-      csoca::elog.Print("Attempting to access data from non-allocated RND cube %d,%d,%d", ic, jc, kc);
+      music::elog.Print("Attempting to access data from non-allocated RND cube %d,%d,%d", ic, jc, kc);
      throw std::runtime_error("attempting to access data from non-allocated RND cube");
    }

--- a/src/plugins/random_ngenic.cc
+++ b/src/plugins/random_ngenic.cc
@ -18,11 +18,11 @@ private:
    std::vector<unsigned int> SeedTable_;

 public:
-    explicit RNG_ngenic(ConfigFile &cf) : RNG_plugin(cf)
+    explicit RNG_ngenic(config_file &cf) : RNG_plugin(cf)
    {

-        RandomSeed_ = cf.GetValue<long>("random", "seed");
-        nres_ = cf.GetValue<size_t>("setup", "GridRes");
+        RandomSeed_ = cf.get_value<long>("random", "seed");
+        nres_ = cf.get_value<size_t>("setup", "GridRes");
        pRandomGenerator_ = gsl_rng_alloc(gsl_rng_ranlxd1);
        gsl_rng_set(pRandomGenerator_, RandomSeed_);

@ -63,7 +63,7 @@ public:

    bool isMultiscale() const { return false; }

-    void Fill_Grid(Grid_FFT<real_t> &g) const
+    void Fill_Grid(Grid_FFT<real_t> &g) //const
    {
        g.zero();
        g.FourierTransformForward(false);
@ -82,7 +82,11 @@ public:
                for (size_t j = 0; j < nres_; ++j) 
                {                   
                    ptrdiff_t jj = (j>0)? nres_ - j : 0;
-                    gsl_rng_set( pRandomGenerator_, SeedTable_[i * nres_ + j]);
+                    if( g.is_distributed() )
+                        gsl_rng_set( pRandomGenerator_, SeedTable_[j * nres_ + i]);
+                    else
+                        gsl_rng_set( pRandomGenerator_, SeedTable_[i * nres_ + j]);
+                    
                    for (size_t k = 0; k < g.size(2); ++k) 
                    {
                        double phase = gsl_rng_uniform(pRandomGenerator_) * 2 * M_PI;
@ -101,15 +105,28 @@ public:
                        if (k > 0) {
                            if (i_in_range) g.kelem(ip,j,k) = zrand;
                        } else{ /* k=0 plane needs special treatment */
-                            if (i == 0) {
-                                if (j < nres_ / 2 && i_in_range)
-                                {
-                                    g.kelem(ip,j,k) = zrand;
-                                    g.kelem(ip,jj,k) = std::conj(zrand);
+                            if( g.is_distributed() ){
+                                if (j == 0) {
+                                    if (i < nres_ / 2 && i_in_range)
+                                    {
+                                        if(i_in_range) g.kelem(ip,jj,k) = zrand;
+                                        if(ii_in_range) g.kelem(iip,j,k) = std::conj(zrand);
+                                    }
+                                } else if (j < nres_ / 2) {
+                                    if(i_in_range) g.kelem(ip,j,k) = zrand;
+                                    if(ii_in_range) g.kelem(iip,jj,k) = std::conj(zrand);
+                                }
+                            }else{
+                                if (i == 0) {
+                                    if (j < nres_ / 2 && i_in_range)
+                                    {
+                                        g.kelem(ip,j,k) = zrand;
+                                        g.kelem(ip,jj,k) = std::conj(zrand);
+                                    }
+                                } else if (i < nres_ / 2) {
+                                    if(i_in_range) g.kelem(ip,j,k) = zrand;
+                                    if (ii_in_range) g.kelem(iip,jj,k) = std::conj(zrand);
                                }
-                            } else if (i < nres_ / 2) {
-                                if(i_in_range) g.kelem(ip,j,k) = zrand;
-                                if (ii_in_range) g.kelem(iip,jj,k) = std::conj(zrand);
                            }
                        }
                    }
--- a/src/plugins/random_panphasia.cc
+++ b/src/plugins/random_panphasia.cc
@ -0,0 +1,522 @@
+#if defined(USE_PANPHASIA)
+
+#include <general.hh>
+#include <random_plugin.hh>
+#include <config_file.hh>
+
+#include <vector>
+#include <cmath>
+#include <cstring>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <grid_fft.hh>
+
+// Array extents used to size the members of pan_state_ below.
+// NOTE(review): presumably these have to agree with the sizes used on the Fortran side — confirm.
+const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim;
+// Record of five integers (element type of pan_state_::poweroffset and type of pan_state_::superjump).
+typedef int rand_offset_[5];
+// State record of one random number stream (element type of pan_state_::current_state).
+typedef struct
+{
+  int state[133]; // Nstore = Nstate (=5) + Nbatch (=128)
+  int need_fill;
+  int pos;
+} rand_state_;
+
+/* pan_state_ struct -- corresponds to respective fortran module in panphasia_routines.f
+ * data structure that contains all panphasia state variables
+ * it needs to get passed between the fortran routines to enable
+ * thread-safe execution.
+ *
+ * RNG_panphasia allocates one instance per thread and hands a pointer to the
+ * calling thread's instance to every routine declared in the extern "C" block.
+ * NOTE(review): member order and sizes presumably have to mirror the Fortran
+ * definition exactly -- do not reorder without checking panphasia_routines.f.
+ */
+typedef struct
+{
+  int base_state[5], base_lev_start[5][maxdim + 1];
+  rand_offset_ poweroffset[maxpow + 1], superjump;
+  rand_state_ current_state[maxpow + 2];
+
+  // set by RNG_panphasia::Fill_Grid before the cell properties are evaluated
+  int layer_min, layer_max, indep_field;
+
+  long long xorigin_store[2][2][2], yorigin_store[2][2][2], zorigin_store[2][2][2];
+  int lev_common, layer_min_store, layer_max_store;
+  long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store,
+      iy_rel_store, iz_rel_store;
+  double exp_coeffs[8][8][maxdim + 2];
+  long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1];
+  int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2];
+
+  double cell_data[9][8];
+  int ixh_last, iyh_last, izh_last;
+  // init, init_cell_props and init_lecuyer_state are reset to 0 by
+  // RNG_panphasia::clear_panphasia_thread_states()
+  int init;
+
+  int init_cell_props;
+  int init_lecuyer_state;
+  long long p_xcursor[62], p_ycursor[62], p_zcursor[62];
+
+} pan_state_;
+
+// Externally defined PANPHASIA routines, declared with C linkage (the pan_state_
+// comment above names panphasia_routines.f as their origin). Every argument is
+// passed by pointer; all routines except parse_descriptor_ receive the per-thread
+// pan_state_ as their first argument.
+extern "C"
+{
+  // called once per thread in RNG_panphasia::Fill_Grid before any cell is evaluated
+  void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose);
+
+  // splits a descriptor string into its components; 'name' is an output character buffer
+  // (panphasia_descriptor passes a 100-character, blank-filled buffer)
+  void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1,
+                         int16_t *side2, int16_t *side3, int32_t *check_int, char *name);
+
+  void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop);
+
+  // used by RNG_panphasia::Fill_Grid with a 9-element cell_prop output array
+  void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min,
+                                      int *layer_max, int *indep_field, double *cell_prop);
+
+  void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel,
+                                  long long *iy_rel, long long *iz_rel, int *VERBOSE);
+}
+
+// Components of a PANPHASIA descriptor string, as returned by parse_descriptor_.
+// The numeric members are written by parse_descriptor_ (passed by pointer below);
+// 'name' receives the descriptor name with its trailing blank padding removed.
+struct panphasia_descriptor
+{
+  int16_t wn_level_base;
+  int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base;
+  int16_t i_base, i_base_y, i_base_z;
+  int32_t check_rand;
+  std::string name;
+
+  // Parses 'dstring' via parse_descriptor_ and stores the result in the members.
+  explicit panphasia_descriptor(std::string dstring)
+  {
+    const int name_len = 100;
+    // one extra byte so the buffer is NUL-terminated even if the parser fills
+    // all 100 characters with non-blank data
+    char tmp[name_len + 1];
+    std::memset(tmp, ' ', name_len);
+    tmp[name_len] = '\0';
+    parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, &i_yorigin_base, &i_zorigin_base, &i_base,
+                      &i_base_y, &i_base_z, &check_rand, tmp);
+    // cut the name at the first blank (the buffer is blank-padded)
+    for (int i = 0; i < name_len; i++)
+      if (tmp[i] == ' ')
+      {
+        tmp[i] = '\0';
+        break;
+      }
+    // after the cut the string cannot contain a blank, so no further stripping is needed
+    name = tmp;
+  }
+};
+
+// Greatest common divisor of a and b, computed with Euclid's algorithm
+// (iterative form). Returns a unchanged when b is 0.
+int gcd(int a, int b)
+{
+  while (b != 0)
+  {
+    const int remainder = a % b;
+    a = b;
+    b = remainder;
+  }
+  return a;
+}
+
+// Least common multiple of a and b. The result is non-negative; it is 0 when
+// either argument is 0 (including a == b == 0).
+int lcm(int a, int b)
+{
+  const int g = gcd(a, b);
+  if (g == 0)
+    return 0; // only happens for a == b == 0; avoids a division by zero
+  // divide before multiplying so the intermediate value does not overflow
+  // for inputs whose product exceeds the int range
+  return abs(a / g * b);
+}
+
+// Returns 1 for b <= 1; otherwise the largest power of two that is strictly
+// smaller than b (b = 2 -> 1, b = 4 -> 2, b = 5 -> 4).
+// NOTE(review): despite the "_lte" suffix an exact power of two is not returned
+// unchanged -- confirm which behaviour callers expect.
+int largest_power_two_lte(int b)
+{
+  if (b <= 1)
+    return 1;
+
+  int pow2 = 1;
+  for (; 2 * pow2 < b; pow2 = 2 * pow2)
+  {
+  }
+  return pow2;
+}
+
+class RNG_panphasia : public RNG_plugin
+{
+private:
+protected:
+  std::string descriptor_string_;
+  int num_threads_;
+  int levelmin_, levelmin_final_, levelmax_, ngrid_;
+  bool incongruent_fields_;
+  double inter_grid_phase_adjustment_;
+  // double translation_phase_;
+  pan_state_ *lstate;
+  int grid_p_, grid_m_;
+  double grid_rescale_fac_;
+  int coordinate_system_shift_[3];
+  int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
+
+  // Reset the three initialisation flags (init, init_cell_props,
+  // init_lecuyer_state) of every per-thread PANPHASIA state to 0.
+  void clear_panphasia_thread_states(void)
+  {
+    pan_state_ *const states_end = lstate + num_threads_;
+    for (pan_state_ *ps = lstate; ps < states_end; ++ps)
+    {
+      ps->init = 0;
+      ps->init_cell_props = 0;
+      ps->init_lecuyer_state = 0;
+    }
+  }
+
+  // Derive the grid bookkeeping members (ngrid_, grid_p_, grid_m_, lextra_,
+  // grid_rescale_fac_, coordinate_system_shift_, incongruent_fields_) from the
+  // config file and the parsed descriptor. Called from the constructor after
+  // pdescriptor_ has been created.
+  void initialize_for_grid_structure(void)
+  {
+    clear_panphasia_thread_states();
+    music::ilog.Print("PANPHASIA: running with %d threads", num_threads_);
+
+    // if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
+    ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
+
+    grid_p_ = pdescriptor_->i_base;
+    grid_m_ = largest_power_two_lte(grid_p_);
+
+    // number of doublings between i_base and ngrid_: log2(ngrid_/i_base), truncated to int
+    // on assignment; the 0.001 offset guards against rounding just below an integer
+    lextra_ = (log10((double)ngrid_ / (double)pdescriptor_->i_base) + 0.001) / log10(2.0);
+    int ratio = 1 << lextra_;
+    grid_rescale_fac_ = 1.0;
+
+    // the stored shift is the negated [setup]/shift_{x,y,z} value (default 0)
+    coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
+    coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
+    coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
+
+    incongruent_fields_ = false;
+    // GridRes is not exactly i_base times a power of two: switch to the next finer compatible size
+    if (ngrid_ != ratio * pdescriptor_->i_base)
+    {
+      incongruent_fields_ = true;
+      ngrid_ = 2 * ratio * pdescriptor_->i_base;
+      // NOTE(review): levelmin_ is read here, but no assignment to levelmin_ is visible
+      // anywhere in this class, so this uses an uninitialised value -- confirm the
+      // intended source of levelmin_ and initialise it before this point.
+      grid_rescale_fac_ = (double)ngrid_ / (1 << levelmin_);
+      music::ilog << "PANPHASIA: will use a higher resolution (using Fourier interpolation)" << std::endl;
+      music::ilog << "     (" << grid_m_ << " -> " << grid_p_ << ") * 2**ref to be compatible with PANPHASIA" << std::endl;
+    }
+  }
+
+  std::unique_ptr<panphasia_descriptor> pdescriptor_;
+
+public:
+  // Set up the PANPHASIA generator from the [random]/descriptor config entry:
+  // allocates one pan_state_ per thread, parses the descriptor, writes the
+  // gridding unit and the descriptor base size back into the config file, and
+  // finally runs initialize_for_grid_structure().
+  explicit RNG_panphasia(config_file &cf) : RNG_plugin(cf)
+  {
+    descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
+
+    // one independent state per thread; a single one when built without OpenMP
+#ifdef _OPENMP
+    num_threads_ = omp_get_max_threads();
+#else
+    num_threads_ = 1;
+#endif
+    lstate = new pan_state_[num_threads_];
+
+    // extract the properties encoded in the descriptor
+    pdescriptor_ = std::make_unique<panphasia_descriptor>(descriptor_string_);
+    music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base);
+
+    // the gridding unit handed to the grid construction is fixed to two
+    // (earlier variants derived it from i_base via lcm or a power of two)
+    std::stringstream gridding_unit;
+    gridding_unit << 2;
+    pcf_->insert_value("setup", "gridding_unit", gridding_unit.str());
+
+    // the descriptor's base size is published as [random]/base_unit
+    std::stringstream base_unit;
+    base_unit << pdescriptor_->i_base;
+    pcf_->insert_value("random", "base_unit", base_unit.str());
+
+    initialize_for_grid_structure();
+  }
+
+  ~RNG_panphasia() { delete[] lstate; }
+
+  bool isMultiscale() const { return true; }
+
+  // Fill g with the PANPHASIA field. Outline of what the code below does:
+  //  1. For every local cell, adv_panphasia_cell_properties_ is evaluated and
+  //     cell_prop[0], [4], [2], [1], [8] are stored in g0 (an alias of g) and in
+  //     the temporaries g1..g4.
+  //  2. All five grids are Fourier transformed and combined into g0 using
+  //     sinc/dsinc weights; modes flagged by is_nyquist_mode are set to zero.
+  //  3. g1..g4 are refilled with cell_prop[6], [3], [5], [7], transformed again,
+  //     and their weighted sum is added to g0 (non-Nyquist modes only).
+  // Throws std::runtime_error if [setup]/GridRes is not divisible by the
+  // descriptor's i_base.
+  void Fill_Grid(Grid_FFT<real_t> &g)
+  {
+    // sinc(x) = sin(x)/x and its derivative, with the x -> 0 limits (1 and 0) used for |x| <= 1e-16
+    auto sinc = [](real_t x) { return (std::abs(x) > 1e-16) ? std::sin(x) / x : 1.0; };
+    auto dsinc = [](real_t x) { return (std::abs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
+    const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
+
+    // make sure we're in the right space
+    Grid_FFT<real_t> &g0 = g;
+    g0.FourierTransformBackward(false);
+
+    // temporaries
+    Grid_FFT<real_t> g1(g.n_, g.length_);
+    Grid_FFT<real_t> g2(g.n_, g.length_);
+    Grid_FFT<real_t> g3(g.n_, g.length_);
+    Grid_FFT<real_t> g4(g.n_, g.length_);
+
+    clear_panphasia_thread_states();
+    music::ilog.Print("PANPHASIA: running with %d threads", num_threads_);
+
+    ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
+
+    grid_p_ = pdescriptor_->i_base;
+    // grid_m_ = largest_power_two_lte(grid_p_);
+    if (ngrid_ % grid_p_ != 0)
+    {
+      music::elog << "Grid resolution " << ngrid_ << " is not divisible by PANPHASIA descriptor length " << grid_p_ << std::endl;
+      throw std::runtime_error("Chosen [setup] / GridRes is not compatible with PANPHASIA descriptor length!");
+    }
+
+    double t1 = get_wtime();
+    // double tp = t1;
+
+    // first pass: every thread initialises its own lstate[mythread] and then
+    // evaluates the cell properties for its share of the i-loop
+#pragma omp parallel
+    {
+#ifdef _OPENMP
+      const int mythread = omp_get_thread_num();
+#else
+      const int mythread = 0;
+#endif
+
+      //int odd_x, odd_y, odd_z;
+      //int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level
+
+      // only thread 0 passes a non-zero verbosity flag to the PANPHASIA routines
+      int verbosity = (mythread == 0);
+      // zero-filled local copy of the descriptor string
+      // NOTE(review): assumes the descriptor is shorter than 100 characters — confirm
+      char descriptor[100];
+      std::memset(descriptor, 0, 100);
+      std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
+
+      start_panphasia_(&lstate[mythread], descriptor, &ngrid_, &verbosity);
+
+      {
+        panphasia_descriptor d(descriptor_string_);
+
+        // number of doublings between the descriptor base size and ngrid_
+        int lextra = (log10((double)ngrid_ / (double)d.i_base) + 0.001) / log10(2.0);
+        int level_p = d.wn_level_base + lextra;
+        int ratio = 1 << lextra;
+
+        lstate[mythread].layer_min = 0;
+        lstate[mythread].layer_max = level_p;
+        lstate[mythread].indep_field = 1;
+
+        // ngrid_ has to be exactly i_base times a power of two
+        assert(ngrid_ == ratio * d.i_base);
+
+        long long ix_rel[3];
+        ix_rel[0] = 0; //ileft_corner_p[0];
+        ix_rel[1] = 0; //ileft_corner_p[1];
+        ix_rel[2] = 0; //ileft_corner_p[2];
+
+        set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
+                                   &verbosity);
+      }
+
+      // thread 0 restarts the timer here, so the setup above is not part of the measured time
+      if (verbosity)
+        t1 = get_wtime();
+
+      std::array<double, 9> cell_prop;
+      pan_state_ *ps = &lstate[mythread];
+
+      // cells are visited in 2x2x2 blocks (outer loops step by 2)
+      // NOTE(review): this accesses i+1, j+1, k+1 and therefore assumes even local grid sizes — confirm
+#pragma omp for //nowait
+      for (size_t i = 0; i < g.size(0); i += 2)
+      {
+        for (size_t j = 0; j < g.size(1); j += 2)
+        {
+          for (size_t k = 0; k < g.size(2); k += 2)
+          {
+
+            // ARJ - added inner set of loops to speed up evaluation of Panphasia
+
+            for (int ix = 0; ix < 2; ++ix)
+            {
+              for (int iy = 0; iy < 2; ++iy)
+              {
+                for (int iz = 0; iz < 2; ++iz)
+                {
+                  int ilocal = i + ix;
+                  int jlocal = j + iy;
+                  int klocal = k + iz;
+
+                  // only the first index is offset by g.local_0_start_
+                  // (presumably the start of this task's slab along dimension 0 — confirm)
+                  int iglobal = ilocal + g.local_0_start_;
+                  int jglobal = jlocal;
+                  int kglobal = klocal;
+
+                  adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
+                                                 &ps->layer_max, &ps->indep_field, &cell_prop[0]);
+
+                  g0.relem(ilocal, jlocal, klocal) = cell_prop[0];
+                  g1.relem(ilocal, jlocal, klocal) = cell_prop[4];
+                  g2.relem(ilocal, jlocal, klocal) = cell_prop[2];
+                  g3.relem(ilocal, jlocal, klocal) = cell_prop[1];
+                  g4.relem(ilocal, jlocal, klocal) = cell_prop[8];
+                }
+              }
+            }
+          }
+        }
+      }
+    } // end omp parallel region
+
+    g0.FourierTransformForward();
+    g1.FourierTransformForward();
+    g2.FourierTransformForward();
+    g3.FourierTransformForward();
+    g4.FourierTransformForward();
+
+    // combine the five transformed grids into g0, mode by mode
+#pragma omp parallel for
+    for (size_t i = 0; i < g0.size(0); i++)
+    {
+      for (size_t j = 0; j < g0.size(1); j++)
+      {
+        for (size_t k = 0; k < g0.size(2); k++)
+        {
+          if (!g0.is_nyquist_mode(i, j, k))
+          {
+            auto kvec = g0.get_k<real_t>(i, j, k);
+
+            // arguments of the weights: (pi/2) * k / k_Nyquist per dimension
+            auto argx = 0.5 * M_PI * kvec[0] / g.kny_[0];
+            auto argy = 0.5 * M_PI * kvec[1] / g.kny_[1];
+            auto argz = 0.5 * M_PI * kvec[2] / g.kny_[2];
+
+            // f* are real sinc weights, g* are purely imaginary dsinc weights
+            auto fx = sinc(argx);
+            auto gx = ccomplex_t(0.0, dsinc(argx));
+            auto fy = sinc(argy);
+            auto gy = ccomplex_t(0.0, dsinc(argy));
+            auto fz = sinc(argz);
+            auto gz = ccomplex_t(0.0, dsinc(argz));
+
+            auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz);
+            // weight applied to the g4 contribution (y4) below
+            auto magnitude = std::sqrt(1.0 - std::abs(temp * temp));
+
+            auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
+
+            g0.kelem(i, j, k) = y0 * fx * fy * fz 
+                              + sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz) 
+                              + y4 * magnitude;
+          }
+          else
+          {
+            // modes flagged as Nyquist modes are zeroed
+            g0.kelem(i, j, k) = 0.0;
+          }
+        }
+      }
+    }
+
+    // music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp);
+    // tp = get_wtime();
+
+    // the temporaries are transformed back before being refilled in the second pass
+    g1.FourierTransformBackward(false);
+    g2.FourierTransformBackward(false);
+    g3.FourierTransformBackward(false);
+    g4.FourierTransformBackward(false);
+
+    // second pass: same per-thread setup as in the first pass, then the remaining
+    // cell properties are stored in g1..g4
+#pragma omp parallel
+    {
+#ifdef _OPENMP
+      const int mythread = omp_get_thread_num();
+#else
+      const int mythread = 0;
+#endif
+
+      // int odd_x, odd_y, odd_z;
+      int verbosity = (mythread == 0);
+      char descriptor[100];
+      std::memset(descriptor, 0, 100);
+      std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
+
+      start_panphasia_(&lstate[mythread], descriptor, &ngrid_, &verbosity);
+
+      {
+        panphasia_descriptor d(descriptor_string_);
+
+        int lextra = (log10((double)ngrid_ / (double)d.i_base) + 0.001) / log10(2.0);
+        int level_p = d.wn_level_base + lextra;
+        int ratio = 1 << lextra;
+
+        lstate[mythread].layer_min = 0;
+        lstate[mythread].layer_max = level_p;
+        lstate[mythread].indep_field = 1;
+
+        assert(ngrid_ == ratio * d.i_base);
+
+        long long ix_rel[3];
+        ix_rel[0] = 0; //ileft_corner_p[0];
+        ix_rel[1] = 0; //ileft_corner_p[1];
+        ix_rel[2] = 0; //ileft_corner_p[2];
+
+        set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
+                                   &verbosity);
+      }
+
+      // NOTE(review): thread 0 overwrites t1 again, so the time printed at the end of this
+      // function only covers the work from this point on, not the first pass — confirm intent
+      if (verbosity)
+        t1 = get_wtime();
+
+      //***************************************************************
+      // Process Panphasia values: p110, p011, p101, p111
+      //****************************************************************
+      std::array<double,9> cell_prop;
+      pan_state_ *ps = &lstate[mythread];
+
+#pragma omp for //nowait
+      for (size_t i = 0; i < g1.size(0); i += 2)
+      {
+        for (size_t j = 0; j < g1.size(1); j += 2)
+        {
+          for (size_t k = 0; k < g1.size(2); k += 2)
+          {
+            // ARJ - added inner set of loops to speed up evaluation of Panphasia
+            for (int ix = 0; ix < 2; ++ix)
+            {
+              for (int iy = 0; iy < 2; ++iy)
+              {
+                for (int iz = 0; iz < 2; ++iz)
+                {
+                  int ilocal = i + ix;
+                  int jlocal = j + iy;
+                  int klocal = k + iz;
+
+                  int iglobal = ilocal + g.local_0_start_;
+                  int jglobal = jlocal;
+                  int kglobal = klocal;
+
+                  adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
+                                                 &ps->layer_max, &ps->indep_field, &cell_prop[0]);
+
+                  g1.relem(ilocal, jlocal, klocal) = cell_prop[6];
+                  g2.relem(ilocal, jlocal, klocal) = cell_prop[3];
+                  g3.relem(ilocal, jlocal, klocal) = cell_prop[5];
+                  g4.relem(ilocal, jlocal, klocal) = cell_prop[7];
+                }
+              }
+            }
+          }
+        }
+      }
+    } // end omp parallel region
+
+    // music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp);
+    // tp = get_wtime();
+
+    /////////////////////////////////////////////////////////////////////////
+    // transform and convolve with Legendres
+    g1.FourierTransformForward();
+    g2.FourierTransformForward();
+    g3.FourierTransformForward();
+    g4.FourierTransformForward();
+
+    // add the second-pass contributions to g0; Nyquist modes are left untouched
+    // (they were set to zero in the first combination step)
+    #pragma omp parallel for 
+    for (size_t i = 0; i < g1.size(0); i++)
+    {
+      for (size_t j = 0; j < g1.size(1); j++)
+      {
+        for (size_t k = 0; k < g1.size(2); k++)
+        {
+          if (!g1.is_nyquist_mode(i, j, k))
+          {
+            auto kvec = g1.get_k<real_t>(i, j, k);
+
+            auto argx = 0.5 * M_PI * kvec[0] / g.kny_[0];
+            auto argy = 0.5 * M_PI * kvec[1] / g.kny_[1];
+            auto argz = 0.5 * M_PI * kvec[2] / g.kny_[2];
+
+            auto fx = sinc(argx);
+            auto gx = ccomplex_t(0.0, dsinc(argx));
+            auto fy = sinc(argy);
+            auto gy = ccomplex_t(0.0, dsinc(argy));
+            auto fz = sinc(argz);
+            auto gz = ccomplex_t(0.0, dsinc(argz));
+
+            auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
+
+            g0.kelem(i, j, k) += 3.0 * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
+          }
+        }
+      }
+    }
+
+    // music::ilog.Print("\033[31mtiming [build panphasia field2]: %f s\033[0m", get_wtime() - tp);
+    // tp = get_wtime();
+    music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
+                          1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
+    music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g0.mean(), g0.std());
+  }
+};
+
+// File-local creator object for RNG_panphasia, constructed with the name "PANPHASIA".
+// NOTE(review): presumably constructing it registers the plugin under that name —
+// confirm in RNG_plugin_creator_concrete.
+namespace
+{
+  RNG_plugin_creator_concrete<RNG_panphasia> creator("PANPHASIA");
+}
+#endif // defined(USE_PANPHASIA)
--- a/src/plugins/transfer_CAMB_file.cc
+++ b/src/plugins/transfer_CAMB_file.cc
@ -0,0 +1,344 @@
+//  transfer_CAMB_file.cc - This file is part of MUSIC -
+//  a code to generate multi-scale initial conditions for cosmological simulations
+
+//  Copyright (C) 2019  Oliver Hahn
+
+#include <gsl/gsl_errno.h>
+#include <gsl/gsl_spline.h>
+
+#include <vector>
+
+#include "transfer_function_plugin.hh"
+
+// Lower bound applied in extrap_right() to baryon transfer values that are
+// extrapolated linearly, so that the returned value stays strictly positive.
+const double tiny = 1e-30;
+
+class transfer_CAMB_file_plugin : public TransferFunction_plugin
+{
+
+private:
+  std::string m_filename_Pk, m_filename_Tk;
+  std::vector<double> m_tab_k, m_tab_Tk_tot, m_tab_Tk_cdm, m_tab_Tk_baryon;
+  std::vector<double> m_tab_Tvk_tot, m_tab_Tvk_cdm, m_tab_Tvk_baryon;
+  gsl_interp_accel *acc_tot, *acc_cdm, *acc_baryon;
+  gsl_interp_accel *acc_vtot, *acc_vcdm, *acc_vbaryon;
+  gsl_spline *spline_tot, *spline_cdm, *spline_baryon;
+  gsl_spline *spline_vtot, *spline_vcdm, *spline_vbaryon;
+
+  double m_kmin, m_kmax, m_Omega_b, m_Omega_m, m_zstart;
+  unsigned m_nlines;
+
+  bool m_linbaryoninterp;
+
+  // Load the CAMB transfer function table from m_filename_Tk into the m_tab_*
+  // vectors.
+  //
+  // Every data row must hold 13 whitespace-separated columns in the order
+  // parsed below (k, cdm, baryon, photon, nu, mass_nu, total, no_nu, total_de,
+  // Weyl, v_cdm, v_b, v_b-v_cdm). Lines containing '#' and blank lines are
+  // skipped. The tables store log10(k) and log10(T); the two baryon columns
+  // are kept linear instead when any baryon entry is negative
+  // (m_linbaryoninterp). m_kmin / m_kmax receive the smallest / largest k read.
+  //
+  // With WITH_MPI only rank 0 reads the file; the tables and the scalar
+  // results are then broadcast to all other ranks.
+  //
+  // Throws std::runtime_error if the file cannot be opened, if a row cannot be
+  // parsed, or if fewer than three rows were read.
+  void read_table(void)
+  {
+
+    m_nlines = 0;
+    m_linbaryoninterp = false;
+
+    // initialise the k-range on every rank; under MPI the values found by
+    // rank 0 are broadcast at the end of this function
+    m_kmin = 1e30;
+    m_kmax = -1e30;
+
+#ifdef WITH_MPI
+    if (MPI::COMM_WORLD.Get_rank() == 0)
+    {
+#endif
+      music::ilog.Print("Reading tabulated transfer function data from file \n    \'%s\'", m_filename_Tk.c_str());
+
+      std::string line;
+      std::ifstream ifs(m_filename_Tk.c_str());
+
+      if (!ifs.good())
+        throw std::runtime_error("Could not find transfer function file \'" + m_filename_Tk + "\'");
+
+      m_tab_k.clear();
+      m_tab_Tk_tot.clear();
+      m_tab_Tk_cdm.clear();
+      m_tab_Tk_baryon.clear();
+      m_tab_Tvk_tot.clear();
+      m_tab_Tvk_cdm.clear();    //>[150609SH: add]
+      m_tab_Tvk_baryon.clear(); //>[150609SH: add]
+
+      // loop on the stream state returned by getline (not on eof()), so that a
+      // final row without a trailing newline is still read
+      while (std::getline(ifs, line))
+      {
+        // OH: ignore line if it has a comment:
+        if (line.find("#") != std::string::npos)
+          continue;
+
+        // ignore empty / whitespace-only lines (e.g. a blank line at the end)
+        if (line.find_first_not_of(" \t\r\n") == std::string::npos)
+          continue;
+
+        std::stringstream ss(line);
+
+        double k, Tkc, Tkb, Tktot, Tkvtot, Tkvc, Tkvb, dummy;
+
+        ss >> k;
+        ss >> Tkc;   // cdm
+        ss >> Tkb;   // baryon
+        ss >> dummy; // photon
+        ss >> dummy; // nu
+        ss >> dummy; // mass_nu
+        ss >> Tktot; // total
+        ss >> dummy; // no_nu
+        ss >> dummy; // total_de
+        ss >> dummy; // Weyl
+        ss >> Tkvc;  // v_cdm
+        ss >> Tkvb;  // v_b
+        ss >> dummy; // v_b-v_cdm
+
+        if (ss.bad() || ss.fail())
+        {
+          music::elog.Print("error reading the transfer function file (corrupt or not in expected format)!");
+          throw std::runtime_error("error reading transfer function file \'" +
+                                   m_filename_Tk + "\'");
+        }
+
+        // without baryons the total velocity transfer is taken from the total
+        // density column, otherwise it is the mass-weighted cdm/baryon mean
+        if (m_Omega_b < 1e-6)
+          Tkvtot = Tktot;
+        else
+          Tkvtot = ((m_Omega_m - m_Omega_b) * Tkvc + m_Omega_b * Tkvb) / m_Omega_m; //MvD
+
+        // a single negative baryon entry switches both baryon tables to
+        // linear (non-logarithmic) storage
+        m_linbaryoninterp |= Tkb < 0.0 || Tkvb < 0.0;
+
+        m_tab_k.push_back(log10(k));
+
+        m_tab_Tk_tot.push_back(Tktot);
+        m_tab_Tk_baryon.push_back(Tkb);
+        m_tab_Tk_cdm.push_back(Tkc);
+        m_tab_Tvk_tot.push_back(Tkvtot);
+        m_tab_Tvk_baryon.push_back(Tkvb);
+        m_tab_Tvk_cdm.push_back(Tkvc);
+
+        ++m_nlines;
+
+        if (k < m_kmin)
+          m_kmin = k;
+        if (k > m_kmax)
+          m_kmax = k;
+      }
+
+      ifs.close();
+
+      // the cubic splines fitted to these tables need at least three points,
+      // and get_kmin()/get_kmax()/extrap_right() index the second and the
+      // second-to-last row
+      if (m_tab_k.size() < 3)
+        throw std::runtime_error("Transfer function file \'" + m_filename_Tk +
+                                 "\' contains fewer than 3 data rows");
+
+      // convert to log10 only once the whole file has been seen, because
+      // m_linbaryoninterp may be triggered by any row
+      for (size_t i = 0; i < m_tab_k.size(); ++i)
+      {
+        m_tab_Tk_tot[i] = log10(m_tab_Tk_tot[i]);
+        m_tab_Tk_cdm[i] = log10(m_tab_Tk_cdm[i]);
+        m_tab_Tvk_cdm[i] = log10(m_tab_Tvk_cdm[i]);
+        m_tab_Tvk_tot[i] = log10(m_tab_Tvk_tot[i]);
+
+        if (!m_linbaryoninterp)
+        {
+          m_tab_Tk_baryon[i] = log10(m_tab_Tk_baryon[i]);
+          m_tab_Tvk_baryon[i] = log10(m_tab_Tvk_baryon[i]);
+        }
+      }
+
+      music::ilog.Print("Read CAMB transfer function table with %d rows", m_nlines);
+
+      if (m_linbaryoninterp)
+        music::ilog.Print("Using log-lin interpolation for baryons\n    (TF is not "
+                          "positive definite)");
+
+#ifdef WITH_MPI
+    }
+
+    unsigned n = m_tab_k.size();
+    MPI::COMM_WORLD.Bcast(&n, 1, MPI_UNSIGNED, 0);
+
+    if (MPI::COMM_WORLD.Get_rank() > 0)
+    {
+      m_tab_k.assign(n, 0);
+      m_tab_Tk_tot.assign(n, 0);
+      m_tab_Tk_cdm.assign(n, 0);
+      m_tab_Tk_baryon.assign(n, 0);
+      m_tab_Tvk_tot.assign(n, 0);
+      m_tab_Tvk_cdm.assign(n, 0);
+      m_tab_Tvk_baryon.assign(n, 0);
+    }
+
+    MPI::COMM_WORLD.Bcast(&m_tab_k[0], n, MPI_DOUBLE, 0);
+    MPI::COMM_WORLD.Bcast(&m_tab_Tk_tot[0], n, MPI_DOUBLE, 0);
+    MPI::COMM_WORLD.Bcast(&m_tab_Tk_cdm[0], n, MPI_DOUBLE, 0);
+    MPI::COMM_WORLD.Bcast(&m_tab_Tk_baryon[0], n, MPI_DOUBLE, 0);
+    MPI::COMM_WORLD.Bcast(&m_tab_Tvk_tot[0], n, MPI_DOUBLE, 0);
+    MPI::COMM_WORLD.Bcast(&m_tab_Tvk_cdm[0], n, MPI_DOUBLE, 0);
+    MPI::COMM_WORLD.Bcast(&m_tab_Tvk_baryon[0], n, MPI_DOUBLE, 0);
+
+    // the scalar results are needed by compute()/extrap_right() on every
+    // rank as well, not only the tables
+    int linbaryon_flag = m_linbaryoninterp ? 1 : 0;
+    MPI::COMM_WORLD.Bcast(&linbaryon_flag, 1, MPI_INT, 0);
+    m_linbaryoninterp = (linbaryon_flag != 0);
+
+    MPI::COMM_WORLD.Bcast(&m_nlines, 1, MPI_UNSIGNED, 0);
+    MPI::COMM_WORLD.Bcast(&m_kmin, 1, MPI_DOUBLE, 0);
+    MPI::COMM_WORLD.Bcast(&m_kmax, 1, MPI_DOUBLE, 0);
+
+#endif
+  }
+
+public:
+  // Build the plugin from the configuration: fetch the table file name and
+  // the cosmological parameters, read the table, then create one interpolation
+  // accelerator and one cubic spline (over the stored log10(k) abscissa) for
+  // each of the six tabulated transfer functions.
+  transfer_CAMB_file_plugin(config_file &cf)
+      : TransferFunction_plugin(cf)
+  {
+    m_filename_Tk = pcf_->get_value<std::string>("cosmology", "transfer_file");
+    m_Omega_m = cf.get_value<double>("cosmology", "Omega_m"); //MvD
+    m_Omega_b = cf.get_value<double>("cosmology", "Omega_b"); //MvD
+    m_zstart = cf.get_value<double>("setup", "zstart");       //MvD
+
+    read_table();
+
+    // one accelerator per spline
+    acc_tot = gsl_interp_accel_alloc();
+    acc_cdm = gsl_interp_accel_alloc();
+    acc_baryon = gsl_interp_accel_alloc();
+    acc_vtot = gsl_interp_accel_alloc();
+    acc_vcdm = gsl_interp_accel_alloc();
+    acc_vbaryon = gsl_interp_accel_alloc();
+
+    const size_t nrows = m_tab_k.size();
+    const double *log_k = &m_tab_k[0];
+
+    // allocate a cubic spline with nrows nodes and fit it to the given table
+    auto build_spline = [nrows, log_k](const std::vector<double> &tab) -> gsl_spline * {
+      gsl_spline *sp = gsl_spline_alloc(gsl_interp_cspline, nrows);
+      gsl_spline_init(sp, log_k, &tab[0], nrows);
+      return sp;
+    };
+
+    spline_tot = build_spline(m_tab_Tk_tot);
+    spline_cdm = build_spline(m_tab_Tk_cdm);
+    spline_baryon = build_spline(m_tab_Tk_baryon);
+    spline_vtot = build_spline(m_tab_Tvk_tot);
+    spline_vcdm = build_spline(m_tab_Tvk_cdm);
+    spline_vbaryon = build_spline(m_tab_Tvk_baryon);
+
+    tf_distinct_ = true; // different density between CDM v.s. Baryon
+    tf_withvel_ = true;  // using velocity transfer function
+  }
+
+  // Release the six splines and the six interpolation accelerators that the
+  // constructor allocated.
+  ~transfer_CAMB_file_plugin()
+  {
+    gsl_spline *const splines[] = {spline_tot, spline_cdm, spline_baryon,
+                                   spline_vtot, spline_vcdm, spline_vbaryon};
+    for (gsl_spline *sp : splines)
+      gsl_spline_free(sp);
+
+    gsl_interp_accel *const accels[] = {acc_tot, acc_cdm, acc_baryon,
+                                        acc_vtot, acc_vcdm, acc_vbaryon};
+    for (gsl_interp_accel *acc : accels)
+      gsl_interp_accel_free(acc);
+  }
+
+  // Extrapolate to the right of the table: a straight line through the last
+  // two table rows, evaluated at log10(k) in the stored (log10 k, value)
+  // space. Log-stored tables return 10^line; baryon tables stored linearly
+  // (m_linbaryoninterp) return the line itself, clamped from below by `tiny`.
+  // Throws std::runtime_error for an unknown tf_type.
+  inline double extrap_right(double k, const tf_type &type) const
+  {
+    const int last = m_tab_k.size() - 1, prev = last - 1;
+
+    const double lk = log10(k);
+    const double dk = m_tab_k[last] - m_tab_k[prev];
+    const double delk = lk - m_tab_k[last];
+
+    // pick the table to extrapolate and whether its values are stored linearly
+    const std::vector<double> *tab = nullptr;
+    bool stored_linear = false;
+
+    switch (type)
+    {
+    case cdm:
+      tab = &m_tab_Tk_cdm;
+      break;
+    case baryon:
+      tab = &m_tab_Tk_baryon;
+      stored_linear = m_linbaryoninterp;
+      break;
+    case vtotal: //>[150609SH: add]
+      tab = &m_tab_Tvk_tot;
+      break;
+    case vcdm: //>[150609SH: add]
+      tab = &m_tab_Tvk_cdm;
+      break;
+    case vbaryon: //>[150609SH: add]
+      tab = &m_tab_Tvk_baryon;
+      stored_linear = m_linbaryoninterp;
+      break;
+    case total:
+      tab = &m_tab_Tk_tot;
+      break;
+    default:
+      throw std::runtime_error(
+          "Invalid type requested in transfer function evaluation");
+    }
+
+    const double y_prev = (*tab)[prev];
+    const double y_last = (*tab)[last];
+    const double y = (y_last - y_prev) / dk * (delk) + y_last;
+
+    if (stored_linear)
+      return std::max(y, tiny);
+    return pow(10.0, y);
+  }
+
+  // Evaluate the transfer function of the requested type at wavenumber k.
+  //   k < m_kmin : constant, the first table entry
+  //   k > m_kmax : linear extrapolation via extrap_right()
+  //   otherwise  : cubic spline in log10(k)
+  // Log-stored tables are returned as 10^value; baryon tables stored linearly
+  // (m_linbaryoninterp) are returned as-is.
+  // Throws std::runtime_error for an unknown tf_type.
+  inline double compute(double k, tf_type type) const
+  {
+    const bool left_of_table = k < m_kmin;
+
+    // use linear interpolation on the right side of the tabulated values
+    if (!left_of_table && k > m_kmax)
+      return extrap_right(k, type);
+
+    // select table, spline and accelerator for the requested component
+    const std::vector<double> *tab = nullptr;
+    gsl_spline *sp = nullptr;
+    gsl_interp_accel *acc = nullptr;
+    bool stored_linear = false;
+
+    switch (type)
+    {
+    case cdm:
+      tab = &m_tab_Tk_cdm;
+      sp = spline_cdm;
+      acc = acc_cdm;
+      break;
+    case baryon:
+      tab = &m_tab_Tk_baryon;
+      sp = spline_baryon;
+      acc = acc_baryon;
+      stored_linear = m_linbaryoninterp;
+      break;
+    case vtotal:
+      tab = &m_tab_Tvk_tot;
+      sp = spline_vtot;
+      acc = acc_vtot; //MvD
+      break;
+    case vcdm:
+      tab = &m_tab_Tvk_cdm;
+      sp = spline_vcdm;
+      acc = acc_vcdm;
+      break;
+    case vbaryon:
+      tab = &m_tab_Tvk_baryon;
+      sp = spline_vbaryon;
+      acc = acc_vbaryon;
+      stored_linear = m_linbaryoninterp;
+      break;
+    case total:
+      tab = &m_tab_Tk_tot;
+      sp = spline_tot;
+      acc = acc_tot;
+      break;
+    default:
+      throw std::runtime_error(
+          "Invalid type requested in transfer function evaluation");
+    }
+
+    // use constant interpolation on the left side of the tabulated values
+    const double y = left_of_table ? (*tab)[0]
+                                   : gsl_spline_eval(sp, log10(k), acc);
+
+    return stored_linear ? y : pow(10.0, y);
+  }
+
+  // Lower k limit reported to callers: 10^(second stored log10 k), i.e. the
+  // second table row rather than the first.
+  inline double get_kmin(void) const
+  {
+    const double log_k_second = m_tab_k[1];
+    return pow(10.0, log_k_second);
+  }
+
+  // Upper k limit reported to callers: 10^(second-to-last stored log10 k),
+  // i.e. the row before the last one.
+  inline double get_kmax(void) const
+  {
+    const double log_k_before_last = m_tab_k[m_tab_k.size() - 2];
+    return pow(10.0, log_k_before_last);
+  }
+};
+
+// File-local creator object for transfer_CAMB_file_plugin, constructed with
+// the name "CAMB_file" (presumably the key used to select this plugin --
+// confirm against TransferFunction_plugin_creator_concrete).
+namespace
+{
+TransferFunction_plugin_creator_concrete<transfer_CAMB_file_plugin> creator("CAMB_file");
+}
--- a/src/plugins/transfer_CLASS.cc
+++ b/src/plugins/transfer_CLASS.cc
@ -9,145 +9,328 @@
 #include <string>
 #include <vector>
 #include <memory>
+#include <sstream>

 #include <ClassEngine.hh>

 #include <general.hh>
 #include <config_file.hh>
 #include <transfer_function_plugin.hh>
+#include <math/interpolate.hh>

-#include <gsl/gsl_errno.h>
-#include <gsl/gsl_spline.h>
-
-class transfer_CLASS_plugin : public TransferFunction_plugin {
+class transfer_CLASS_plugin : public TransferFunction_plugin
+{

 private:
-    std::vector<double> tab_lnk_, tab_dtot_, tab_dc_, tab_db_, tab_ttot_, tab_tc_, tab_tb_;
-    gsl_interp_accel *gsl_ia_dtot_, *gsl_ia_dc_, *gsl_ia_db_, *gsl_ia_ttot_, *gsl_ia_tc_, *gsl_ia_tb_;
-    gsl_spline *gsl_sp_dtot_, *gsl_sp_dc_, *gsl_sp_db_, *gsl_sp_ttot_, *gsl_sp_tc_, *gsl_sp_tb_;
-    double Omega_m_, Omega_b_, N_ur_, zstart_, ztarget_, kmax_, kmin_, h_;
+  interpolated_function_1d<true, true, false> delta_c_, delta_b_, delta_n_, delta_m_, theta_c_, theta_b_, theta_n_, theta_m_;
+  interpolated_function_1d<true, true, false> delta_c0_, delta_b0_, delta_n0_, delta_m0_, theta_c0_, theta_b0_, theta_n0_, theta_m0_;

-    void ClassEngine_get_data( void ){
-        std::vector<double> d_ncdm, t_ncdm, phi, psi;
+  // single fluid growing/decaying mode decomposition
+  // gsl_interp_accel *gsl_ia_Cplus_, *gsl_ia_Cminus_;
+  // gsl_spline *gsl_sp_Cplus_, *gsl_sp_Cminus_;
+  // std::vector<double> tab_Cplus_, tab_Cminus_;

-        csoca::ilog << "Computing TF via ClassEngine..." << std::endl << " ztarget = " << ztarget_ << ", zstart = " << zstart_ << " ..." << std::flush;
-        double wtime = get_wtime();
+  double Omega_m_, Omega_b_, N_ur_, zstart_, ztarget_, kmax_, kmin_, h_, astart_, atarget_, A_s_, n_s_, sigma8_, Tcmb_, tnorm_;

-        ClassParams pars;
-        pars.add("extra metric transfer functions", "yes");
-        pars.add("z_pk",ztarget_);
-        pars.add("P_k_max_h/Mpc", kmax_);
-        pars.add("h",h_);
-        pars.add("Omega_b",Omega_b_);
-        // pars.add("Omega_k",0.0);
-        // pars.add("Omega_ur",0.0);
-        pars.add("N_ur",N_ur_);
-        pars.add("Omega_cdm",Omega_m_-Omega_b_);
-        pars.add("Omega_Lambda",1.0-Omega_m_);
-        // pars.add("Omega_fld",0.0);
-        // pars.add("Omega_scf",0.0);
-        pars.add("A_s",2.42e-9);
-        pars.add("n_s",.96); // tnis doesn't matter for TF
-        pars.add("output","dTk,vTk");
-        pars.add("YHe",0.248);
+  ClassParams pars_;
+  std::unique_ptr<ClassEngine> the_ClassEngine_;
+  std::ofstream ofs_class_input_;

-        pars.add("k_per_decade_for_pk",50);
-        pars.add("k_per_decade_for_bao",50);
-        pars.add("compute damping scale","yes");
-        pars.add("z_reio",-1.0); // make sure reionisation is not included
+  //! Add one parameter to the CLASS parameter set pars_ and mirror it as a
+  //! "name = value" line in the control file written through ofs_class_input_.
+  //! \param parameter_name   name of the CLASS parameter
+  //! \param parameter_value  its value; must be streamable with operator<<
+  template <typename T>
+  void add_class_parameter(std::string parameter_name, const T parameter_value)
+  {
+    pars_.add(parameter_name, parameter_value);
+    ofs_class_input_ << parameter_name << " = " << parameter_value << std::endl;
+  }

-        std::unique_ptr<ClassEngine> CE = std::make_unique<ClassEngine>(pars, false);
+  //! Set up class parameters from MUSIC cosmological parameters
+  //! Every parameter goes through add_class_parameter(), so it is also written
+  //! to the control file. At the end the ClassEngine is constructed from pars_
+  //! and the wall-clock time of that construction is logged.
+  void init_ClassEngine(void)
+  {
+    //--- general parameters ------------------------------------------
+    add_class_parameter("z_max_pk", std::max(std::max(zstart_, ztarget_),199.0)); // largest of zstart, ztarget and a floor of 199
+    add_class_parameter("P_k_max_h/Mpc", kmax_);
+    add_class_parameter("output", "dTk,vTk");
+    add_class_parameter("extra metric transfer functions","yes");
+    // add_class_parameter("lensing", "no");

-        CE->getTk(ztarget_, tab_lnk_, tab_dc_, tab_db_, d_ncdm, tab_dtot_,
-                tab_tc_, tab_tb_, t_ncdm, tab_ttot_, phi, psi );
+    //--- choose gauge ------------------------------------------------
+    // add_class_parameter("extra metric transfer functions", "yes");
+    add_class_parameter("gauge", "synchronous");

-        wtime = get_wtime() - wtime;
-        csoca::ilog << "   took " << wtime << " s / " << tab_lnk_.size() << " modes."  << std::endl;
+    //--- cosmological parameters, densities --------------------------
+    add_class_parameter("h", h_);
+
+    add_class_parameter("Omega_b", Omega_b_);
+    add_class_parameter("Omega_cdm", Omega_m_ - Omega_b_);
+    add_class_parameter("Omega_k", 0.0);
+    // add_class_parameter("Omega_Lambda",1.0-Omega_m_);
+    add_class_parameter("Omega_fld", 0.0);
+    add_class_parameter("Omega_scf", 0.0);
+    // add_class_parameter("fluid_equation_of_state","CLP");
+    // add_class_parameter("w0_fld", -1 );
+    // add_class_parameter("wa_fld", 0. );
+    // add_class_parameter("cs2_fld", 1);
+
+    //--- massive neutrinos -------------------------------------------
+#if 1
+    //default off
+    // add_class_parameter("Omega_ur",0.0);
+    add_class_parameter("N_ur", N_ur_);
+    add_class_parameter("N_ncdm", 0);
+
+#else
+    // change above to enable
+    add_class_parameter("N_ur", 0);
+    add_class_parameter("N_ncdm", 1);
+    add_class_parameter("m_ncdm", "0.4");
+    add_class_parameter("T_ncdm", 0.71611);
+#endif
+
+    //--- cosmological parameters, primordial -------------------------
+    add_class_parameter("P_k_ini type", "analytic_Pk");
+
+    // normalise with A_s when a positive value is set, otherwise with sigma8
+    if( A_s_ > 0.0 ){
+      add_class_parameter("A_s", A_s_);
+    }else{
+      add_class_parameter("sigma8", sigma8_);
    }
+    add_class_parameter("n_s", n_s_);
+    add_class_parameter("alpha_s", 0.0);
+    add_class_parameter("T_cmb", Tcmb_);
+    add_class_parameter("YHe", 0.248);
+
+    // precision parameters
+    add_class_parameter("k_per_decade_for_pk", 100);
+    add_class_parameter("k_per_decade_for_bao", 100);
+    add_class_parameter("compute damping scale", "yes");
+    add_class_parameter("tol_perturb_integration", 1.e-8);
+    add_class_parameter("tol_background_integration", 1e-9);
+
+    // high precision options from cl_permille.pre:
+    // precision file to be passed as input in order to achieve at least percent precision on scalar Cls
+    add_class_parameter("hyper_flat_approximation_nu", 7000.);
+    add_class_parameter("transfer_neglect_delta_k_S_t0", 0.17);
+    add_class_parameter("transfer_neglect_delta_k_S_t1", 0.05);
+    add_class_parameter("transfer_neglect_delta_k_S_t2", 0.17);
+    add_class_parameter("transfer_neglect_delta_k_S_e", 0.13);
+    add_class_parameter("delta_l_max", 1000);
+
+    // a single verbosity level (0) is passed to all CLASS modules
+    int class_verbosity = 0;
+
+    add_class_parameter("background_verbose", class_verbosity);
+    add_class_parameter("thermodynamics_verbose", class_verbosity);
+    add_class_parameter("perturbations_verbose", class_verbosity);
+    add_class_parameter("transfer_verbose", class_verbosity);
+    add_class_parameter("primordial_verbose", class_verbosity);
+    add_class_parameter("spectra_verbose", class_verbosity);
+    add_class_parameter("nonlinear_verbose", class_verbosity);
+    add_class_parameter("lensing_verbose", class_verbosity);
+    add_class_parameter("output_verbose", class_verbosity);
+
+    // output parameters, only needed for the control CLASS .ini file that we output
+    // z_pk lists the distinct redshifts among ztarget, zstart and 0, highest first
+    std::stringstream zlist;
+    if (ztarget_ == zstart_)
+      zlist << ztarget_ << ((ztarget_!=0.0)? ", 0.0" : "");
+    else
+      zlist << std::max(ztarget_, zstart_) << ", " << std::min(ztarget_, zstart_) << ", 0.0";
+    add_class_parameter("z_pk", zlist.str());
+
+    music::ilog << "Computing transfer function via ClassEngine..." << std::endl;
+    double wtime = get_wtime();
+
+    the_ClassEngine_ = std::move(std::make_unique<ClassEngine>(pars_, false));
+
+    wtime = get_wtime() - wtime;
+    music::ilog << "CLASS took " << wtime << " s." << std::endl;
+  }
+
+  //! Query the ClassEngine (already set up by init_ClassEngine) at redshift z.
+  //! All output vectors are emptied, filled by ClassEngine::getTk, and then
+  //! rescaled in place: each density (d*) and velocity (t*) entry is multiplied
+  //! by -h^2/k^2. The matter entries dm / tm are overwritten with the
+  //! mass-weighted sum of the rescaled cdm and baryon entries.
+  void run_ClassEngine(double z, std::vector<double> &k, std::vector<double> &dc, std::vector<double> &tc, std::vector<double> &db, std::vector<double> &tb,
+                       std::vector<double> &dn, std::vector<double> &tn, std::vector<double> &dm, std::vector<double> &tm)
+  {
+    // start from empty output tables
+    for (std::vector<double> *tab : {&k, &dc, &db, &dn, &dm, &tc, &tb, &tn, &tm})
+      tab->clear();
+
+    the_ClassEngine_->getTk(z, k, dc, db, dn, dm, tc, tb, tn, tm);
+
+    // mass fractions of baryons and cdm in the total matter
+    const real_t fb = Omega_b_ / Omega_m_;
+    const real_t fc = (Omega_m_ - Omega_b_) / Omega_m_;
+
+    const size_t nmodes = k.size();
+    for (size_t m = 0; m < nmodes; ++m)
+    {
+      // convert to 'CAMB' format, since we interpolate loglog and
+      // don't want negative numbers...
+      const auto ik2 = 1.0 / (k[m] * k[m]) * h_ * h_;
+      const auto rescale = [ik2](double t) { return -t * ik2; };
+
+      dc[m] = rescale(dc[m]);
+      db[m] = rescale(db[m]);
+      dn[m] = rescale(dn[m]);
+      dm[m] = fc * dc[m] + fb * db[m];
+
+      tc[m] = rescale(tc[m]);
+      tb[m] = rescale(tb[m]);
+      tn[m] = rescale(tn[m]);
+      tm[m] = fc * tc[m] + fb * tb[m];
+    }
+  }

 public:
-  explicit transfer_CLASS_plugin( ConfigFile &cf)
-  : TransferFunction_plugin(cf)
+  //! Construct the plugin: read the cosmology from the configuration, run
+  //! CLASS once, and tabulate the transfer functions at z=0 (the *0_ members)
+  //! and at ztarget (the members without the 0 suffix).
+  //! Throws std::runtime_error if neither a positive A_s nor sigma_8 is given.
+  explicit transfer_CLASS_plugin(config_file &cf)
+      : TransferFunction_plugin(cf)
  {
-    h_       = pcf_->GetValue<double>("cosmology","H0") / 100.0; 
-    Omega_m_ = pcf_->GetValue<double>("cosmology","Omega_m"); 
-    Omega_b_ = pcf_->GetValue<double>("cosmology","Omega_b");
-    N_ur_    = pcf_->GetValueSafe<double>("cosmology","N_ur", 3.046);
-    ztarget_ = pcf_->GetValueSafe<double>("cosmology","ztarget",0.0);
-    zstart_  = pcf_->GetValue<double>("setup","zstart");
-    double lbox = pcf_->GetValue<double>("setup","BoxLength");
-    int nres = pcf_->GetValue<double>("setup","GridRes");
-    kmax_    = 2.0*M_PI/lbox * nres/2 * sqrt(3) * 2.0; // 120% of spatial diagonal
+    this->tf_isnormalised_ = true;

-    this->ClassEngine_get_data();
+    // control file that add_class_parameter() mirrors every CLASS parameter into
+    ofs_class_input_.open("input_class_parameters.ini", std::ios::trunc);

-    gsl_ia_dtot_ = gsl_interp_accel_alloc();
-    gsl_ia_dc_   = gsl_interp_accel_alloc();
-    gsl_ia_db_   = gsl_interp_accel_alloc();
-    gsl_ia_ttot_ = gsl_interp_accel_alloc();
-    gsl_ia_tc_   = gsl_interp_accel_alloc();
-    gsl_ia_tb_   = gsl_interp_accel_alloc();
+    h_ = pcf_->get_value<double>("cosmology", "H0") / 100.0;
+    Omega_m_ = pcf_->get_value<double>("cosmology", "Omega_m");
+    Omega_b_ = pcf_->get_value<double>("cosmology", "Omega_b");
+    N_ur_ = pcf_->get_value_safe<double>("cosmology", "Neff", 3.046);
+    ztarget_ = pcf_->get_value_safe<double>("cosmology", "ztarget", 0.0);
+    atarget_ = 1.0 / (1.0 + ztarget_);
+    zstart_ = pcf_->get_value<double>("setup", "zstart");
+    astart_ = 1.0 / (1.0 + zstart_);
+    A_s_ = pcf_->get_value_safe<double>("cosmology", "A_s", -1.0);
+    n_s_ = pcf_->get_value<double>("cosmology", "nspec");
+    Tcmb_ = cf.get_value_safe<double>("cosmology", "Tcmb", 2.7255);

-    gsl_sp_dtot_ = gsl_spline_alloc(gsl_interp_cspline, tab_lnk_.size());
-    gsl_sp_dc_   = gsl_spline_alloc(gsl_interp_cspline, tab_lnk_.size());
-    gsl_sp_db_   = gsl_spline_alloc(gsl_interp_cspline, tab_lnk_.size());
-    gsl_sp_ttot_ = gsl_spline_alloc(gsl_interp_cspline, tab_lnk_.size());
-    gsl_sp_tc_   = gsl_spline_alloc(gsl_interp_cspline, tab_lnk_.size());
-    gsl_sp_tb_   = gsl_spline_alloc(gsl_interp_cspline, tab_lnk_.size());
+    // A_s defaults to -1 when absent; sigma_8 is only read (and required) then
+    if (A_s_ > 0) {
+      music::ilog << "CLASS: Using A_s=" << A_s_<< " to normalise the transfer function." << std::endl;
+    }else{
+      sigma8_ = pcf_->get_value_safe<double>("cosmology", "sigma_8", -1.0);
+      if( sigma8_ < 0 ){
+        throw std::runtime_error("Need to specify either A_s or sigma_8 for CLASS plugin...");
+      }
+      music::ilog << "CLASS: Using sigma8_ =" << sigma8_<< " to normalise the transfer function." << std::endl;
+    }

-    gsl_spline_init(gsl_sp_dtot_, &tab_lnk_[0], &tab_dtot_[0], tab_lnk_.size());
-    gsl_spline_init(gsl_sp_dc_,   &tab_lnk_[0], &tab_dc_[0],   tab_lnk_.size());
-    gsl_spline_init(gsl_sp_db_,   &tab_lnk_[0], &tab_db_[0],   tab_lnk_.size());
-    gsl_spline_init(gsl_sp_ttot_, &tab_lnk_[0], &tab_ttot_[0], tab_lnk_.size());
-    gsl_spline_init(gsl_sp_tc_,   &tab_lnk_[0], &tab_tc_[0],   tab_lnk_.size());
-    gsl_spline_init(gsl_sp_tb_,   &tab_lnk_[0], &tab_tb_[0],   tab_lnk_.size());
+    // determine highest k we will need for the resolution selected
+    double lbox = pcf_->get_value<double>("setup", "BoxLength");
+    int nres = pcf_->get_value<double>("setup", "GridRes");
+    kmax_ = std::max(20.0, 2.0 * M_PI / lbox * nres / 2 * sqrt(3) * 2.0); // 2*sqrt(3) times the grid Nyquist wavenumber pi*nres/lbox, but never below 20 (passed to CLASS as "P_k_max_h/Mpc")

-    kmin_ = std::exp(tab_lnk_[0]);
+    // initialise CLASS and get the normalisation
+    this->init_ClassEngine();
+    A_s_ = the_ClassEngine_->get_A_s(); // this is either the input one, or the one computed from sigma8
+    
+    // compute the normalisation to interface with MUSIC
+    double k_p = pcf_->get_value_safe<double>("cosmology", "k_p", 0.05);
+    tnorm_ = std::sqrt(2.0 * M_PI * M_PI * A_s_ * std::pow(1.0 / k_p * h_, n_s_ - 1) / std::pow(2.0 * M_PI, 3.0));
+
+    // compute the transfer function at z=0 using CLASS engine
+    std::vector<double> k, dc, tc, db, tb, dn, tn, dm, tm;
+    this->run_ClassEngine(0.0, k, dc, tc, db, tb, dn, tn, dm, tm);
+
+    delta_c0_.set_data(k, dc);
+    theta_c0_.set_data(k, tc);
+    delta_b0_.set_data(k, db);
+    theta_b0_.set_data(k, tb);
+    delta_n0_.set_data(k, dn);
+    theta_n0_.set_data(k, tn);
+    delta_m0_.set_data(k, dm);
+    theta_m0_.set_data(k, tm);
+
+    // compute the transfer function at z=z_target using CLASS engine
+    this->run_ClassEngine(ztarget_, k, dc, tc, db, tb, dn, tn, dm, tm);
+    delta_c_.set_data(k, dc);
+    theta_c_.set_data(k, tc);
+    delta_b_.set_data(k, db);
+    theta_b_.set_data(k, tb);
+    delta_n_.set_data(k, dn);
+    theta_n_.set_data(k, tn);
+    delta_m_.set_data(k, dm);
+    theta_m_.set_data(k, tm);
+
+    // from here on kmin_/kmax_ hold the range of the table returned by CLASS
+    // (kmax_ no longer holds the value requested above)
+    kmin_ = k[0];
+    kmax_ = k.back();
+
+    music::ilog << "CLASS table contains k = " << this->get_kmin() << " to " << this->get_kmax() << " h Mpc-1." << std::endl;
+
+    //--------------------------------------------------------------------------
+    // single fluid growing/decaying mode decomposition
+    //--------------------------------------------------------------------------
+    /*gsl_ia_Cplus_ = gsl_interp_accel_alloc();
+    gsl_ia_Cminus_ = gsl_interp_accel_alloc();
+
+    gsl_sp_Cplus_ = gsl_spline_alloc(gsl_interp_cspline, tab_lnk_.size());
+    gsl_sp_Cminus_ = gsl_spline_alloc(gsl_interp_cspline, tab_lnk_.size());
+
+    tab_Cplus_.assign(tab_lnk_.size(), 0);
+    tab_Cminus_.assign(tab_lnk_.size(), 0);
+
+    std::ofstream ofs("grow_decay.txt");
+
+    for (size_t i = 0; i < tab_lnk_.size(); ++i)
+    {
+      tab_Cplus_[i] = (3.0 / 5.0 * tab_dtot_[i] / atarget_ - 2.0 / 5.0 * tab_ttot_[i] / atarget_);
+      tab_Cminus_[i] = (2.0 / 5.0 * std::pow(atarget_, 1.5) * (tab_dtot_[i] + tab_ttot_[i]));
+
+      ofs << std::exp(tab_lnk_[i]) << " " << tab_Cplus_[i] << " " << tab_Cminus_[i] << " " << tab_dtot_[i] << " " << tab_ttot_[i] << std::endl;
+    }
+
+    gsl_spline_init(gsl_sp_Cplus_, &tab_lnk_[0], &tab_Cplus_[0], tab_lnk_.size());
+    gsl_spline_init(gsl_sp_Cminus_, &tab_lnk_[0], &tab_Cminus_[0], tab_lnk_.size());*/
+    //--------------------------------------------------------------------------

    tf_distinct_ = true;
-    tf_withvel_  = true; 
+    tf_withvel_ = true;
+    tf_withtotal0_ = true;
  }

-  ~transfer_CLASS_plugin(){
-    gsl_spline_free(gsl_sp_dtot_);
-    gsl_spline_free(gsl_sp_dc_);
-    gsl_spline_free(gsl_sp_db_);
-    gsl_spline_free(gsl_sp_ttot_);
-    gsl_spline_free(gsl_sp_tc_);
-    gsl_spline_free(gsl_sp_tb_);
-
-    gsl_interp_accel_free(gsl_ia_dtot_);
-    gsl_interp_accel_free(gsl_ia_dc_);
-    gsl_interp_accel_free(gsl_ia_db_);
-    gsl_interp_accel_free(gsl_ia_ttot_);
-    gsl_interp_accel_free(gsl_ia_tc_);
-    gsl_interp_accel_free(gsl_ia_tb_);
+  //! Nothing is released by hand here: the GSL objects the old version freed
+  //! are gone, and the_ClassEngine_ (std::unique_ptr) and ofs_class_input_
+  //! (std::ofstream) clean up in their own destructors.
+  ~transfer_CLASS_plugin()
+  {
  }

-  inline double compute(double k, tf_type type) const {
-      gsl_spline *splineT = nullptr;
-      gsl_interp_accel *accT = nullptr;
-      switch(type){
-          case total:   splineT = gsl_sp_dtot_; accT = gsl_ia_dtot_; break;
-          case cdm:     splineT = gsl_sp_dc_;   accT = gsl_ia_dc_;   break;
-          case baryon:  splineT = gsl_sp_db_;   accT = gsl_ia_db_;   break;
-          case vtotal:  splineT = gsl_sp_ttot_; accT = gsl_ia_ttot_; break;
-          case vcdm:    splineT = gsl_sp_tc_;   accT = gsl_ia_tc_;   break;
-          case vbaryon: splineT = gsl_sp_tb_;   accT = gsl_ia_tb_;   break;
-          default:
-            throw std::runtime_error("Invalid type requested in transfer function evaluation");
-      }
+  //! Evaluate the tabulated transfer function of the given type at k.
+  //! k is first multiplied by h_ and then compared against kmin_/kmax_ (the
+  //! range of the CLASS table); outside that range 0 is returned. Inside, the
+  //! interpolated table value times tnorm_ is returned.
+  //! Throws std::runtime_error for a type not handled below.
+  inline double compute(double k, tf_type type) const
+  {
+    k *= h_;

-      double d = (k<=kmin_)? gsl_spline_eval(splineT, std::log(kmin_), accT) 
-        : gsl_spline_eval(splineT, std::log(k*h_), accT);
-      return -d/(k*k);
+    if (k < kmin_ || k > kmax_)
+    {
+      return 0.0;
+    }
+
+    real_t val(0.0);
+    switch (type)
+    {
+      // values at ztarget:
+    case total:
+      val = delta_m_(k); break;
+    case cdm:
+      val = delta_c_(k); break;
+    case baryon:
+      val = delta_b_(k); break;
+    case vtotal:
+      val = theta_m_(k); break;
+    case vcdm:
+      val = theta_c_(k); break;
+    case vbaryon:
+      val = theta_b_(k); break;
+
+      // values at z=0 (these tables are filled from run_ClassEngine(0.0, ...)
+      // in the constructor):
+    case total0:
+      val = delta_m0_(k); break;
+    case cdm0:
+      val = delta_c0_(k); break;
+    case baryon0:
+      val = delta_b0_(k); break;
+    case vtotal0:
+      val = theta_m0_(k); break;
+    case vcdm0:
+      val = theta_c0_(k); break;
+    case vbaryon0:
+      val = theta_b0_(k); break;
+    default:
+      throw std::runtime_error("Invalid type requested in transfer function evaluation");
+    }
+    return val * tnorm_;
  }

-  inline double get_kmin(void) const { return std::exp(tab_lnk_[0])/h_; }
-  inline double get_kmax(void) const { return std::exp(tab_lnk_[tab_lnk_.size()-1])/h_; }
+  // Limits of the CLASS table divided by h_, i.e. expressed in the units of
+  // the k argument of compute(), which multiplies k by h_ before comparing it
+  // with kmin_/kmax_.
+  inline double get_kmin(void) const { return kmin_ / h_; }
+  inline double get_kmax(void) const { return kmax_ / h_; }
 };

-namespace {
+// File-local creator object for transfer_CLASS_plugin, constructed with the
+// name "CLASS" (presumably the key used to select this plugin -- confirm
+// against TransferFunction_plugin_creator_concrete).
+namespace
+{
 TransferFunction_plugin_creator_concrete<transfer_CLASS_plugin> creator("CLASS");
 }

--- a/src/plugins/transfer_eisenstein.cc
+++ b/src/plugins/transfer_eisenstein.cc
@ -207,13 +207,13 @@ public:
 	 \param Tcmb mean temperature of the CMB fluctuations (defaults to
 	 Tcmb = 2.726 if not specified)
 	 */
-  transfer_eisenstein_plugin(ConfigFile &cf)
+  transfer_eisenstein_plugin(config_file &cf)
      : TransferFunction_plugin(cf)
  {
-    double Tcmb = pcf_->GetValueSafe<double>("cosmology", "Tcmb", 2.726);
-    double H0 = pcf_->GetValue<double>("cosmology", "H0");
-    double Omega_m = pcf_->GetValue<double>("cosmology", "Omega_m");
-    double Omega_b = pcf_->GetValue<double>("cosmology", "Omega_b");
+    double Tcmb = pcf_->get_value_safe<double>("cosmology", "Tcmb", 2.726);
+    double H0 = pcf_->get_value<double>("cosmology", "H0");
+    double Omega_m = pcf_->get_value<double>("cosmology", "Omega_m");
+    double Omega_b = pcf_->get_value<double>("cosmology", "Omega_b");

    etf_.set_parameters(H0, Omega_m, Omega_b, Tcmb);
    
@ -257,15 +257,15 @@ protected:
  };

 public:
-  transfer_eisenstein_wdm_plugin(ConfigFile &cf)
+  transfer_eisenstein_wdm_plugin(config_file &cf)
      : TransferFunction_plugin(cf)
  {
-    double Tcmb = pcf_->GetValueSafe("cosmology", "Tcmb", 2.726);
-    omegam_ = pcf_->GetValue<double>("cosmology", "Omega_m");
-    omegab_ = pcf_->GetValue<double>("cosmology", "Omega_b");
-    H0_ = pcf_->GetValue<double>("cosmology", "H0");
+    double Tcmb = pcf_->get_value_safe("cosmology", "Tcmb", 2.726);
+    omegam_ = pcf_->get_value<double>("cosmology", "Omega_m");
+    omegab_ = pcf_->get_value<double>("cosmology", "Omega_b");
+    H0_ = pcf_->get_value<double>("cosmology", "H0");
    m_h0 = H0_ / 100.0;
-    wdmm_ = pcf_->GetValue<double>("cosmology", "WDMmass");
+    wdmm_ = pcf_->get_value<double>("cosmology", "WDMmass");

    etf_.set_parameters(H0_, omegam_, omegab_, Tcmb);

@ -273,7 +273,7 @@ public:
    typemap_.insert(std::pair<std::string, int>("VIEL", wdm_viel));             // add the other types
    typemap_.insert(std::pair<std::string, int>("BODE_WRONG", wdm_bode_wrong)); // add the other types

-    type_ = pcf_->GetValueSafe<std::string>("cosmology", "WDMtftype", "BODE");
+    type_ = pcf_->get_value_safe<std::string>("cosmology", "WDMtftype", "BODE");

    //type_ = std::string( toupper( type_.c_str() ) );

@ -286,29 +286,29 @@ public:
    {
    //... parameterisation from Bode et al. (2001), ApJ, 556, 93
    case wdm_bode:
-      wdmnu_ = pcf_->GetValueSafe<double>("cosmology", "WDMnu", 1.0);
-      wdmgx_ = pcf_->GetValueSafe<double>("cosmology", "WDMg_x", 1.5);
+      wdmnu_ = pcf_->get_value_safe<double>("cosmology", "WDMnu", 1.0);
+      wdmgx_ = pcf_->get_value_safe<double>("cosmology", "WDMg_x", 1.5);
      m_WDMalpha = 0.05 * pow(omegam_ / 0.4, 0.15) * pow(H0_ * 0.01 / 0.65, 1.3) * pow(wdmm_, -1.15) * pow(1.5 / wdmgx_, 0.29);

      break;

    //... parameterisation from Viel et al. (2005), Phys Rev D, 71
    case wdm_viel:
-      wdmnu_ = pcf_->GetValueSafe<double>("cosmology", "WDMnu", 1.12);
+      wdmnu_ = pcf_->get_value_safe<double>("cosmology", "WDMnu", 1.12);
      m_WDMalpha = 0.049 * pow(omegam_ / 0.25, 0.11) * pow(H0_ * 0.01 / 0.7, 1.22) * pow(wdmm_, -1.11);
      break;

    //.... below is for historical reasons due to the buggy parameterisation
    //.... in early versions of MUSIC, but apart from H instead of h, Bode et al.
    case wdm_bode_wrong:
-      wdmnu_ = pcf_->GetValueSafe<double>("cosmology", "WDMnu", 1.0);
-      wdmgx_ = pcf_->GetValueSafe<double>("cosmology", "WDMg_x", 1.5);
+      wdmnu_ = pcf_->get_value_safe<double>("cosmology", "WDMnu", 1.0);
+      wdmgx_ = pcf_->get_value_safe<double>("cosmology", "WDMg_x", 1.5);
      m_WDMalpha = 0.05 * pow(omegam_ / 0.4, 0.15) * pow(H0_ / 0.65, 1.3) * pow(wdmm_, -1.15) * pow(1.5 / wdmgx_, 0.29);
      break;

    default:
-      wdmnu_ = pcf_->GetValueSafe<double>("cosmology", "WDMnu", 1.0);
-      wdmgx_ = pcf_->GetValueSafe<double>("cosmology", "WDMg_x", 1.5);
+      wdmnu_ = pcf_->get_value_safe<double>("cosmology", "WDMnu", 1.0);
+      wdmgx_ = pcf_->get_value_safe<double>("cosmology", "WDMg_x", 1.5);
      m_WDMalpha = 0.05 * pow(omegam_ / 0.4, 0.15) * pow(H0_ * 0.01 / 0.65, 1.3) * pow(wdmm_, -1.15) * pow(1.5 / wdmgx_, 0.29);
      break;
    }
@ -340,20 +340,20 @@ protected:
  eisenstein_transfer etf_;

 public:
-  transfer_eisenstein_cdmbino_plugin(ConfigFile &cf)
+  transfer_eisenstein_cdmbino_plugin(config_file &cf)
      : TransferFunction_plugin(cf)
  { 
-    double Tcmb = pcf_->GetValueSafe("cosmology", "Tcmb", 2.726);
+    double Tcmb = pcf_->get_value_safe("cosmology", "Tcmb", 2.726);

-    omegam_ = pcf_->GetValue<double>("cosmology", "Omega_m");
-    omegab_ = pcf_->GetValue<double>("cosmology", "Omega_b");
-    H0_ = pcf_->GetValue<double>("cosmology", "H0");
+    omegam_ = pcf_->get_value<double>("cosmology", "Omega_m");
+    omegab_ = pcf_->get_value<double>("cosmology", "Omega_b");
+    H0_ = pcf_->get_value<double>("cosmology", "H0");
    m_h0 = H0_ / 100.0;

    etf_.set_parameters(H0_, omegam_, omegab_, Tcmb);

-    mcdm_ = pcf_->GetValueSafe<double>("cosmology", "CDM_mass", 100.0); // bino particle mass in GeV
-    Tkd_ = pcf_->GetValueSafe<double>("cosmology", "CDM_Tkd", 33.0);    // temperature at which CDM particle kinetically decouples (in MeV)
+    mcdm_ = pcf_->get_value_safe<double>("cosmology", "CDM_mass", 100.0); // bino particle mass in GeV
+    Tkd_ = pcf_->get_value_safe<double>("cosmology", "CDM_Tkd", 33.0);    // temperature at which CDM particle kinetically decouples (in MeV)

    kfs_ = 1.7e6 / m_h0 * sqrt(mcdm_ / 100. * Tkd_ / 30.) / (1.0 + log(Tkd_ / 30.) / 19.2);
    kd_ = 3.8e7 / m_h0 * sqrt(mcdm_ / 100. * Tkd_ / 30.);
@ -395,19 +395,19 @@ protected:
  eisenstein_transfer etf_;

 public:
-  transfer_eisenstein_cutoff_plugin(ConfigFile &cf)
+  transfer_eisenstein_cutoff_plugin(config_file &cf)
      : TransferFunction_plugin(cf)
  { 
-    double Tcmb = pcf_->GetValueSafe("cosmology", "Tcmb", 2.726);
+    double Tcmb = pcf_->get_value_safe("cosmology", "Tcmb", 2.726);

-    omegam_ = pcf_->GetValue<double>("cosmology", "Omega_m");
-    omegab_ = pcf_->GetValue<double>("cosmology", "Omega_b");
-    H0_ = pcf_->GetValue<double>("cosmology", "H0");
+    omegam_ = pcf_->get_value<double>("cosmology", "Omega_m");
+    omegab_ = pcf_->get_value<double>("cosmology", "Omega_b");
+    H0_ = pcf_->get_value<double>("cosmology", "H0");
    m_h0 = H0_ / 100.0;

    etf_.set_parameters(H0_, omegam_, omegab_, Tcmb);

-    Rcut_ = pcf_->GetValueSafe<double>("cosmology", "Rcut", 1.0);
+    Rcut_ = pcf_->get_value_safe<double>("cosmology", "Rcut", 1.0);
  }

  inline double compute(double k, tf_type type) const
@ -434,5 +434,5 @@ namespace
 TransferFunction_plugin_creator_concrete<transfer_eisenstein_plugin> creator("eisenstein");
 TransferFunction_plugin_creator_concrete<transfer_eisenstein_wdm_plugin> creator2("eisenstein_wdm");
 TransferFunction_plugin_creator_concrete<transfer_eisenstein_cdmbino_plugin> creator3("eisenstein_cdmbino");
-TransferFunction_plugin_creator_concrete<transfer_eisenstein_cutoff_plugin> creator4("eisenstein_cutoff");
+// TransferFunction_plugin_creator_concrete<transfer_eisenstein_cutoff_plugin> creator4("eisenstein_cutoff");
 } // namespace
--- a/src/random_plugin.cc
+++ b/src/random_plugin.cc
@ -13,32 +13,33 @@ void print_RNG_plugins()
    std::map<std::string, RNG_plugin_creator *> &m = get_RNG_plugin_map();
    std::map<std::string, RNG_plugin_creator *>::iterator it;
    it = m.begin();
-    csoca::ilog << "- Available random number generator plug-ins:" << std::endl;
+    music::ilog << "Available random number generator plug-ins:" << std::endl;
    while (it != m.end())
    {
        if ((*it).second){
-            csoca::ilog.Print("\t\'%s\'\n", (*it).first.c_str());
+            music::ilog.Print("\t\'%s\'\n", (*it).first.c_str());
        }
        ++it;
    }
+    music::ilog << std::endl;
 }

-std::unique_ptr<RNG_plugin> select_RNG_plugin(ConfigFile &cf)
+// Select the random number generator plug-in named by the config entry
+// [random] generator (the string "MUSIC" is passed as the fallback value) and
+// return a new instance created from cf.
+// If no creator is registered under that name, the problem is reported on the
+// error log, the registered plug-ins are listed, and std::runtime_error is thrown.
+// Note: the lookup uses operator[] on the plug-in map, so an unknown name leaves
+// a null entry behind; print_RNG_plugins() skips null entries when listing.
+std::unique_ptr<RNG_plugin> select_RNG_plugin(config_file &cf)
 {
-    std::string rngname = cf.GetValueSafe<std::string>("random", "generator", "MUSIC");
+    std::string rngname = cf.get_value_safe<std::string>("random", "generator", "MUSIC");

    RNG_plugin_creator *the_RNG_plugin_creator = get_RNG_plugin_map()[rngname];

    if (!the_RNG_plugin_creator)
    {
-        csoca::ilog.Print("Invalid/Unregistered random number generator plug-in encountered : %s", rngname.c_str());
+        // error stream + std::endl: matches select_TransferFunction_plugin and keeps the
+        // plug-in listing printed next from being appended to this message's line
+        music::elog << "Invalid/Unregistered random number generator plug-in encountered : " << rngname << std::endl;
        print_RNG_plugins();
        throw std::runtime_error("Unknown random number generator plug-in");
    }
    else
    {
-        csoca::ilog << "-------------------------------------------------------------------------------" << std::endl;
-        csoca::ilog << std::setw(32) << std::left << "Random number generator plugin" << " : " << rngname << std::endl;
+        music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+        music::ilog << std::setw(32) << std::left << "Random number generator plugin" << " : " << rngname << std::endl;
    }

    return std::move(the_RNG_plugin_creator->Create(cf));
--- a/src/testing.cc
+++ b/src/testing.cc
@ -9,7 +9,7 @@ namespace testing
 {

 void output_potentials_and_densities(
-    ConfigFile &the_config,
+    config_file &the_config,
    size_t ngrid, real_t boxlen,
    Grid_FFT<real_t> &phi,
    Grid_FFT<real_t> &phi2,
@ -17,8 +17,8 @@ void output_potentials_and_densities(
    Grid_FFT<real_t> &phi3b,
    std::array<Grid_FFT<real_t> *, 3> &A3)
 {
-    const std::string fname_hdf5 = the_config.GetValueSafe<std::string>("output", "fname_hdf5", "output.hdf5");
-    const std::string fname_analysis = the_config.GetValueSafe<std::string>("output", "fbase_analysis", "output");
+    const std::string fname_hdf5 = the_config.get_value_safe<std::string>("output", "fname_hdf5", "output.hdf5");
+    const std::string fname_analysis = the_config.get_value_safe<std::string>("output", "fbase_analysis", "output");

    Grid_FFT<real_t> delta({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
    Grid_FFT<real_t> delta2({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
@ -98,7 +98,7 @@ void output_potentials_and_densities(
 }

 void output_velocity_displacement_symmetries(
-    ConfigFile &the_config,
+    config_file &the_config,
    size_t ngrid, real_t boxlen, real_t vfac, real_t dplus,
    Grid_FFT<real_t> &phi,
    Grid_FFT<real_t> &phi2,
@ -107,8 +107,8 @@ void output_velocity_displacement_symmetries(
    std::array<Grid_FFT<real_t> *, 3> &A3,
    bool bwrite_out_fields)
 {
-    const std::string fname_hdf5 = the_config.GetValueSafe<std::string>("output", "fname_hdf5", "output.hdf5");
-    const std::string fname_analysis = the_config.GetValueSafe<std::string>("output", "fbase_analysis", "output");
+    const std::string fname_hdf5 = the_config.get_value_safe<std::string>("output", "fname_hdf5", "output.hdf5");
+    const std::string fname_analysis = the_config.get_value_safe<std::string>("output", "fbase_analysis", "output");

    real_t vfac1 = vfac;
    real_t vfac2 = 2 * vfac;
@ -232,7 +232,7 @@ void output_velocity_displacement_symmetries(
    }


-    csoca::ilog << "std. deviation of invariant : ( D+ | I_xy | I_yz | I_zx ) \n"
+    music::ilog << "std. deviation of invariant : ( D+ | I_xy | I_yz | I_zx ) \n"
                    << std::setw(16) << dplus << " "
                    << std::setw(16) << Icomp[0] << " "
                    << std::setw(16) << Icomp[1] << " "
@ -241,7 +241,8 @@ void output_velocity_displacement_symmetries(
 }

 void output_convergence(
-    ConfigFile &the_config,
+    config_file &the_config,
+    cosmology::calculator* the_cosmo_calc,
    std::size_t ngrid, real_t boxlen, real_t vfac, real_t dplus,
    Grid_FFT<real_t> &phi,
    Grid_FFT<real_t> &phi2,
@ -249,7 +250,6 @@ void output_convergence(
    Grid_FFT<real_t> &phi3b,
    std::array<Grid_FFT<real_t> *, 3> &A3)
 {
-
    // scale all potentials to remove dplus0
    phi /= dplus;
    phi2 /= dplus * dplus;
@ -259,11 +259,95 @@ void output_convergence(
    (*A3[1]) /= dplus * dplus * dplus;
    (*A3[2]) /= dplus * dplus * dplus;

+    ////////////////////// theoretical convergence radius //////////////////////
+
+    // compute phi_code: a sign-flipped copy of phi, filled element-wise through kelem
+    Grid_FFT<real_t> phi_code({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
+    phi_code.FourierTransformForward(false); // NOTE(review): presumably only flags the grid as Fourier-space without running an FFT - confirm against Grid_FFT
+    #pragma omp parallel for //collapse(3)
+    for (std::size_t i = 0; i < phi_code.size(0); ++i) {
+        for (std::size_t j = 0; j < phi_code.size(1); ++j) {
+            for (std::size_t k = 0; k < phi_code.size(2); ++k) {
+                std::size_t idx = phi_code.get_idx(i, j, k);
+                phi_code.kelem(idx) = -phi.kelem(idx); // assumes phi is held in Fourier space at this point - TODO confirm
+            }
+        }
+    }
+
+    // initialize norm to 0: nabla_vini_norm accumulates a sum of squares through relem below
+    Grid_FFT<real_t> nabla_vini_norm({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
+    #pragma omp parallel for //collapse(3)
+    for (std::size_t i = 0; i < nabla_vini_norm.size(0); ++i) {
+        for (std::size_t j = 0; j < nabla_vini_norm.size(1); ++j) {
+            for (std::size_t k = 0; k < nabla_vini_norm.size(2); ++k) {
+                std::size_t idx = nabla_vini_norm.get_idx(i, j, k);
+                nabla_vini_norm.relem(idx) = 0.0;
+            }
+        }
+    }
+
+    Grid_FFT<real_t> nabla_vini_mn({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}); // scratch grid, refilled for every (m, n) pair
+    for(std::size_t m = 0; m < 3; m++) {
+        for(std::size_t n = m; n < 3; n++) { // only pairs with n >= m are visited; pairs with m != n are weighted by 2 below
+            nabla_vini_mn.FourierTransformForward(false); // NOTE(review): presumably puts the scratch grid back into Fourier-space mode before kelem is written - confirm
+            #pragma omp parallel for //collapse(3)
+            for (std::size_t i = 0; i < phi_code.size(0); ++i) {
+                for (std::size_t j = 0; j < phi_code.size(1); ++j) {
+                    for (std::size_t k = 0; k < phi_code.size(2); ++k) {
+                        std::size_t idx = phi_code.get_idx(i, j, k);
+                        auto kk = phi_code.get_k<real_t>(i, j, k);
+                        nabla_vini_mn.kelem(idx) = phi_code.kelem(idx) * (kk[m] * kk[n]); // component (m, n): phi_code scaled by kk[m] * kk[n]
+                    }
+                }
+            }
+            nabla_vini_mn.FourierTransformBackward();
+            nabla_vini_mn *= (3.2144004915 / the_cosmo_calc->get_growth_factor(1.0)); // NOTE(review): meaning and units of 3.2144004915 are not evident from this file - confirm
+            // sum of squares; the factor 2 for m != n stands in for the (n, m) pair that the loops skip
+            #pragma omp parallel for //collapse(3)
+            for (std::size_t i = 0; i < nabla_vini_norm.size(0); ++i) {
+                for (std::size_t j = 0; j < nabla_vini_norm.size(1); ++j) {
+                    for (std::size_t k = 0; k < nabla_vini_norm.size(2); ++k) {
+                        std::size_t idx = nabla_vini_norm.get_idx(i, j, k);
+                        if(m != n) {
+                            nabla_vini_norm.relem(idx) += (2.0 * nabla_vini_mn.relem(idx) * nabla_vini_mn.relem(idx));
+                        } else {
+                            nabla_vini_norm.relem(idx) += (nabla_vini_mn.relem(idx) * nabla_vini_mn.relem(idx));
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // square root of the accumulated sum of squares, element by element
+    #pragma omp parallel for //collapse(3)
+    for (std::size_t i = 0; i < nabla_vini_norm.size(0); ++i) {
+        for (std::size_t j = 0; j < nabla_vini_norm.size(1); ++j) {
+            for (std::size_t k = 0; k < nabla_vini_norm.size(2); ++k) {
+                std::size_t idx = nabla_vini_norm.get_idx(i, j, k);
+                nabla_vini_norm.relem(idx) = std::sqrt(nabla_vini_norm.relem(idx));
+            }
+        }
+    }
+
+    // get t_eds = 0.0204 / nabla_vini_norm, element by element
+    Grid_FFT<real_t> t_eds({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
+    #pragma omp parallel for //collapse(3)
+    for (std::size_t i = 0; i < t_eds.size(0); ++i) {
+        for (std::size_t j = 0; j < t_eds.size(1); ++j) {
+            for (std::size_t k = 0; k < t_eds.size(2); ++k) {
+                std::size_t idx = t_eds.get_idx(i, j, k);
+                t_eds.relem(idx) = 0.0204 / nabla_vini_norm.relem(idx); // NOTE(review): origin of 0.0204 is not documented here; a zero norm would divide by zero - confirm that cannot occur
+            }
+        }
+    }
+
+    ////////////////////////// 3lpt convergence test ///////////////////////////
+
    // initialize grids to 0
    Grid_FFT<real_t> psi_1({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
    Grid_FFT<real_t> psi_2({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
    Grid_FFT<real_t> psi_3({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
-#pragma omp parallel for collapse(3)
+    #pragma omp parallel for //collapse(3)
    for (std::size_t i = 0; i < psi_1.size(0); ++i) {
        for (std::size_t j = 0; j < psi_1.size(1); ++j) {
            for (std::size_t k = 0; k < psi_1.size(2); ++k) {
@ -290,7 +374,7 @@ void output_convergence(
        psi_2_tmp.FourierTransformForward(false);
        psi_3_tmp.FourierTransformForward(false);

-#pragma omp parallel for collapse(3)
+        #pragma omp parallel for //collapse(3)
        for (std::size_t i = 0; i < phi.size(0); ++i) {
            for (std::size_t j = 0; j < phi.size(1); ++j) {
                for (std::size_t k = 0; k < phi.size(2); ++k) {
@ -311,7 +395,7 @@ void output_convergence(
        psi_3_tmp.FourierTransformBackward();

        // sum of squares
-#pragma omp parallel for collapse(3)
+        #pragma omp parallel for //collapse(3)
        for (std::size_t i = 0; i < psi_1.size(0); ++i) {
            for (std::size_t j = 0; j < psi_1.size(1); ++j) {
                for (std::size_t k = 0; k < psi_1.size(2); ++k) {
@ -325,7 +409,7 @@ void output_convergence(
    } // loop on dimensions

    // apply square root for the L2 norm
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for //collapse(3)
    for (std::size_t i = 0; i < psi_1.size(0); ++i) {
        for (std::size_t j = 0; j < psi_1.size(1); ++j) {
            for (std::size_t k = 0; k < psi_1.size(2); ++k) {
@ -339,7 +423,7 @@ void output_convergence(

    // convergence radius
    Grid_FFT<real_t> inv_convergence_radius({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
-#pragma omp parallel for collapse(3)
+    #pragma omp parallel for //collapse(3)
    for (std::size_t i = 0; i < psi_1.size(0); ++i) {
        for (std::size_t j = 0; j < psi_1.size(1); ++j) {
            for (std::size_t k = 0; k < psi_1.size(2); ++k) {
@ -351,13 +435,17 @@ void output_convergence(
        }
    }

-    // write results
-    unlink("convergence_test.hdf5");
-    inv_convergence_radius.Write_to_HDF5("convergence_test.hdf5", "inv_convergence_radius");
-    psi_1.Write_to_HDF5("convergence_test.hdf5", "psi_1_norm");
-    psi_2.Write_to_HDF5("convergence_test.hdf5", "psi_2_norm");
-    psi_3.Write_to_HDF5("convergence_test.hdf5", "psi_3_norm");
-
+    ////////////////////////////// write results ///////////////////////////////
+    std::string convergence_test_filename("convergence_test.hdf5"); // fixed output name; the_config is not consulted for it
+    unlink(convergence_test_filename.c_str()); // remove any file left under this name; the return value is not checked
+#if defined(USE_MPI)
+    MPI_Barrier(MPI_COMM_WORLD); // every rank passes the unlink above before any rank starts writing below
+#endif
+    t_eds.Write_to_HDF5(convergence_test_filename, "t_eds"); // second argument is the dataset name
+    inv_convergence_radius.Write_to_HDF5(convergence_test_filename, "inv_convergence_radius");
+    // psi_1.Write_to_HDF5(convergence_test_filename, "psi_1_norm");
+    // psi_2.Write_to_HDF5(convergence_test_filename, "psi_2_norm");
+    // psi_3.Write_to_HDF5(convergence_test_filename, "psi_3_norm");
 }

 } // namespace testing
--- a/src/transfer_function_plugin.cc
+++ b/src/transfer_function_plugin.cc
@ -13,31 +13,32 @@ void print_TransferFunction_plugins()
    std::map<std::string, TransferFunction_plugin_creator *> &m = get_TransferFunction_plugin_map();
    std::map<std::string, TransferFunction_plugin_creator *>::iterator it;
    it = m.begin();
-    csoca::ilog << "Available transfer function plug-ins:" << std::endl;
+    music::ilog << "Available transfer function plug-ins:" << std::endl;
    while (it != m.end())
    {
        if ((*it).second)
-            csoca::ilog << "\t\'" << (*it).first << "\'" << std::endl;
+            music::ilog << "\t\'" << (*it).first << "\'" << std::endl;
        ++it;
    }
+    music::ilog << std::endl;
 }

-std::unique_ptr<TransferFunction_plugin> select_TransferFunction_plugin(ConfigFile &cf)
+std::unique_ptr<TransferFunction_plugin> select_TransferFunction_plugin(config_file &cf)
 {
-    std::string tfname = cf.GetValue<std::string>("cosmology", "transfer");
+    std::string tfname = cf.get_value<std::string>("cosmology", "transfer");

    TransferFunction_plugin_creator *the_TransferFunction_plugin_creator = get_TransferFunction_plugin_map()[tfname];

    if (!the_TransferFunction_plugin_creator)
    {
-        csoca::elog << "Invalid/Unregistered transfer function plug-in encountered : " << tfname << std::endl;
+        music::elog << "Invalid/Unregistered transfer function plug-in encountered : " << tfname << std::endl;
        print_TransferFunction_plugins();
        throw std::runtime_error("Unknown transfer function plug-in");
    }
    else
    {
-        csoca::ilog << "-------------------------------------------------------------------------------" << std::endl;
-        csoca::ilog << std::setw(32) << std::left << "Transfer function plugin" << " : " << tfname << std::endl;
+        music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+        music::ilog << std::setw(32) << std::left << "Transfer function plugin" << " : " << tfname << std::endl;
    }

    return std::move(the_TransferFunction_plugin_creator->create(cf));
				`@ -0,0 +1 @@`
				`Subproject commit ec6b82cc1122ba029a7a7142cf836014e992e68c`