WIP refactoring. Got rid of FFTW2, some of the old single/double precision templating,...

2024-09-19 17:03:46 +02:00 · 2023-02-14 17:12:19 -08:00 · 2023-02-14 17:12:19 -08:00 · 6b32a7d6e4
commit 6b32a7d6e4
parent 265981d93e
27 changed files with 3654 additions and 4400 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,4 +1,21 @@
-cmake_minimum_required(VERSION 3.9)
+# This file is part of MUSIC2
 # A software package to generate ICs for cosmological simulations
 # Copyright (C) 2023 by Oliver Hahn
 # 
 # monofonIC is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 # 
 # monofonIC is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 # 
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 cmake_minimum_required(VERSION 3.11)
 set(PRGNAME MUSIC)
 project(MUSIC)
@ -27,12 +44,10 @@ mark_as_advanced(CMAKE_CXX_FLAGS_DEBUGSANADD CMAKE_CXX_FLAGS_DEBUGSANUNDEF)
 mark_as_advanced(CMAKE_C_FLAGS_DEBUGSANADD CMAKE_C_FLAGS_DEBUGSANUNDEF)
 mark_as_advanced(CMAKE_EXECUTABLE_FORMAT CMAKE_OSX_ARCHITECTURES CMAKE_OSX_DEPLOYMENT_TARGET CMAKE_OSX_SYSROOT)
 # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native -Wall -pedantic -DCMAKE_BUILD")
 find_package(PkgConfig REQUIRED)
 set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}")
 option(MUSIC_ENABLE_SINGLE_PRECISION "Enable Single Precision Mode" OFF)
 ########################################################################################################################
 # OpenMP
@ -52,12 +67,44 @@ find_package(Threads REQUIRED)
 if(POLICY CMP0074)
    cmake_policy(SET CMP0074 NEW)
 endif()
-find_package(FFTW3 COMPONENTS SINGLE DOUBLE THREADS)
+if(ENABLE_MPI)
  find_package(FFTW3 COMPONENTS SINGLE DOUBLE LONGDOUBLE OPENMP THREADS MPI)
 else()
  find_package(FFTW3 COMPONENTS SINGLE DOUBLE LONGDOUBLE OPENMP THREADS)
 endif(ENABLE_MPI)
 mark_as_advanced(FFTW3_SINGLE_MPI_LIBRARY FFTW3_SINGLE_OPENMP_LIBRARY FFTW3_SINGLE_SERIAL_LIBRARY FFTW3_SINGLE_THREADS_LIBRARY)
 mark_as_advanced(FFTW3_DOUBLE_MPI_LIBRARY FFTW3_DOUBLE_OPENMP_LIBRARY FFTW3_DOUBLE_SERIAL_LIBRARY FFTW3_DOUBLE_THREADS_LIBRARY)
 mark_as_advanced(FFTW3_LONGDOUBLE_MPI_LIBRARY FFTW3_LONGDOUBLE_OPENMP_LIBRARY FFTW3_LONGDOUBLE_SERIAL_LIBRARY FFTW3_LONGDOUBLE_THREADS_LIBRARY)
 mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_MPI_INCLUDE_DIR)
 mark_as_advanced(pkgcfg_lib_PC_FFTW_fftw3)
 ########################################################################################################################
 # TIRPC, needed only for Tipsy format
 find_package(TIRPC)
 ########################################################################################################################
 # GSL
 find_package(GSL REQUIRED)
 mark_as_advanced(pkgcfg_lib_GSL_gsl pkgcfg_lib_GSL_gslcblas pkgcfg_lib_GSL_m)
 ########################################################################################################################
 # HDF5
 find_package(HDF5)
 if( HDF5_FOUND )
  mark_as_advanced(HDF5_C_LIBRARY_dl HDF5_C_LIBRARY_hdf5 HDF5_C_LIBRARY_m HDF5_C_LIBRARY_pthread HDF5_C_LIBRARY_z HDF5_C_LIBRARY_sz)
 endif()
 ########################################################################################################################
 # floating point precision
 set (
  CODE_PRECISION "DOUBLE"
  CACHE STRING "Floating point type used for internal computations and FFTs"
 )
 set_property (
  CACHE CODE_PRECISION
  PROPERTY STRINGS FLOAT DOUBLE LONGDOUBLE
 )
 ########################################################################################################################
 # Add a custom command that produces version.cc, plus
 # a dummy output that's not actually produced, in order
@ -68,16 +115,6 @@ ADD_CUSTOM_COMMAND(
    COMMAND ${CMAKE_COMMAND} -P
            ${CMAKE_CURRENT_SOURCE_DIR}/version.cmake)
 ########################################################################################################################
 # GSL
 find_package(GSL REQUIRED)
 ########################################################################################################################
 # HDF5
 find_package(HDF5)
 ########################################################################################################################
 # INCLUDES
 include_directories(${PROJECT_SOURCE_DIR}/src)
@ -112,40 +149,38 @@ list (APPEND SOURCES
 # target_include_directories(${PRGNAME} PRIVATE ${PROJECT_SOURCE_DIR}/external/panphasia_ho)
 endif(ENABLE_PANPHASIA)
 # project configuration header
 configure_file(
  ${PROJECT_SOURCE_DIR}/src/cmake_config.hh.in
  ${PROJECT_SOURCE_DIR}/src/cmake_config.hh
 )
 add_executable(${PRGNAME} ${SOURCES} ${PLUGINS})
-set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 11)
+set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 14)
-if(FFTW3_FOUND)
+if(CODE_PRECISION STREQUAL "FLOAT")
  target_compile_options(${PRGNAME} PRIVATE "-DFFTW3")
  if( MUSIC_ENABLE_SINGLE_PRECISION )
    target_compile_options(${PRGNAME} PRIVATE "-DSINGLE_PRECISION")
  if(FFTW3_SINGLE_THREADS_FOUND)
-      target_link_libraries(${PRGNAME} ${FFTW3_SINGLE_THREADS_LIBRARY})
+    target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_SINGLE_THREADS)
-      target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS")
+    target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
    elseif(FFTW3_SINGLE_SERIAL_FOUND)
      target_link_libraries(${PRGNAME} ${FFTW3_SINGLE_SERIAL_LIBRARY})
      message( WARNING "using serial version of FFTW3 -- this will most likely cause a very slow version of MUSIC. Rec: install FFTW3 with thread support")
    else()  
      message( FATAL "chose compilation in single precision, but FFTW3 not found for single precision")
  endif()
-  else(MUSIC_ENABLE_SINGLE_PRECISION)
+  target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_SINGLE_SERIAL) 
 elseif(CODE_PRECISION STREQUAL "DOUBLE")
  if(FFTW3_DOUBLE_THREADS_FOUND) 
-      target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_THREADS_LIBRARY})
+    target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_DOUBLE_THREADS)
-      target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS")
+    target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
-    elseif(FFTW3_DOUBLE_SERIAL_FOUND)
+  endif()
-      target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_SERIAL_LIBRARY})
+  target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_DOUBLE_SERIAL) 
-      message( WARNING "using serial version of FFTW3 -- this will most likely cause a very slow version of MUSIC. Rec: install FFTW3 with thread support")
+elseif(CODE_PRECISION STREQUAL "LONGDOUBLE")
-    else()  
+  if(FFTW3_LONGDOUBLE_THREADS_FOUND) 
-      message( FATAL "chose compilation in double precision, but FFTW3 not found for double precision")
+    target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_LONGDOUBLE_THREADS)
    target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
  endif()
  target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_LONGDOUBLE_SERIAL)
 endif()
  endif(MUSIC_ENABLE_SINGLE_PRECISION)
 endif(FFTW3_FOUND)
 if(HDF5_FOUND)
-  # target_link_libraries(${PRGNAME} ${HDF5_C_LIBRARY_DIRS})
+  target_link_libraries(${PRGNAME} PRIVATE ${HDF5_LIBRARIES})
  target_link_libraries(${PRGNAME} ${HDF5_LIBRARIES})
  target_include_directories(${PRGNAME} PRIVATE ${HDF5_INCLUDE_DIRS})
  target_compile_options(${PRGNAME} PRIVATE "-DHAVE_HDF5")
  target_compile_options(${PRGNAME} PRIVATE "-DH5_USE_16_API")
@ -161,11 +196,5 @@ if(ENABLE_PANPHASIA)
  target_compile_options(${PRGNAME} PRIVATE "-DHAVE_PANPHASIA")
 endif(ENABLE_PANPHASIA)
-target_link_libraries(${PRGNAME} ${FFTW3_LIBRARIES})
+target_link_libraries(${PRGNAME} PRIVATE GSL::gsl)
 target_include_directories(${PRGNAME} PRIVATE ${FFTW3_INCLUDE_DIRS})
 target_link_libraries(${PRGNAME} ${GSL_LIBRARIES})
 target_include_directories(${PRGNAME} PRIVATE ${GSL_INCLUDE_DIR})
 target_link_libraries(${PRGNAME} ${HDF5_LIBRARIES})
 target_include_directories(${PRGNAME} PRIVATE ${HDF5_INCLUDE_DIR})
--- a/FindFFTW3.cmake
+++ b/FindFFTW3.cmake
@ -230,3 +230,14 @@ find_package_handle_standard_args(FFTW3
    VERSION_VAR FFTW3_VERSION_STRING
    HANDLE_COMPONENTS
 )
 if(FFTW3_FOUND)
  foreach(component ${FFTW3_FIND_COMPONENTS})
    if(NOT TARGET FFTW3::FFTW3_${component})
      add_library(FFTW3::FFTW3_${component} UNKNOWN IMPORTED)
      set_target_properties(FFTW3::FFTW3_${component} PROPERTIES 
        IMPORTED_LOCATION "${FFTW3_${component}_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES ${FFTW3_INCLUDE_DIR})
    endif()
  endforeach()
 endif()
--- a/src/TransferFunction.hh
+++ b/src/TransferFunction.hh
@ -130,7 +130,7 @@ protected:
 		double fftnorm = 1.0/N;
-		fftw_complex in[N], out[N];
+		complex_t in[N], out[N];
 		fftw_plan p,ip;
 		//... perform anti-ringing correction from Hamilton (2000)
--- a/src/cmake_config.hh.in
+++ b/src/cmake_config.hh.in
@ -0,0 +1,25 @@
 #pragma once
 #define USE_PRECISION_${CODE_PRECISION}
 #ifdef __cplusplus
 constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
 #if defined(USE_PRECISION_FLOAT)
  constexpr char CMAKE_PRECISION_STR[] = "single";
 #elif defined(USE_PRECISION_DOUBLE)
  constexpr char CMAKE_PRECISION_STR[] = "double";
 #elif defined(USE_PRECISION_LONGDOUBLE)
  constexpr char CMAKE_PRECISION_STR[] = "long double";
 #endif 
 // These variables are autogenerated and compiled
 // into the library by the version.cmake script. do not touch!
 extern "C"
 {
  extern const char *GIT_TAG;
  extern const char *GIT_REV;
  extern const char *GIT_BRANCH;
 }
 #endif // __cplusplus
--- a/src/constraints.cc
+++ b/src/constraints.cc
@ -267,7 +267,7 @@ constraint_set::constraint_set( config_file& cf, transfer_function *ptf )
 }
-void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftw_complex* cw )
+void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, complex_t* cw )
 {
 	double lsub = nx*dx;
 	double dk = 2.0*M_PI/lsub, d3k=dk*dk*dk;
@ -374,7 +374,7 @@ void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t
-void constraint_set::wnoise_constr_corr( double dx, fftw_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 )
+void constraint_set::wnoise_constr_corr( double dx, complex_t* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 )
 {
 	size_t nconstr = cset_.size();
 	size_t nzp=nz/2+1;
--- a/src/constraints.hh
+++ b/src/constraints.hh
@ -7,20 +7,17 @@
 Copyright (C) 2010  Oliver Hahn
 */
-
+#pragma once
 #ifndef __CONSTRAINTS_HH
 #define __CONSTRAINTS_HH
 #include <vector>
 #include <complex>
 #include <gsl/gsl_linalg.h>
-#include "general.hh"
+#include <general.hh>
-#include "config_file.hh"
+#include <config_file.hh>
-#include "transfer_function.hh"
+#include <transfer_function.hh>
-#include "cosmology.hh"
+#include <cosmology.hh>
 //! matrix class serving as a gsl wrapper
 class matrix
@ -57,10 +54,14 @@ public:
 	}
 	double &operator()(size_t i, size_t j)
-	{	return *gsl_matrix_ptr( m_, i, j );	}
+	{
 		return *gsl_matrix_ptr(m_, i, j);
 	}
 	const double &operator()(size_t i, size_t j) const
-	{	return *gsl_matrix_const_ptr( m_, i, j );	}
+	{
 		return *gsl_matrix_const_ptr(m_, i, j);
 	}
 	matrix &operator=(const matrix &o)
 	{
@ -73,7 +74,6 @@ public:
 		return *this;
 	}
 	matrix &invert()
 	{
 		if (M_ != N_)
@ -94,17 +94,20 @@ public:
 	}
 };
 //! class to impose constraints on the white noise field (van de Weygaert & Bertschinger 1996)
 class constraint_set
 {
 public:
-	enum constr_type{ halo, peak };
+	enum constr_type
 	{
 		halo,
 		peak
 	};
 protected:
-	
+	struct constraint
-	struct constraint{
+	{
 		constr_type type;
 		double x, y, z;
 		double gx, gy, gz;
@ -121,7 +124,6 @@ protected:
 	double dplus0_;
 	unsigned constr_level_;
 	inline std::complex<double> eval_constr(size_t icon, double kx, double ky, double kz)
 	{
 		double re, im, kdotx, k2;
@ -135,32 +137,16 @@ protected:
 		return std::complex<double>(re, im);
 	}
 #if defined(FFTW3) && defined(SINGLE_PRECISION)
 	//! apply constraints to the white noise
-	void wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftwf_complex* cw );
+	void wnoise_constr_corr(double dx, size_t nx, size_t ny, size_t nz, std::vector<double> &g0, matrix &cinv, complex_t *cw);
 	//! measure sigma for each constraint in the unconstrained noise
-	void wnoise_constr_corr( double dx, fftwf_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 );
+	void wnoise_constr_corr(double dx, complex_t *cw, size_t nx, size_t ny, size_t nz, std::vector<double> &g0);
 #else
 	//! apply constraints to the white noise
 	void wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftw_complex* cw );
 	//! measure sigma for each constraint in the unconstrained noise
 	void wnoise_constr_corr( double dx, fftw_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 );
 #endif
 	//! compute the covariance between the constraints
 	void icov_constr(double dx, size_t nx, size_t ny, size_t nz, matrix &cij);
 public:
 	//! constructor
 	constraint_set(config_file &cf, transfer_function *ptf);
@ -171,7 +157,6 @@ public:
 		delete pcosmo_;
 	}
 	template <typename rng>
 	void apply(unsigned ilevel, int x0[], int lx[], rng *wnoise)
 	{
@ -194,7 +179,6 @@ public:
 				music::wlog.Print("Constraint %d appears to be too large scale", i);
 		}
 		std::vector<double> g0;
 		//		unsigned levelmax = pcf_->get_value<unsigned>("setup","levelmax");
@ -203,31 +187,17 @@ public:
 		bool bperiodic = ilevel == levelmin;
 		double dx = pcf_->get_value<double>("setup", "boxlength") / (1 << ilevel);
 		music::ilog.Print("Computing constrained realization...");
 		if (bperiodic)
 		{
 			//... we are operating on the periodic coarse grid
 			size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz + 2;
-			fftw_real * w = new fftw_real[nx*ny*nzp];
+			real_t *w = new real_t[nx * ny * nzp];
-			
+			complex_t *cw = reinterpret_cast<complex_t *>(w);
-#ifdef FFTW3
+			fftw_plan_t p = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, w, cw, FFTW_ESTIMATE),
-	#ifdef SINGLE_PRECISION
+									ip = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cw, w, FFTW_ESTIMATE);
 			fftwf_complex * cw = reinterpret_cast<fftwf_complex*> (w);
 			fftwf_plan	p  = fftwf_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
 						ip = fftwf_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
 	#else
 			fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
 			fftw_plan	p  = fftw_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
 						ip = fftw_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
 	#endif
 #else
 			fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
 			rfftwnd_plan p	= rfftw3d_create_plan( nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
 						 ip = rfftw3d_create_plan( nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
 #endif
 			double fftnorm = 1.0 / sqrt(nx * ny * nz);
@ -240,40 +210,15 @@ public:
 						w[q] = (*wnoise)((x0[0] + i) % nx, (x0[1] + j) % ny, (x0[2] + k) % nz) * fftnorm;
 					}
-#ifdef FFTW3
+			FFTW_API(execute)(p);
 	#ifdef SINGLE_PRECISION
 			fftwf_execute( p );
 	#else
 			fftw_execute( p );
 	#endif
 #else
 #ifndef SINGLETHREAD_FFTW		
 			rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), p, w, NULL );
 #else
 			rfftwnd_one_real_to_complex( p, w, NULL );
 #endif
 #endif
 			wnoise_constr_corr(dx, cw, nx, ny, nz, g0);
 			matrix c(2, 2);
 			icov_constr(dx, nx, ny, nz, c);
 			wnoise_constr_corr(dx, nx, ny, nz, g0, c, cw);
-#ifdef FFTW3
+			FFTW_API(execute)(ip);
 	#ifdef SINGLE_PRECISION
 			fftwf_execute( ip );
 	#else
 			fftw_execute( ip );
 	#endif
 #else
 #ifndef SINGLETHREAD_FFTW		
 			rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), ip, cw, NULL );
 #else
 			rfftwnd_one_complex_to_real( ip, cw, NULL );
 #endif
 #endif
 #pragma omp parallel for
 			for (int i = 0; i < (int)nx; i++)
@ -286,42 +231,22 @@ public:
 			music::ilog.Print("Applied constraints to level %d.", ilevel);
 			delete[] w;
-			
+			FFTW_API(destroy_plan)(p);
-#ifdef FFTW3
+			FFTW_API(destroy_plan)(ip);
-	#ifdef SINGLE_PRECISION
+		}
-			fftwf_destroy_plan(p);
+		else
-	#else
+		{
 			fftw_destroy_plan(p);
 	#endif
 #else
 			fftwnd_destroy_plan(p);
 #endif
 		}else{
 			//... we are operating on a refinement grid, not necessarily the finest
 			size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz + 2;
-			fftw_real * w = new fftw_real[nx*ny*nzp];
+			real_t *w = new real_t[nx * ny * nzp];
-			
+			complex_t *cw = reinterpret_cast<complex_t *>(w);
-#ifdef FFTW3
+			fftw_plan_t p = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, w, cw, FFTW_ESTIMATE),
-	#ifdef SINGLE_PRECISION
+									ip = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cw, w, FFTW_ESTIMATE);
 			fftwf_complex * cw = reinterpret_cast<fftwf_complex*> (w);
 			fftwf_plan	p  = fftwf_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
 						ip = fftwf_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
 	#else
 			fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
 			fftw_plan	p  = fftw_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
 						ip = fftw_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
 	#endif
 #else
 			fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
 			rfftwnd_plan p	= rfftw3d_create_plan( nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
 			ip = rfftw3d_create_plan( nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
 #endif
 			double fftnorm = 1.0 / sqrt(nx * ny * nz);
@ -338,7 +263,6 @@ public:
 							w[q] = (*wnoise)((x0[0] + i), (x0[1] + j), (x0[2] + k)) * fftnorm;
 						else
 							w[q] = 0.0;
 					}
 			int nlvl05 = 1 << (ilevel - 1);
@ -351,40 +275,16 @@ public:
 				cset_[i].gz -= zs;
 			}
-#ifdef FFTW3
+			FFTW_API(execute)(p);
-	#ifdef SINGLE_PRECISION
+
 			fftwf_execute( p );
 	#else
 			fftw_execute( p );
 	#endif
 #else
 #ifndef SINGLETHREAD_FFTW		
 			rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), p, w, NULL );
 #else
 			rfftwnd_one_real_to_complex( p, w, NULL );
 #endif
 #endif
 			wnoise_constr_corr(dx, cw, nx, ny, nz, g0);
 			matrix c(2, 2);
 			icov_constr(dx, nx, ny, nz, c);
 			wnoise_constr_corr(dx, nx, ny, nz, g0, c, cw);
-#ifdef FFTW3
+			FFTW_API(execute)(ip);
 	#ifdef SINGLE_PRECISION
 			fftwf_execute( ip );
 	#else
 			fftw_execute( ip );
 	#endif
 #else
 #ifndef SINGLETHREAD_FFTW		
 			rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), ip, cw, NULL );
 #else
 			rfftwnd_one_complex_to_real( ip, cw, NULL );
 #endif
 #endif
 #pragma omp parallel for
 			for (int i = 0; i < (int)nx; i++)
@ -396,27 +296,12 @@ public:
 							(*wnoise)((x0[0] + i), (x0[1] + j), (x0[2] + k)) = w[q] * fftnorm;
 					}
 			music::ilog.Print("Applied constraints to level %d.", ilevel);
 			delete[] w;
-			
+			FFTW_API(destroy_plan)(p);
-#ifdef FFTW3
+			FFTW_API(destroy_plan)(ip);
 	#ifdef SINGLE_PRECISION
 			fftwf_destroy_plan(p);
 	#else
 			fftw_destroy_plan(p);
 	#endif
 #else
 			fftwnd_destroy_plan(p);
 #endif
 		}
 	}
 };
 #endif // __CONSTRAINTS_HH
--- a/src/convolution_kernel.cc
+++ b/src/convolution_kernel.cc
@ -7,13 +7,9 @@
 */
-#include "general.hh"
+#include <general.hh>
-#include "densities.hh"
+#include <densities.hh>
-#include "convolution_kernel.hh"
+#include <convolution_kernel.hh>
 #if defined(FFTW3) && defined(SINGLE_PRECISION)
 typedef fftw_complex fftwf_complex;
 #endif
 namespace convolution
 {
@ -25,7 +21,6 @@ get_kernel_map()
 	return kernel_map;
 }
 template <typename real_t>
 void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
 {
 	//return;
@ -34,49 +29,26 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
 	double fftnormp = 1.0/sqrt((double)cparam_.nx * (double)cparam_.ny * (double)cparam_.nz);
 	double fftnorm = pow(2.0 * M_PI, 1.5) / sqrt(cparam_.lx * cparam_.ly * cparam_.lz) * fftnormp;
-	fftw_complex *cdata;
+	complex_t *cdata;
-	[[maybe_unused]] fftw_complex *ckernel;
+	[[maybe_unused]] complex_t *ckernel;
-	fftw_real *data;
+	real_t *data;
-	data = reinterpret_cast<fftw_real *>(pd);
+	data = reinterpret_cast<real_t *>(pd);
-	cdata = reinterpret_cast<fftw_complex *>(data);
+	cdata = reinterpret_cast<complex_t *>(data);
-	ckernel = reinterpret_cast<fftw_complex *>(pk->get_ptr());
+	ckernel = reinterpret_cast<complex_t *>(pk->get_ptr());
 	std::cout << "   - Performing density convolution... ("
 			  << cparam_.nx << ", " << cparam_.ny << ", " << cparam_.nz << ")\n";
 	music::ulog.Print("Performing kernel convolution on (%5d,%5d,%5d) grid", cparam_.nx, cparam_.ny, cparam_.nz);
 	music::ulog.Print("Performing forward FFT...");
 #ifdef FFTW3
 #ifdef SINGLE_PRECISION
 	fftwf_plan plan, iplan;
 	plan = fftwf_plan_dft_r2c_3d(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
 	iplan = fftwf_plan_dft_c2r_3d(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
-	fftwf_execute(plan);
+	fftw_plan_t plan, iplan;
-#else
+	plan = FFTW_API(plan_dft_r2c_3d)(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
-	fftw_plan plan, iplan;
+	iplan = FFTW_API(plan_dft_c2r_3d)(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
 	plan = fftw_plan_dft_r2c_3d(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
 	iplan = fftw_plan_dft_c2r_3d(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
-	fftw_execute(plan);
+	FFTW_API(execute)(plan);
 #endif
 #else
 	rfftwnd_plan iplan, plan;
 	plan = rfftw3d_create_plan(cparam_.nx, cparam_.ny, cparam_.nz,
 							   FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
 	iplan = rfftw3d_create_plan(cparam_.nx, cparam_.ny, cparam_.nz,
 								FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
 #else
 	rfftwnd_one_real_to_complex(plan, data, NULL);
 #endif
 #endif
 	//..... need a phase shift for baryons for SPH
 	double dstag = 0.0;
@ -163,27 +135,9 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
 	music::ulog.Print("Performing backward FFT...");
-#ifdef FFTW3
+	FFTW_API(execute)(iplan);
-#ifdef SINGLE_PRECISION
+	FFTW_API(destroy_plan)(plan);
-	fftwf_execute(iplan);
+	FFTW_API(destroy_plan)(iplan);
 	fftwf_destroy_plan(plan);
 	fftwf_destroy_plan(iplan);
 #else
 	fftw_execute(iplan);
 	fftw_destroy_plan(plan);
 	fftw_destroy_plan(iplan);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
 #else
 	rfftwnd_one_complex_to_real(iplan, cdata, NULL);
 #endif
 	rfftwnd_destroy_plan(plan);
 	rfftwnd_destroy_plan(iplan);
 #endif
 	// set the DC mode here to avoid a possible truncation error in single precision
 	{
@ -196,14 +150,12 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
 	}
 }
-template void perform<double>(kernel *pk, void *pd, bool shift, bool fix, bool flip);
+void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip);
 template void perform<float>(kernel *pk, void *pd, bool shift, bool fix, bool flip);
 /*****************************************************************************************/
 /***    SPECIFIC KERNEL IMPLEMENTATIONS      *********************************************/
 /*****************************************************************************************/
 template <typename real_t>
 class kernel_k : public kernel
 {
 protected:
@ -298,8 +250,4 @@ public:
 /**************************************************************************************/
 /**************************************************************************************/
-namespace
+convolution::kernel_creator_concrete<convolution::kernel_k> creator_kd("tf_kernel_k");
 {
 convolution::kernel_creator_concrete<convolution::kernel_k<double>> creator_kd("tf_kernel_k_double");
 convolution::kernel_creator_concrete<convolution::kernel_k<float>> creator_kf("tf_kernel_k_float");
 } // namespace
--- a/src/convolution_kernel.hh
+++ b/src/convolution_kernel.hh
@ -112,7 +112,6 @@ struct kernel_creator_concrete : public kernel_creator
 };
 //! actual implementation of the FFT convolution (independent of the actual kernel)
 template <typename real_t>
 void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip);
 } //namespace convolution
--- a/src/cosmology.cc
+++ b/src/cosmology.cc
@ -16,11 +16,6 @@
 #define ACC(i, j, k) ((*u.get_grid((ilevel)))((i), (j), (k)))
 #define SQR(x) ((x) * (x))
 #if defined(FFTW3) && defined(SINGLE_PRECISION)
 #define fftw_complex fftwf_complex
 #endif
 void compute_LLA_density(const grid_hierarchy &u, grid_hierarchy &fnew, unsigned order)
 {
 	fnew = u;
@ -30,7 +25,6 @@ void compute_LLA_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigne
 		double h = pow(2.0, ilevel), h2 = h * h, h2_4 = 0.25 * h2;
 		meshvar_bnd *pvar = fnew.get_grid(ilevel);
 		if (order == 2)
 		{
 #pragma omp parallel for // reduction(+:sum_corr,sum,sum2)
@ -52,15 +46,9 @@ void compute_LLA_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigne
 						D[1][1] += 1.0;
 						D[2][2] += 1.0;
-						double det = D[0][0]*D[1][1]*D[2][2]
+						double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
 						-	D[0][0]*D[1][2]*D[2][1]
 						-   D[1][0]*D[0][1]*D[2][2]
 						+	D[1][0]*D[0][2]*D[1][2]
 						+	D[2][0]*D[0][1]*D[1][2]
 						-	D[2][0]*D[0][2]*D[1][1];
 						(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
 					}
 		}
 		else if (order == 4)
@ -80,20 +68,13 @@ void compute_LLA_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigne
 						D[0][2] = D[2][0] = (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) * h2_4;
 						D[1][2] = D[2][1] = (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) * h2_4;
 						D[0][0] += 1.0;
 						D[1][1] += 1.0;
 						D[2][2] += 1.0;
-						double det = D[0][0]*D[1][1]*D[2][2]
+						double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
 						-	D[0][0]*D[1][2]*D[2][1]
 						-   D[1][0]*D[0][1]*D[2][2]
 						+	D[1][0]*D[0][2]*D[1][2]
 						+	D[2][0]*D[0][1]*D[1][2]
 						-	D[2][0]*D[0][2]*D[1][1];
 						(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
 					}
 		}
 		else if (order == 6)
@ -112,42 +93,24 @@ void compute_LLA_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigne
 						D[2][2] = (2. * ACC(ix, iy, iz - 3) - 27. * ACC(ix, iy, iz - 2) + 270. * ACC(ix, iy, iz - 1) - 490.0 * ACC(ix, iy, iz) + 270. * ACC(ix, iy, iz + 1) - 27. * ACC(ix, iy, iz + 2) + 2. * ACC(ix, iy, iz + 3)) * h2;
 						//.. this is actually 8th order accurate
-						D[0][1] = D[1][0] = (64.*(ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))
+						D[0][1] = D[1][0] = (64. * (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) - 8. * (ACC(ix - 2, iy - 1, iz) - ACC(ix + 2, iy - 1, iz) - ACC(ix - 2, iy + 1, iz) + ACC(ix + 2, iy + 1, iz) + ACC(ix - 1, iy - 2, iz) - ACC(ix - 1, iy + 2, iz) - ACC(ix + 1, iy - 2, iz) + ACC(ix + 1, iy + 2, iz)) + 1. * (ACC(ix - 2, iy - 2, iz) - ACC(ix - 2, iy + 2, iz) - ACC(ix + 2, iy - 2, iz) + ACC(ix + 2, iy + 2, iz))) * h2_4;
-											 -8.*(ACC(ix-2,iy-1,iz)-ACC(ix+2,iy-1,iz)-ACC(ix-2,iy+1,iz)+ACC(ix+2,iy+1,iz)
+						D[0][2] = D[2][0] = (64. * (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) - 8. * (ACC(ix - 2, iy, iz - 1) - ACC(ix + 2, iy, iz - 1) - ACC(ix - 2, iy, iz + 1) + ACC(ix + 2, iy, iz + 1) + ACC(ix - 1, iy, iz - 2) - ACC(ix - 1, iy, iz + 2) - ACC(ix + 1, iy, iz - 2) + ACC(ix + 1, iy, iz + 2)) + 1. * (ACC(ix - 2, iy, iz - 2) - ACC(ix - 2, iy, iz + 2) - ACC(ix + 2, iy, iz - 2) + ACC(ix + 2, iy, iz + 2))) * h2_4;
-												+ ACC(ix-1,iy-2,iz)-ACC(ix-1,iy+2,iz)-ACC(ix+1,iy-2,iz)+ACC(ix+1,iy+2,iz))
+						D[1][2] = D[2][1] = (64. * (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) - 8. * (ACC(ix, iy - 2, iz - 1) - ACC(ix, iy + 2, iz - 1) - ACC(ix, iy - 2, iz + 1) + ACC(ix, iy + 2, iz + 1) + ACC(ix, iy - 1, iz - 2) - ACC(ix, iy - 1, iz + 2) - ACC(ix, iy + 1, iz - 2) + ACC(ix, iy + 1, iz + 2)) + 1. * (ACC(ix, iy - 2, iz - 2) - ACC(ix, iy - 2, iz + 2) - ACC(ix, iy + 2, iz - 2) + ACC(ix, iy + 2, iz + 2))) * h2_4;
 											 +1.*(ACC(ix-2,iy-2,iz)-ACC(ix-2,iy+2,iz)-ACC(ix+2,iy-2,iz)+ACC(ix+2,iy+2,iz)))*h2_4;
 						D[0][2] = D[2][0] = (64.*(ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))
 											 -8.*(ACC(ix-2,iy,iz-1)-ACC(ix+2,iy,iz-1)-ACC(ix-2,iy,iz+1)+ACC(ix+2,iy,iz+1)
 												+ ACC(ix-1,iy,iz-2)-ACC(ix-1,iy,iz+2)-ACC(ix+1,iy,iz-2)+ACC(ix+1,iy,iz+2))
 											 +1.*(ACC(ix-2,iy,iz-2)-ACC(ix-2,iy,iz+2)-ACC(ix+2,iy,iz-2)+ACC(ix+2,iy,iz+2)))*h2_4;
 						D[1][2] = D[2][1] = (64.*(ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))
 											 -8.*(ACC(ix,iy-2,iz-1)-ACC(ix,iy+2,iz-1)-ACC(ix,iy-2,iz+1)+ACC(ix,iy+2,iz+1)
 												+ ACC(ix,iy-1,iz-2)-ACC(ix,iy-1,iz+2)-ACC(ix,iy+1,iz-2)+ACC(ix,iy+1,iz+2))
 											 +1.*(ACC(ix,iy-2,iz-2)-ACC(ix,iy-2,iz+2)-ACC(ix,iy+2,iz-2)+ACC(ix,iy+2,iz+2)))*h2_4;
 						D[0][0] += 1.0;
 						D[1][1] += 1.0;
 						D[2][2] += 1.0;
-						double det = D[0][0]*D[1][1]*D[2][2]
+						double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
 						-	D[0][0]*D[1][2]*D[2][1]
 						-   D[1][0]*D[0][1]*D[2][2]
 						+	D[1][0]*D[0][2]*D[1][2]
 						+	D[2][0]*D[0][1]*D[1][2]
 						-	D[2][0]*D[0][2]*D[1][1];
 						(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
 					}
-			
+		}
-		}else
+		else
 			throw std::runtime_error("compute_LLA_density : invalid operator order specified");
 	}
 }
 void compute_Lu_density(const grid_hierarchy &u, grid_hierarchy &fnew, unsigned order)
 {
 	fnew = u;
@ -169,13 +132,10 @@ void compute_Lu_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigned
 					D[2][2] = 1.0 + (ACC(ix, iy, iz - 1) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy, iz + 1)) * h2;
 					(*pvar)(ix, iy, iz) = -(D[0][0] + D[1][1] + D[2][2] - 3.0);
 				}
 	}
 }
 void compute_2LPT_source_FFT(config_file &cf_, const grid_hierarchy &u, grid_hierarchy &fnew)
 {
 	if (u.levelmin() != u.levelmax())
@ -189,18 +149,24 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 	nzp = 2 * (nz / 2 + 1);
 	//... copy data ..................................................
-	fftw_real *data = new fftw_real[nx*ny*nzp];
+	real_t *data = new real_t[nx * ny * nzp];
-	fftw_complex *cdata = reinterpret_cast<fftw_complex*> (data);
+	complex_t *cdata = reinterpret_cast<complex_t *>(data);
-	fftw_complex	*cdata_11, *cdata_12, *cdata_13, *cdata_22, *cdata_23, *cdata_33;
+	complex_t *cdata_11, *cdata_12, *cdata_13, *cdata_22, *cdata_23, *cdata_33;
-	fftw_real		*data_11, *data_12, *data_13, *data_22, *data_23, *data_33;
+	real_t *data_11, *data_12, *data_13, *data_22, *data_23, *data_33;
-	data_11 = new fftw_real[nx*ny*nzp]; cdata_11 = reinterpret_cast<fftw_complex*> (data_11);
+	data_11 = new real_t[nx * ny * nzp];
-	data_12 = new fftw_real[nx*ny*nzp]; cdata_12 = reinterpret_cast<fftw_complex*> (data_12);
+	cdata_11 = reinterpret_cast<complex_t *>(data_11);
-	data_13 = new fftw_real[nx*ny*nzp]; cdata_13 = reinterpret_cast<fftw_complex*> (data_13);
+	data_12 = new real_t[nx * ny * nzp];
-	data_22 = new fftw_real[nx*ny*nzp]; cdata_22 = reinterpret_cast<fftw_complex*> (data_22);
+	cdata_12 = reinterpret_cast<complex_t *>(data_12);
-	data_23 = new fftw_real[nx*ny*nzp]; cdata_23 = reinterpret_cast<fftw_complex*> (data_23);
+	data_13 = new real_t[nx * ny * nzp];
-	data_33 = new fftw_real[nx*ny*nzp]; cdata_33 = reinterpret_cast<fftw_complex*> (data_33);
+	cdata_13 = reinterpret_cast<complex_t *>(data_13);
 	data_22 = new real_t[nx * ny * nzp];
 	cdata_22 = reinterpret_cast<complex_t *>(data_22);
 	data_23 = new real_t[nx * ny * nzp];
 	cdata_23 = reinterpret_cast<complex_t *>(data_23);
 	data_33 = new real_t[nx * ny * nzp];
 	cdata_33 = reinterpret_cast<complex_t *>(data_33);
 #pragma omp parallel for
 	for (int i = 0; i < (int)nx; ++i)
@ -212,36 +178,19 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 			}
 	//... perform FFT and Poisson solve................................
 #ifdef FFTW3
-	#ifdef SINGLE_PRECISION
+	fftw_plan_t
-	fftwf_plan
+			plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
-		plan  = fftwf_plan_dft_r2c_3d(nx,ny,nz, data, cdata, FFTW_ESTIMATE),
+			iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE),
-		iplan = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata, data, FFTW_ESTIMATE),
+			ip11 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_11, data_11, FFTW_ESTIMATE),
-		ip11  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_11, data_11, FFTW_ESTIMATE),
+			ip12 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_12, data_12, FFTW_ESTIMATE),
-		ip12  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_12, data_12, FFTW_ESTIMATE),
+			ip13 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_13, data_13, FFTW_ESTIMATE),
-		ip13  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_13, data_13, FFTW_ESTIMATE),
+			ip22 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_22, data_22, FFTW_ESTIMATE),
-		ip22  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_22, data_22, FFTW_ESTIMATE),
+			ip23 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_23, data_23, FFTW_ESTIMATE),
-		ip23  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_23, data_23, FFTW_ESTIMATE),
+			ip33 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_33, data_33, FFTW_ESTIMATE);
 		ip33  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_33, data_33, FFTW_ESTIMATE);
-	fftwf_execute(plan);
+	FFTW_API(execute)
-	
+	(plan);
 	#else
 	fftw_plan
 		plan  = fftw_plan_dft_r2c_3d(nx,ny,nz, data, cdata, FFTW_ESTIMATE),
 		iplan = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata, data, FFTW_ESTIMATE),
 		ip11  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_11, data_11, FFTW_ESTIMATE),
 		ip12  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_12, data_12, FFTW_ESTIMATE),
 		ip13  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_13, data_13, FFTW_ESTIMATE),
 		ip22  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_22, data_22, FFTW_ESTIMATE),
 		ip23  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_23, data_23, FFTW_ESTIMATE),
 		ip33  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_33, data_33, FFTW_ESTIMATE);
 	fftw_execute(plan);
 	#endif
 	double kfac = 2.0 * M_PI;
 	double norm = 1.0 / ((double)(nx * ny * nz));
@ -251,8 +200,12 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 		for (size_t j = 0; j < ny; ++j)
 			for (size_t l = 0; l < nz / 2 + 1; ++l)
 			{
-				int ii = i; if(ii>(int)nx/2) ii-=nx;
+				int ii = i;
-				int jj = (int)j; if(jj>(int)ny/2) jj-=ny;
+				if (ii > (int)nx / 2)
 					ii -= nx;
 				int jj = (int)j;
 				if (jj > (int)ny / 2)
 					jj -= ny;
 				double ki = (double)ii;
 				double kj = (double)jj;
 				double kk = (double)l;
@ -284,7 +237,6 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 				cdata_33[idx][0] = -k[2] * k[2] * cdata[idx][0] * norm;
 				cdata_33[idx][1] = -k[2] * k[2] * cdata[idx][1] * norm;
 				if (i == (int)nx / 2 || j == ny / 2 || l == nz / 2)
 				{
 					cdata_11[idx][0] = 0.0;
@ -305,7 +257,6 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 					cdata_33[idx][0] = 0.0;
 					cdata_33[idx][1] = 0.0;
 				}
 			}
 	delete[] data;
@ -316,157 +267,21 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 	 cdata_23[0][0]	= 0.0; cdata_23[0][1]	= 0.0;
 	 cdata_33[0][0]	= 0.0; cdata_33[0][1]	= 0.0;*/
 	FFTW_API(execute)(ip11);
 	FFTW_API(execute)(ip12);
 	FFTW_API(execute)(ip13);
 	FFTW_API(execute)(ip22);
 	FFTW_API(execute)(ip23);
 	FFTW_API(execute)(ip33);
-#ifdef SINGLE_PRECISION
+	FFTW_API(destroy_plan)(plan);
-	fftwf_execute(ip11);
+	FFTW_API(destroy_plan)(iplan);
-	fftwf_execute(ip12);
+	FFTW_API(destroy_plan)(ip11);
-	fftwf_execute(ip13);
+	FFTW_API(destroy_plan)(ip12);
-	fftwf_execute(ip22);
+	FFTW_API(destroy_plan)(ip13);
-	fftwf_execute(ip23);
+	FFTW_API(destroy_plan)(ip22);
-	fftwf_execute(ip33);
+	FFTW_API(destroy_plan)(ip23);
-	
+	FFTW_API(destroy_plan)(ip33);
 	fftwf_destroy_plan(plan);
 	fftwf_destroy_plan(iplan);
 	fftwf_destroy_plan(ip11);
 	fftwf_destroy_plan(ip12);
 	fftwf_destroy_plan(ip13);
 	fftwf_destroy_plan(ip22);
 	fftwf_destroy_plan(ip23);
 	fftwf_destroy_plan(ip33);
 #else
 	fftw_execute(ip11);
 	fftw_execute(ip12);
 	fftw_execute(ip13);
 	fftw_execute(ip22);
 	fftw_execute(ip23);
 	fftw_execute(ip33);
 	fftw_destroy_plan(plan);
 	fftw_destroy_plan(iplan);
 	fftw_destroy_plan(ip11);
 	fftw_destroy_plan(ip12);
 	fftw_destroy_plan(ip13);
 	fftw_destroy_plan(ip22);
 	fftw_destroy_plan(ip23);
 	fftw_destroy_plan(ip33);
 #endif
 //#endif
 #else
 	rfftwnd_plan 
 		plan = rfftw3d_create_plan( nx,ny,nz,
 								   FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
 		iplan = rfftw3d_create_plan( nx,ny,nz,
 									FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
 	#ifndef SINGLETHREAD_FFTW		
 	rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), plan, data, NULL );
 	#else
 	rfftwnd_one_real_to_complex( plan, data, NULL );
 	#endif
 //#endif
 	//double fac = -1.0/(nx*ny*nz);
 	double kfac = 2.0*M_PI;
 	double norm = 1.0/((double)(nx*ny*nz));
 	#pragma omp parallel for
 	for( int i=0; i<(int)nx; ++i )
 		for( size_t j=0; j<ny; ++j )	
 			for( size_t l=0; l<nz/2+1; ++l )
 			{
 				int ii = (int)i; if(ii>(int)(nx/2)) ii-=(int)nx;
 				int jj = (int)j; if(jj>(int)(ny/2)) jj-=(int)ny;
 				double ki = (double)ii;
 				double kj = (double)jj;
 				double kk = (double)l;
 				double k[3];
 				k[0] = (double)ki * kfac;
 				k[1] = (double)kj * kfac;
 				k[2] = (double)kk * kfac;
 				size_t idx = ((size_t)i*ny+j)*nzp/2+l;
 				//double re = cdata[idx].re;
 				//double im = cdata[idx].im;
 				cdata_11[idx].re = -k[0]*k[0] * cdata[idx].re * norm;
 				cdata_11[idx].im = -k[0]*k[0] * cdata[idx].im * norm;
 				cdata_12[idx].re = -k[0]*k[1] * cdata[idx].re * norm;
 				cdata_12[idx].im = -k[0]*k[1] * cdata[idx].im * norm;
 				cdata_13[idx].re = -k[0]*k[2] * cdata[idx].re * norm;
 				cdata_13[idx].im = -k[0]*k[2] * cdata[idx].im * norm;
 				cdata_22[idx].re = -k[1]*k[1] * cdata[idx].re * norm;
 				cdata_22[idx].im = -k[1]*k[1] * cdata[idx].im * norm;
 				cdata_23[idx].re = -k[1]*k[2] * cdata[idx].re * norm;
 				cdata_23[idx].im = -k[1]*k[2] * cdata[idx].im * norm;
 				cdata_33[idx].re = -k[2]*k[2] * cdata[idx].re * norm;
 				cdata_33[idx].im = -k[2]*k[2] * cdata[idx].im * norm;
 				if( i==(int)(nx/2)||j==ny/2||l==nz/2)
 				{
 					cdata_11[idx].re = 0.0;
 					cdata_11[idx].im = 0.0;
 					cdata_12[idx].re = 0.0;
 					cdata_12[idx].im = 0.0;
 					cdata_13[idx].re = 0.0;
 					cdata_13[idx].im = 0.0;
 					cdata_22[idx].re = 0.0;
 					cdata_22[idx].im = 0.0;
 					cdata_23[idx].re = 0.0;
 					cdata_23[idx].im = 0.0;
 					cdata_33[idx].re = 0.0;
 					cdata_33[idx].im = 0.0;
 				}
 			}
 	delete[] data;
 	/*cdata_11[0].re	= 0.0; cdata_11[0].im	= 0.0;
 	cdata_12[0].re	= 0.0; cdata_12[0].im	= 0.0;
 	cdata_13[0].re	= 0.0; cdata_13[0].im	= 0.0;
 	cdata_22[0].re	= 0.0; cdata_22[0].im	= 0.0;
 	cdata_23[0].re	= 0.0; cdata_23[0].im	= 0.0;
 	cdata_33[0].re	= 0.0; cdata_33[0].im	= 0.0;*/
 #ifndef SINGLETHREAD_FFTW		
 	//rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata, NULL );
 	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_11, NULL );
 	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_12, NULL );
 	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_13, NULL );
 	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_22, NULL );
 	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_23, NULL );
 	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_33, NULL );
 #else
 	//rfftwnd_one_complex_to_real( iplan, cdata, NULL );
 	rfftwnd_one_complex_to_real(iplan, cdata_11, NULL );
 	rfftwnd_one_complex_to_real(iplan, cdata_12, NULL );
 	rfftwnd_one_complex_to_real(iplan, cdata_13, NULL );
 	rfftwnd_one_complex_to_real(iplan, cdata_22, NULL );
 	rfftwnd_one_complex_to_real(iplan, cdata_23, NULL );
 	rfftwnd_one_complex_to_real(iplan, cdata_33, NULL );
 #endif
 	rfftwnd_destroy_plan(plan);
 	rfftwnd_destroy_plan(iplan);
 #endif
 //... copy data ..........................................
 #pragma omp parallel for
@ -480,7 +295,6 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 																									 (data_22[ii] * data_33[ii] - data_23[ii] * data_23[ii]));
 				//(*fnew.get_grid(u.levelmax()))(i,j,k) =
 			}
 	// delete[] data;
@ -516,15 +330,11 @@ void compute_2LPT_source( const grid_hierarchy& u, grid_hierarchy& fnew, unsigne
 						D[1][1] = (ACC(ix, iy - 2, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy + 2, iz)) * h2_4;
 						D[2][2] = (ACC(ix, iy, iz - 2) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy, iz + 2)) * h2_4;
 						D[0][1] = D[1][0] = (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) * h2_4;
 						D[0][2] = D[2][0] = (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) * h2_4;
 						D[1][2] = D[2][1] = (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) * h2_4;
-				(*pvar)(ix,iy,iz) =  ( D[0][0]*D[1][1] - D[0][1]*D[0][1]
+						(*pvar)(ix, iy, iz) = (D[0][0] * D[1][1] - D[0][1] * D[0][1] + D[0][0] * D[2][2] - D[0][2] * D[0][2] + D[1][1] * D[2][2] - D[1][2] * D[1][2]);
 						       + D[0][0]*D[2][2] - D[0][2]*D[0][2]
 						       + D[1][1]*D[2][2] - D[1][2]*D[1][2] );
 					}
 		}
 		else if (order == 4 || order == 6)
@ -539,50 +349,21 @@ void compute_2LPT_source( const grid_hierarchy& u, grid_hierarchy& fnew, unsigne
 						double D[3][3];
-				D[0][0] = ((ACC(ix-4,iy,iz)+ACC(ix+4,iy,iz))
+						D[0][0] = ((ACC(ix - 4, iy, iz) + ACC(ix + 4, iy, iz)) - 16. * (ACC(ix - 3, iy, iz) + ACC(ix + 3, iy, iz)) + 64. * (ACC(ix - 2, iy, iz) + ACC(ix + 2, iy, iz)) + 16. * (ACC(ix - 1, iy, iz) + ACC(ix + 1, iy, iz)) - 130. * ACC(ix, iy, iz)) * h2_144;
 					   - 16. * (ACC(ix-3,iy,iz)+ACC(ix+3,iy,iz))
 					   + 64. * (ACC(ix-2,iy,iz)+ACC(ix+2,iy,iz))
 					   + 16. * (ACC(ix-1,iy,iz)+ACC(ix+1,iy,iz))
 					   - 130.*  ACC(ix,iy,iz) ) * h2_144;
-				D[1][1] = ((ACC(ix,iy-4,iz)+ACC(ix,iy+4,iz))
+						D[1][1] = ((ACC(ix, iy - 4, iz) + ACC(ix, iy + 4, iz)) - 16. * (ACC(ix, iy - 3, iz) + ACC(ix, iy + 3, iz)) + 64. * (ACC(ix, iy - 2, iz) + ACC(ix, iy + 2, iz)) + 16. * (ACC(ix, iy - 1, iz) + ACC(ix, iy + 1, iz)) - 130. * ACC(ix, iy, iz)) * h2_144;
 					   - 16. * (ACC(ix,iy-3,iz)+ACC(ix,iy+3,iz))
 					   + 64. * (ACC(ix,iy-2,iz)+ACC(ix,iy+2,iz))
 					   + 16. * (ACC(ix,iy-1,iz)+ACC(ix,iy+1,iz))
 					   - 130.*  ACC(ix,iy,iz) ) * h2_144;
-				D[2][2] = ((ACC(ix,iy,iz-4)+ACC(ix,iy,iz+4))
+						D[2][2] = ((ACC(ix, iy, iz - 4) + ACC(ix, iy, iz + 4)) - 16. * (ACC(ix, iy, iz - 3) + ACC(ix, iy, iz + 3)) + 64. * (ACC(ix, iy, iz - 2) + ACC(ix, iy, iz + 2)) + 16. * (ACC(ix, iy, iz - 1) + ACC(ix, iy, iz + 1)) - 130. * ACC(ix, iy, iz)) * h2_144;
 					   - 16. * (ACC(ix,iy,iz-3)+ACC(ix,iy,iz+3))
 					   + 64. * (ACC(ix,iy,iz-2)+ACC(ix,iy,iz+2))
 					   + 16. * (ACC(ix,iy,iz-1)+ACC(ix,iy,iz+1))
 					   - 130.*  ACC(ix,iy,iz) ) * h2_144;
 						D[0][1] = D[1][0] = (64. * (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) - 8. * (ACC(ix - 2, iy - 1, iz) - ACC(ix + 2, iy - 1, iz) - ACC(ix - 2, iy + 1, iz) + ACC(ix + 2, iy + 1, iz) + ACC(ix - 1, iy - 2, iz) - ACC(ix - 1, iy + 2, iz) - ACC(ix + 1, iy - 2, iz) + ACC(ix + 1, iy + 2, iz)) + 1. * (ACC(ix - 2, iy - 2, iz) - ACC(ix - 2, iy + 2, iz) - ACC(ix + 2, iy - 2, iz) + ACC(ix + 2, iy + 2, iz))) * h2_144;
 						D[0][2] = D[2][0] = (64. * (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) - 8. * (ACC(ix - 2, iy, iz - 1) - ACC(ix + 2, iy, iz - 1) - ACC(ix - 2, iy, iz + 1) + ACC(ix + 2, iy, iz + 1) + ACC(ix - 1, iy, iz - 2) - ACC(ix - 1, iy, iz + 2) - ACC(ix + 1, iy, iz - 2) + ACC(ix + 1, iy, iz + 2)) + 1. * (ACC(ix - 2, iy, iz - 2) - ACC(ix - 2, iy, iz + 2) - ACC(ix + 2, iy, iz - 2) + ACC(ix + 2, iy, iz + 2))) * h2_144;
 						D[1][2] = D[2][1] = (64. * (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) - 8. * (ACC(ix, iy - 2, iz - 1) - ACC(ix, iy + 2, iz - 1) - ACC(ix, iy - 2, iz + 1) + ACC(ix, iy + 2, iz + 1) + ACC(ix, iy - 1, iz - 2) - ACC(ix, iy - 1, iz + 2) - ACC(ix, iy + 1, iz - 2) + ACC(ix, iy + 1, iz + 2)) + 1. * (ACC(ix, iy - 2, iz - 2) - ACC(ix, iy - 2, iz + 2) - ACC(ix, iy + 2, iz - 2) + ACC(ix, iy + 2, iz + 2))) * h2_144;
-				D[0][1] = D[1][0] = (64.*(ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))
+						(*pvar)(ix, iy, iz) = (D[0][0] * D[1][1] - SQR(D[0][1]) + D[0][0] * D[2][2] - SQR(D[0][2]) + D[1][1] * D[2][2] - SQR(D[1][2]));
 						     -8.*(ACC(ix-2,iy-1,iz)-ACC(ix+2,iy-1,iz)-ACC(ix-2,iy+1,iz)+ACC(ix+2,iy+1,iz)
 							  + ACC(ix-1,iy-2,iz)-ACC(ix-1,iy+2,iz)-ACC(ix+1,iy-2,iz)+ACC(ix+1,iy+2,iz))
 						     +1.*(ACC(ix-2,iy-2,iz)-ACC(ix-2,iy+2,iz)-ACC(ix+2,iy-2,iz)+ACC(ix+2,iy+2,iz)))*h2_144;
 				D[0][2] = D[2][0] = (64.*(ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))
 						     -8.*(ACC(ix-2,iy,iz-1)-ACC(ix+2,iy,iz-1)-ACC(ix-2,iy,iz+1)+ACC(ix+2,iy,iz+1)
 							  + ACC(ix-1,iy,iz-2)-ACC(ix-1,iy,iz+2)-ACC(ix+1,iy,iz-2)+ACC(ix+1,iy,iz+2))
 						     +1.*(ACC(ix-2,iy,iz-2)-ACC(ix-2,iy,iz+2)-ACC(ix+2,iy,iz-2)+ACC(ix+2,iy,iz+2)))*h2_144;
 				D[1][2] = D[2][1] = (64.*(ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))
 						     -8.*(ACC(ix,iy-2,iz-1)-ACC(ix,iy+2,iz-1)-ACC(ix,iy-2,iz+1)+ACC(ix,iy+2,iz+1)
 							  + ACC(ix,iy-1,iz-2)-ACC(ix,iy-1,iz+2)-ACC(ix,iy+1,iz-2)+ACC(ix,iy+1,iz+2))
 						     +1.*(ACC(ix,iy-2,iz-2)-ACC(ix,iy-2,iz+2)-ACC(ix,iy+2,iz-2)+ACC(ix,iy+2,iz+2)))*h2_144;
 				(*pvar)(ix,iy,iz) =  ( D[0][0]*D[1][1] - SQR( D[0][1] )
 						       + D[0][0]*D[2][2] - SQR( D[0][2] )
 						       + D[1][1]*D[2][2] - SQR( D[1][2] ) );
 					}
 		}
 		else
 			throw std::runtime_error("compute_2LPT_source : invalid operator order specified");
 	}
 	//.. subtract global mean so the multi-grid poisson solver behaves well
@ -615,8 +396,6 @@ void compute_2LPT_source( const grid_hierarchy& u, grid_hierarchy& fnew, unsigne
 				for (int iz = 0; iz < nz; ++iz)
 					(*fnew.get_grid(i))(ix, iy, iz) -= sum;
 	}
 }
 #undef SQR
 #undef ACC
--- a/src/densities.cc
+++ b/src/densities.cc
@ -38,28 +38,15 @@ void fft_coarsen(m1 &v, m2 &V)
 	size_t nxf = v.size(0), nyf = v.size(1), nzf = v.size(2), nzfp = nzf + 2;
 	size_t nxF = V.size(0), nyF = V.size(1), nzF = V.size(2), nzFp = nzF + 2;
-	fftw_real *rcoarse = new fftw_real[nxF * nyF * nzFp];
+	real_t *rcoarse = new real_t[nxF * nyF * nzFp];
-	fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse);
+	complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
-	fftw_real *rfine = new fftw_real[nxf * nyf * nzfp];
+	real_t *rfine = new real_t[nxf * nyf * nzfp];
-	fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine);
+	complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
-#ifdef FFTW3
+	fftw_plan_t
-#ifdef SINGLE_PRECISION
+		pf = FFTW_API(plan_dft_r2c_3d)(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
-	fftwf_plan
+		ipc = FFTW_API(plan_dft_c2r_3d)(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
 		pf = fftwf_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
 		ipc = fftwf_plan_dft_c2r_3d(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
 #else
 	fftw_plan
 		pf = fftw_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
 		ipc = fftw_plan_dft_c2r_3d(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
 #endif
 #else
 	rfftwnd_plan
 		pf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
 		ipc = rfftw3d_create_plan(nxF, nyF, nzF, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #endif
 #pragma omp parallel for
 	for (int i = 0; i < (int)nxf; i++)
@ -70,19 +57,7 @@ void fft_coarsen(m1 &v, m2 &V)
 				rfine[q] = v(i, j, k);
 			}
-#ifdef FFTW3
+	FFTW_API(execute)(pf);
 #ifdef SINGLE_PRECISION
 	fftwf_execute(pf);
 #else
 	fftw_execute(pf);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
 #else
 	rfftwnd_one_real_to_complex(pf, rfine, NULL);
 #endif
 #endif
 	double fftnorm = 1.0 / ((double)nxF * (double)nyF * (double)nzF);
@ -125,19 +100,7 @@ void fft_coarsen(m1 &v, m2 &V)
 	delete[] rfine;
-#ifdef FFTW3
+	FFTW_API(execute)(ipc);
 #ifdef SINGLE_PRECISION
 	fftwf_execute(ipc);
 #else
 	fftw_execute(ipc);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipc, ccoarse, NULL);
 #else
 	rfftwnd_one_complex_to_real(ipc, ccoarse, NULL);
 #endif
 #endif
 #pragma omp parallel for
 	for (int i = 0; i < (int)nxF; i++)
@ -150,18 +113,8 @@ void fft_coarsen(m1 &v, m2 &V)
 	delete[] rcoarse;
-#ifdef FFTW3
+	FFTW_API(destroy_plan)(pf);
-#ifdef SINGLE_PRECISION
+	FFTW_API(destroy_plan)(ipc);
 	fftwf_destroy_plan(pf);
 	fftwf_destroy_plan(ipc);
 #else
 	fftw_destroy_plan(pf);
 	fftw_destroy_plan(ipc);
 #endif
 #else
 	rfftwnd_destroy_plan(pf);
 	rfftwnd_destroy_plan(ipc);
 #endif
 }
 template <typename m1, typename m2>
@ -191,14 +144,14 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
 	size_t nxc = nxf / 2, nyc = nyf / 2, nzc = nzf / 2, nzcp = nzf / 2 + 2;
-	fftw_real *rcoarse = new fftw_real[nxc * nyc * nzcp];
+	real_t *rcoarse = new real_t[nxc * nyc * nzcp];
-	fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse);
+	complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
-	fftw_real *rfine = new fftw_real[nxf * nyf * nzfp];
+	real_t *rfine = new real_t[nxf * nyf * nzfp];
-	fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine);
+	complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
 	// copy coarse data to rcoarse[.]
-	memset(rcoarse, 0, sizeof(fftw_real) * nxc * nyc * nzcp);
+	memset(rcoarse, 0, sizeof(real_t) * nxc * nyc * nzcp);
 #pragma omp parallel for
 	for (int i = 0; i < (int)nxc; ++i)
@ -221,36 +174,13 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
 				rfine[q] = v(i, j, k);
 			}
 #ifdef FFTW3
 #ifdef SINGLE_PRECISION
 	fftwf_plan
 		pc = fftwf_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
 		pf = fftwf_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
 		ipf = fftwf_plan_dft_c2r_3d(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
 	fftwf_execute(pc);
 	fftwf_execute(pf);
 #else
 	fftw_plan
 		pc = fftw_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
 		pf = fftw_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
 		ipf = fftw_plan_dft_c2r_3d(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
 	fftw_execute(pc);
 	fftw_execute(pf);
 #endif
 #else
 	rfftwnd_plan
 		pc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
 		pf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
 		ipf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-#ifndef SINGLETHREAD_FFTW
+	fftw_plan_t
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pc, rcoarse, NULL);
+		pc = FFTW_API(plan_dft_r2c_3d)(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
+		pf = FFTW_API(plan_dft_r2c_3d)(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
-#else
+		ipf = FFTW_API(plan_dft_c2r_3d)(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
-	rfftwnd_one_real_to_complex(pc, rcoarse, NULL);
+	FFTW_API(execute)(pc);
-	rfftwnd_one_real_to_complex(pf, rfine, NULL);
+	FFTW_API(execute)(pf);
 #endif
 #endif
 	/*************************************************/
 	//.. perform actual interpolation
@ -300,28 +230,11 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
 	/*************************************************/
-#ifdef FFTW3
+	FFTW_API(execute)(ipf);
-#ifdef SINGLE_PRECISION
+
-	fftwf_execute(ipf);
+	FFTW_API(destroy_plan)(pf);
-	fftwf_destroy_plan(pf);
+	FFTW_API(destroy_plan)(pc);
-	fftwf_destroy_plan(pc);
+	FFTW_API(destroy_plan)(ipf);
 	fftwf_destroy_plan(ipf);
 #else
 	fftw_execute(ipf);
 	fftw_destroy_plan(pf);
 	fftw_destroy_plan(pc);
 	fftw_destroy_plan(ipf);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipf, cfine, NULL);
 #else
 	rfftwnd_one_complex_to_real(ipf, cfine, NULL);
 #endif
 	fftwnd_destroy_plan(pf);
 	fftwnd_destroy_plan(pc);
 	fftwnd_destroy_plan(ipf);
 #endif
 // copy back and normalize
 #pragma omp parallel for
@ -349,8 +262,6 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
 	levelmin = cf.get_value_safe<unsigned>("setup", "levelmin_TF", levelminPoisson);
 	levelmax = cf.get_value<unsigned>("setup", "levelmax");
 	bool kspace = cf.get_value<bool>("setup", "kspace_TF");
 	bool fix  = cf.get_value_safe<bool>("setup","fix_mode_amplitude",false);
 	bool flip = cf.get_value_safe<bool>("setup","flip_mode_amplitude",false);
@ -360,31 +271,11 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
 	music::ulog.Print("Running unigrid density convolution...");
 	//... select the transfer function to be used
-	convolution::kernel_creator *the_kernel_creator;
+	convolution::kernel_creator *the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k"];
 	if (kspace)
 	{
 	std::cout << " - Using k-space transfer function kernel.\n";
 	music::ulog.Print("Using k-space transfer function kernel.");
 #ifdef SINGLE_PRECISION
 		the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_float"];
 #else
 		the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_double"];
 #endif
 	}
 	else
 	{
 		std::cout << " - Using real-space transfer function kernel.\n";
 		music::ulog.Print("Using real-space transfer function kernel.");
 #ifdef SINGLE_PRECISION
 		the_kernel_creator = convolution::get_kernel_map()["tf_kernel_real_float"];
 #else
 		the_kernel_creator = convolution::get_kernel_map()["tf_kernel_real_double"];
 #endif
 	}
 	//... initialize convolution kernel
 	convolution::kernel *the_tf_kernel = the_kernel_creator->create(cf, ptf, refh, type);
@ -402,7 +293,7 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
 	the_tf_kernel->fetch_kernel(levelmin, false);
 	//... perform convolution
-	convolution::perform<real_t>(the_tf_kernel, reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
+	convolution::perform(the_tf_kernel, reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
 	//... clean up kernel
 	delete the_tf_kernel;
@ -451,17 +342,11 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
 	unsigned nbase = 1 << levelmin;
-	convolution::kernel_creator *the_kernel_creator;
+	convolution::kernel_creator *the_kernel_creator  = convolution::get_kernel_map()["tf_kernel_k"];
 	std::cout << " - Using k-space transfer function kernel.\n";
 	music::ulog.Print("Using k-space transfer function kernel.");
 #ifdef SINGLE_PRECISION
 	the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_float"];
 #else
 	the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_double"];
 #endif
 	convolution::kernel *the_tf_kernel = the_kernel_creator->create(cf, ptf, refh, type);
 	/***** PERFORM CONVOLUTIONS *****/
@ -475,7 +360,7 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
 		top = new DensityGrid<real_t>(nbase, nbase, nbase);
 		music::ilog.Print("Performing noise convolution on level %3d", levelmin);
 		rand.load(*top, levelmin);
-		convolution::perform<real_t>(the_tf_kernel->fetch_kernel(levelmin, false), reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
+		convolution::perform(the_tf_kernel->fetch_kernel(levelmin, false), reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
 		delta.create_base_hierarchy(levelmin);
 		top->copy(*delta.get_grid(levelmin));
@ -506,7 +391,7 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
 			// load white noise for patch
 			rand.load(*fine, levelmin + i);
-			convolution::perform<real_t>(the_tf_kernel->fetch_kernel(levelmin + i, true),
+			convolution::perform(the_tf_kernel->fetch_kernel(levelmin + i, true),
 										 reinterpret_cast<void *>(fine->get_data_ptr()), shift, fix, flip);
 			if( fourier_splicing ){
--- a/src/fd_schemes.hh
+++ b/src/fd_schemes.hh
@ -11,12 +11,13 @@
 #ifndef __FD_SCHEMES_HH
 #define __FD_SCHEMES_HH
 #include <general.hh>
 #include <vector>
 #include <stdexcept>
 //! abstract implementation of the Poisson/Force scheme
-template< class L, class G, typename real_t=double >
+template< class L, class G>
 class scheme
 {
 public:
@ -57,10 +58,9 @@ public:
 };
 //! base class for finite difference gradients
-template< int nextent, typename T >
+template< int nextent>
 class gradient
 {
 	typedef T real_t;
 	std::vector<real_t> m_stencil;
 	const unsigned nl;
 public:
@ -110,20 +110,21 @@ public:
 };
 //! base class for finite difference stencils
-template< int nextent, typename real_t >
+template< int nextent>
 class base_stencil
 {
 protected:
-	std::vector<real_t> m_stencil;
+	static constexpr size_t nl{2*nextent+1};
-	const unsigned nl;
+	std::array<real_t,nl*nl*nl> m_stencil;
 public:
 	bool m_modsource;
 public:
-	base_stencil( bool amodsource = false )
+	explicit base_stencil( bool amodsource = false )
-	: nl( 2*nextent+1 ), m_modsource( amodsource )
+	: m_modsource( amodsource )
 	{
-		m_stencil.assign(nl*nl*nl,(real_t)0.0);
+		m_stencil.fill( (real_t)0.0 );
 	}
 	real_t& operator()(int i, int j, int k)
@ -176,8 +177,7 @@ public:
 //... Implementation of the Gradient schemes............................................
-template< typename real_t >
+class deriv_2P : public gradient<1>
 class deriv_2P : public gradient<1,real_t>
 {
 public:
@ -194,8 +194,7 @@ public:
 //... Implementation of the Laplacian schemes..........................................
 //! 7-point, 2nd order finite difference Laplacian
-template< typename real_t >
+class stencil_7P : public base_stencil<1>
 class stencil_7P : public base_stencil<1,real_t>
 {
 public:
@ -214,7 +213,7 @@ public:
 	inline real_t apply( const C& c, const int i, const int j, const int k ) const
 	{
 		//return c(i-1,j,k)+c(i+1,j,k)+c(i,j-1,k)+c(i,j+1,k)+c(i,j,k-1)+c(i,j,k+1)-6.0*c(i,j,k);
-		return (double)c(i-1,j,k)+(double)c(i+1,j,k)+(double)c(i,j-1,k)+(double)c(i,j+1,k)+(double)c(i,j,k-1)+(double)c(i,j,k+1)-6.0*(double)c(i,j,k);
+		return (real_t)c(i-1,j,k)+(real_t)c(i+1,j,k)+(real_t)c(i,j-1,k)+(real_t)c(i,j+1,k)+(real_t)c(i,j,k-1)+(real_t)c(i,j,k+1)-6.0*(real_t)c(i,j,k);
 	}
 	template< class C >
@ -230,8 +229,7 @@ public:
 };
 //! 13-point, 4th order finite difference Laplacian
-template< typename real_t >
+class stencil_13P : public base_stencil<2>
 class stencil_13P : public base_stencil<2,real_t>
 {
 public:
@ -279,8 +277,7 @@ public:
 //! 19-point, 6th order finite difference Laplacian
-template< typename real_t >
+class stencil_19P : public base_stencil<3>
 class stencil_19P : public base_stencil<3,real_t>
 {
 public:
@ -339,7 +336,6 @@ public:
 //! flux operator for the 4th order FD Laplacian
 template< typename real_t >
 class Laplace_flux_O4
 {
 public:
@ -354,7 +350,7 @@ public:
 	template< class C >
 	inline double apply_x( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/12.0;
+		double fac = -((real_t)idir)/12.0;
 		return fac*(-c(i-2,j,k)+15.0*c(i-1,j,k)-15.0*c(i,j,k)+c(i+1,j,k));
 	}
@ -369,7 +365,7 @@ public:
 	template< class C >
 	inline double apply_y( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/12.0;
+		double fac = -((real_t)idir)/12.0;
 		return fac*(-c(i,j-2,k)+15.0*c(i,j-1,k)-15.0*c(i,j,k)+c(i,j+1,k));
 	}
@ -384,7 +380,7 @@ public:
 	template< class C >
 	inline double apply_z( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/12.0;
+		double fac = -((real_t)idir)/12.0;
 		return fac*(-c(i,j,k-2)+15.0*c(i,j,k-1)-15.0*c(i,j,k)+c(i,j,k+1));
 	}
@ -392,7 +388,6 @@ public:
 //! flux operator for the 6th order FD Laplacian
 template< typename real_t >
 class Laplace_flux_O6
 {
 public:
@ -408,7 +403,7 @@ public:
 	template< class C >
 	inline double apply_x( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/180.0;
+		real_t fac = -((real_t)idir)/180.0;
 		return fac*(2.*c(i-3,j,k)-25.*c(i-2,j,k)+245.*c(i-1,j,k)-245.0*c(i,j,k)+25.*c(i+1,j,k)-2.*c(i+2,j,k));
 	}
@ -423,7 +418,7 @@ public:
 	template< class C >
 	inline double apply_y( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/180.0;
+		real_t fac = -((real_t)idir)/180.0;
 		return fac*(2.*c(i,j-3,k)-25.*c(i,j-2,k)+245.*c(i,j-1,k)-245.0*c(i,j,k)+25.*c(i,j+1,k)-2.*c(i,j+2,k));
 	}
@ -438,7 +433,7 @@ public:
 	template< class C >
 	inline double apply_z( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/180.0;
+		real_t fac = -((real_t)idir)/180.0;
 		return fac*(2.*c(i,j,k-3)-25.*c(i,j,k-2)+245.*c(i,j,k-1)-245.0*c(i,j,k)+25.*c(i,j,k+1)-2.*c(i,j,k+2));
 	}
--- a/src/general.hh
+++ b/src/general.hh
@ -8,75 +8,56 @@
 */
-#ifndef __GENERAL_HH
+#pragma once
 #define __GENERAL_HH
-#include "logger.hh"
+#include <logger.hh>
 #include <config_file.hh>
 #include <cassert>
-#include "omp.h"
+#include <omp.h>
 #ifdef WITH_MPI
  #ifdef MANNO
    #include <mpi.h>
  #else
    #include <mpi++.h>
  #endif
 #else
 #include <time.h>
 #endif
 #ifdef FFTW3
 #include <fftw3.h>
-	#if defined(SINGLE_PRECISION)
+
-	typedef float fftw_real;
+// include CMake controlled configuration settings
-	#else
+#include "cmake_config.hh"
-	typedef double fftw_real;
+
 #if defined(USE_PRECISION_FLOAT)
  using real_t = float;
  using complex_t = fftwf_complex;
  #define FFTW_PREFIX fftwf
 #elif defined(USE_PRECISION_DOUBLE)
  using real_t = double;
  using complex_t = fftw_complex;
  #define FFTW_PREFIX fftw
 #elif defined(USE_PRECISION_LONGDOUBLE)
  using real_t = long double;
  using complex_t = fftwl_complex;
  #define FFTW_PREFIX fftwl
 #endif
-#else
+#define FFTW_GEN_NAME_PRIM(a, b) a##_##b
-	#if defined(SINGLE_PRECISION) and not defined(SINGLETHREAD_FFTW)
+#define FFTW_GEN_NAME(a, b) FFTW_GEN_NAME_PRIM(a, b)
-	#include <srfftw.h>
+#define FFTW_API(x) FFTW_GEN_NAME(FFTW_PREFIX, x)
 	#include <srfftw_threads.h>
 	#elif defined(SINGLE_PRECISION) and defined(SINGLETHREAD_FFTW)
 	#include <srfftw.h>
 	#elif not defined(SINGLE_PRECISION) and not defined(SINGLETHREAD_FFTW)
 	#include <drfftw.h>
 	#include <drfftw_threads.h>
 	#elif not defined(SINGLE_PRECISION) and defined(SINGLETHREAD_FFTW)
 	#include <drfftw.h>
 	#endif
 #endif
-#ifdef SINGLE_PRECISION
+using fftw_plan_t = FFTW_GEN_NAME(FFTW_PREFIX, plan);
 	typedef float real_t;
 #else
 	typedef double real_t;
 #endif
 #define RE(x) ((x)[0])
 #define IM(x) ((x)[1])
 #include <vector>
 #include <array>
 using vec3_t = std::array<real_t,3>;
-#ifdef FFTW3
+namespace CONFIG
-	#define RE(x) ((x)[0])
+{
-	#define IM(x) ((x)[1])
+// extern int MPI_thread_support;
-#else
+// extern int MPI_task_rank;
-	#define RE(x) ((x).re)
+// extern int MPI_task_size;
-	#define IM(x) ((x).im)
+// extern bool MPI_ok;
-#endif
+// extern bool MPI_threads_ok;
-
+extern bool FFTW_threads_ok;
-#if defined(FFTW3) && defined(SINGLE_PRECISION)
+extern int num_threads;
-#define fftw_complex fftwf_complex
+} // namespace CONFIG
 #endif
 #include <vector>
 #include "config_file.hh"
 //#include "mesh.hh"
 //! compute square of argument
 template< typename T >
@ -180,6 +161,3 @@ inline bool is_number(const std::string& s)
 	return true;
 }
 #endif
--- a/src/main.cc
+++ b/src/main.cc
@ -13,6 +13,8 @@
 #include <iomanip>
 #include <math.h>
 #include <thread>
 #include <gsl/gsl_rng.h>
 #include <gsl/gsl_randist.h>
 #include <gsl/gsl_integration.h>
@ -26,25 +28,40 @@ extern "C"
 }
 #endif
-#include "general.hh"
+#include <exception>
-#include "defaults.hh"
+#include <cfenv>
 #include "output.hh"
-#include "config_file.hh"
+#include <general.hh>
 #include <defaults.hh>
 #include <output.hh>
-#include "poisson.hh"
+#include <config_file.hh>
 #include "mg_solver.hh"
 #include "fd_schemes.hh"
 #include "random.hh"
 #include "densities.hh"
-#include "convolution_kernel.hh"
+#include <poisson.hh>
-#include "cosmology.hh"
+#include <mg_solver.hh>
-#include "transfer_function.hh"
+#include <fd_schemes.hh>
 #include <random.hh>
 #include <densities.hh>
 #include <convolution_kernel.hh>
 #include <cosmology.hh>
 #include <transfer_function.hh>
 #define THE_CODE_NAME "music!"
 #define THE_CODE_VERSION "2.0a"
 // initialise with "default" values
 namespace CONFIG{
 // int  MPI_thread_support = -1;
 // int  MPI_task_rank = 0;
 // int  MPI_task_size = 1;
 // bool MPI_ok = false;
 // bool MPI_threads_ok = false;
 bool FFTW_threads_ok = false;
 int  num_threads = 1;
 }
 namespace music
 {
@ -87,11 +104,6 @@ void splash(void)
 #if defined(CMAKE_BUILD)
 	music::ilog.Print("Version built from git rev.: %s, tag: %s, branch: %s", GIT_REV, GIT_TAG, GIT_BRANCH);
 #endif
 #if defined(SINGLE_PRECISION)
 	music::ilog.Print("Version was compiled for single precision.");
 #else
 	music::ilog.Print("Version was compiled for double precision.");
 #endif
 	std::cout << "\n\n";
 }
@ -294,6 +306,50 @@ void add_constant_value( grid_hierarchy &u, const double val )
 	}
 }
 #include <system_stat.hh>
 void output_system_info()
 {
 	std::feclearexcept(FE_ALL_EXCEPT);
 	//------------------------------------------------------------------------------
 	// Write code configuration to screen
 	//------------------------------------------------------------------------------
 	// hardware related infos
 	music::ilog << std::setw(32) << std::left << "CPU vendor string" << " : " << SystemStat::Cpu().get_CPUstring() << std::endl;
 	// multi-threading related infos
 	music::ilog << std::setw(32) << std::left << "Available HW threads / task" << " : " << std::thread::hardware_concurrency() << " (" << CONFIG::num_threads << " used)" << std::endl;
 	// memory related infos
 	SystemStat::Memory mem;
 	unsigned availpmem = mem.get_AvailMem()/1024/1024;
 	unsigned usedpmem = mem.get_UsedMem()/1024/1024;
 	unsigned maxpmem = availpmem, minpmem = availpmem;
 	unsigned maxupmem = usedpmem, minupmem = usedpmem;
 	music::ilog << std::setw(32) << std::left << "Total system memory (phys)" << " : " << mem.get_TotalMem()/1024/1024 << " Mb" << std::endl;
 	music::ilog << std::setw(32) << std::left << "Used system memory (phys)" << " : " << "Max: " << maxupmem << " Mb, Min: " << minupmem << " Mb" << std::endl;
 	music::ilog << std::setw(32) << std::left << "Available system memory (phys)" << " : " <<  "Max: " << maxpmem << " Mb, Min: " << minpmem << " Mb" << std::endl;
 	// Kernel related infos
 	SystemStat::Kernel kern;
 	auto kinfo = kern.get_kernel_info();
 	music::ilog << std::setw(32) << std::left << "OS/Kernel version" << " : " << kinfo.kernel << " version " << kinfo.major << "." << kinfo.minor << " build " << kinfo.build_number << std::endl;
 	// FFTW related infos
 	music::ilog << std::setw(32) << std::left << "FFTW version" << " : " << FFTW_API(version) << std::endl;
 	music::ilog << std::setw(32) << std::left << "FFTW supports multi-threading" << " : " << (CONFIG::FFTW_threads_ok? "yes" : "no") << std::endl;
 	music::ilog << std::setw(32) << std::left << "FFTW mode" << " : ";
 #if defined(FFTW_MODE_PATIENT)
 	music::ilog << "FFTW_PATIENT" << std::endl;
 #elif defined(FFTW_MODE_MEASURE)
    music::ilog << "FFTW_MEASURE" << std::endl;
 #else
 	music::ilog << "FFTW_ESTIMATE" << std::endl;
 #endif
 }
 /*****************************************************************************************************/
 /*****************************************************************************************************/
 /*****************************************************************************************************/
@ -342,25 +398,6 @@ int main(int argc, const char *argv[])
 	music::ulog.Print("Running %s, version %s", THE_CODE_NAME, THE_CODE_VERSION);
 	music::ulog.Print("Log is for run started %s", asctime(localtime(&ltime)));
 #ifdef FFTW3
 	music::ulog.Print("Code was compiled using FFTW version 3.x");
 #else
 	music::ulog.Print("Code was compiled using FFTW version 2.x");
 #endif
 #ifdef SINGLETHREAD_FFTW
 	music::ulog.Print("Code was compiled for single-threaded FFTW");
 #else
 	music::ulog.Print("Code was compiled for multi-threaded FFTW");
 	music::ulog.Print("Running with a maximum of %d OpenMP threads", omp_get_max_threads());
 #endif
 #ifdef SINGLE_PRECISION
 	music::ulog.Print("Code was compiled for single precision.");
 #else
 	music::ulog.Print("Code was compiled for double precision.");
 #endif
 	//------------------------------------------------------------------------------
 	//... read and interpret config file
 	//------------------------------------------------------------------------------
@ -369,6 +406,13 @@ int main(int argc, const char *argv[])
 	bool force_shift(false);
 	double boxlength;
 	//------------------------------------------------------------------------------
 	//... init multi-threading
 	//------------------------------------------------------------------------------
 	CONFIG::FFTW_threads_ok = FFTW_API(init_threads)();
 	CONFIG::num_threads = cf.get_value_safe<unsigned>("execution", "NumThreads",std::thread::hardware_concurrency());
 	//------------------------------------------------------------------------------
 	//... initialize some parameters about grid set-up
 	//------------------------------------------------------------------------------
@ -403,24 +447,6 @@ int main(int argc, const char *argv[])
 	else
 		music::ilog.Print("Using real space sampled transfer functions...");
 		//------------------------------------------------------------------------------
 		//... initialize multithread FFTW
 		//------------------------------------------------------------------------------
 #if not defined(SINGLETHREAD_FFTW)
 #ifdef FFTW3
 #ifdef SINGLE_PRECISION
 	fftwf_init_threads();
 	fftwf_plan_with_nthreads(omp_get_max_threads());
 #else
 	fftw_init_threads();
 	fftw_plan_with_nthreads(omp_get_max_threads());
 #endif
 #else
 	fftw_threads_init();
 #endif
 #endif
 	//------------------------------------------------------------------------------
 	//... initialize cosmology
 	//------------------------------------------------------------------------------
@ -1373,13 +1399,8 @@ int main(int argc, const char *argv[])
 	delete the_transfer_function_plugin;
 	delete the_poisson_solver;
-#if defined(FFTW3) and not defined(SINGLETHREAD_FFTW)
+	if( CONFIG::FFTW_threads_ok )
-#ifdef SINGLE_PRECISION
+		FFTW_API(cleanup_threads)();
 	fftwf_cleanup_threads();
 #else
 	fftw_cleanup_threads();
 #endif
 #endif
 	//------------------------------------------------------------------------------
 	//... we are done !
--- a/src/mg_interp.hh
+++ b/src/mg_interp.hh
@ -290,12 +290,12 @@ struct cubic_interp
 							{
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz+1);
-								coarse_flux = Laplace_flux_O4<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
 								fine_flux /= 4.0;
 								dflux = coarse_flux - fine_flux;
@ -312,12 +312,12 @@ struct cubic_interp
 							{
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz+1);
-								coarse_flux = Laplace_flux_O4<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
 								fine_flux /= 4.0;
 								dflux = coarse_flux - fine_flux;
@ -338,12 +338,12 @@ struct cubic_interp
 							{
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz+1);
-								coarse_flux = Laplace_flux_O4<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
 								fine_flux /= 4.0;
 								dflux = coarse_flux - fine_flux;
@ -359,12 +359,12 @@ struct cubic_interp
 							{
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz+1);
-								coarse_flux = Laplace_flux_O4<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
 								fine_flux /= 4.0;
 								dflux = coarse_flux - fine_flux;
@ -384,12 +384,12 @@ struct cubic_interp
 							{
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy+1,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy+1,iz+1);
-								coarse_flux = Laplace_flux_O4<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
 								fine_flux /= 4.0;
 								dflux = coarse_flux - fine_flux;
@ -405,12 +405,12 @@ struct cubic_interp
 							{
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy+1,iz);
-								coarse_flux = Laplace_flux_O4<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
 								fine_flux /= 4.0;
 								dflux = coarse_flux - fine_flux;
@ -717,13 +717,13 @@ struct interp_O5_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz+1);
 							fine_flux /= 4.0;
-							coarse_flux = Laplace_flux_O4<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
 							dflux = coarse_flux - fine_flux;
@ -758,12 +758,12 @@ struct interp_O5_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz+1);
-							coarse_flux = Laplace_flux_O4<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -798,12 +798,12 @@ struct interp_O5_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz+1);
-							coarse_flux = Laplace_flux_O4<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -838,12 +838,12 @@ struct interp_O5_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz+1);
-							coarse_flux = Laplace_flux_O4<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -880,12 +880,12 @@ struct interp_O5_fluxcorr
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy+1,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy+1,iz+1);
-							coarse_flux = Laplace_flux_O4<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -920,12 +920,12 @@ struct interp_O5_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy+1,iz);
-							coarse_flux = Laplace_flux_O4<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -1027,13 +1027,13 @@ struct interp_O7_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy+1,iz+1);
 							fine_flux /= 4.0;
-							coarse_flux = Laplace_flux_O6<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
 							dflux = coarse_flux - fine_flux;
@ -1074,12 +1074,12 @@ struct interp_O7_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy+1,iz+1);
-							coarse_flux = Laplace_flux_O6<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -1119,12 +1119,12 @@ struct interp_O7_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix+1,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix,iy+1,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix+1,iy+1,iz+1);
-							coarse_flux = Laplace_flux_O6<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -1164,12 +1164,12 @@ struct interp_O7_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix+1,iy,iz+1);
-							coarse_flux = Laplace_flux_O6<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -1210,12 +1210,12 @@ struct interp_O7_fluxcorr
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix+1,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix,iy+1,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix+1,iy+1,iz+1);
-							coarse_flux = Laplace_flux_O6<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
@ -1255,12 +1255,12 @@ struct interp_O7_fluxcorr
 								}
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix+1,iy+1,iz);
-							coarse_flux = Laplace_flux_O6<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							dflux = coarse_flux - fine_flux;
--- a/src/mg_solver.hh
+++ b/src/mg_solver.hh
@ -8,30 +8,36 @@
 */
-#ifndef __MG_SOLVER_HH
+#pragma once
 #define __MG_SOLVER_HH
 #include <cmath>
 #include <iostream>
-#include "mg_operators.hh"
+#include <mg_operators.hh>
-#include "mg_interp.hh"
+#include <mg_interp.hh>
-#include "mesh.hh"
+#include <mesh.hh>
-#define BEGIN_MULTIGRID_NAMESPACE namespace multigrid {
+#define BEGIN_MULTIGRID_NAMESPACE \
 	namespace multigrid             \
 	{
 #define END_MULTIGRID_NAMESPACE }
 BEGIN_MULTIGRID_NAMESPACE
 //! options for multigrid smoothing operation
-namespace opt {
+namespace opt
-	enum smtype { sm_jacobi, sm_gauss_seidel, sm_sor };
+{
 	enum smtype
 	{
 		sm_jacobi,
 		sm_gauss_seidel,
 		sm_sor
 	};
 }
 //! actual implementation of FAS adaptive multigrid solver
-template< class S, class I, class O, typename T=double >
+template <class S, class I, class O>
 class solver
 {
 public:
@ -52,31 +58,31 @@ protected:
 	std::vector<double> m_residu_ini; //!< vector of initial residuals for each level
 	bool m_is_ini;										//!< bool that is true for first iteration
-	GridHierarchy<T>	*m_pu,					//!< pointer to GridHierarchy for solution u
+	GridHierarchy<real_t>
 			*m_pu,		 //!< pointer to GridHierarchy for solution u
 			*m_pf,		 //!< pointer to GridHierarchy for right-hand-side
 			*m_pfsave; //!< pointer to saved state of right-hand-side (unused)
-	const MeshvarBnd<T> *m_pubnd;
+	const MeshvarBnd<real_t> *m_pubnd;
 	//! compute residual for a level
-  double compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<T>& unew, int ilevel );
+	double compute_error(const MeshvarBnd<real_t> &u, const MeshvarBnd<real_t> &unew, int ilevel);
 	//! compute residuals for entire grid hierarchy
-	double compute_error( const GridHierarchy<T>& uh, const GridHierarchy<T>& uhnew, bool verbose );
+	double compute_error(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &uhnew, bool verbose);
 	//! compute residuals for entire grid hierarchy
-	double compute_RMS_resid( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose );
+	double compute_RMS_resid(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose);
 protected:
 	//! Jacobi smoothing
-	void Jacobi( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
+	void Jacobi(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
 	//! Gauss-Seidel smoothing
-	void GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
+	void GaussSeidel(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
 	//! Successive-Overrelaxation smoothing
-	void SOR( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
+	void SOR(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
 	//! main two-grid (V-cycle) for multi-grid iterations
 	void twoGrid(unsigned ilevel);
@ -85,44 +91,39 @@ protected:
 	void setBC(unsigned ilevel);
 	//! make top grid periodic boundary conditions
-	void make_periodic( MeshvarBnd<T> *u );
+	void make_periodic(MeshvarBnd<real_t> *u);
-	//void interp_coarse_fine_cubic( unsigned ilevel, MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine );
+	// void interp_coarse_fine_cubic( unsigned ilevel, MeshvarBnd<real_t>& coarse, MeshvarBnd<real_t>& fine );
 public:
 	//! constructor
-	solver( GridHierarchy<T>& f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth );
+	solver(GridHierarchy<real_t> &f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth);
 	//! destructor
 	~solver()
-	{  }
+	{
 	}
 	//! solve Poisson's equation
-	double solve( GridHierarchy<T>& u, double accuracy, double h=-1.0, bool verbose=false );
+	double solve(GridHierarchy<real_t> &u, double accuracy, double h = -1.0, bool verbose = false);
 	//! solve Poisson's equation
-	double solve( GridHierarchy<T>& u, double accuracy, bool verbose=false )
+	double solve(GridHierarchy<real_t> &u, double accuracy, bool verbose = false)
 	{
 		return this->solve(u, accuracy, -1.0, verbose);
 	}
 };
-
+template <class S, class I, class O>
-template< class S, class I, class O, typename T >
+solver<S, I, O>::solver(GridHierarchy<real_t> &f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth)
 solver<S,I,O,T>::solver( GridHierarchy<T>& f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth )
 		: m_scheme(), m_gridop(), m_npresmooth(npresmooth), m_npostsmooth(npostsmooth),
 			m_smoother(smoother), m_ilevelmin(f.levelmin()), m_is_ini(true), m_pf(&f)
 {
 	m_is_ini = true;
 }
-
+template <class S, class I, class O>
-template< class S, class I, class O, typename T >
+void solver<S, I, O>::Jacobi(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 void solver<S,I,O,T>::Jacobi( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
 {
 	int
 			nx = u->size(0),
@ -133,7 +134,7 @@ void solver<S,I,O,T>::Jacobi( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
 			c0 = -1.0 / m_scheme.ccoeff(),
 			h2 = h * h;
-	MeshvarBnd<T> uold(*u);
+	MeshvarBnd<real_t> uold(*u);
 	double alpha = 0.95, ialpha = 1.0 - alpha;
@ -142,11 +143,10 @@ void solver<S,I,O,T>::Jacobi( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
 		for (int iy = 0; iy < ny; ++iy)
 			for (int iz = 0; iz < nz; ++iz)
 				(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(uold, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 }
-template< class S, class I, class O, typename T >
+template <class S, class I, class O>
-void solver<S,I,O,T>::SOR( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
+void solver<S, I, O>::SOR(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 {
 	int
 			nx = u->size(0),
@ -157,7 +157,7 @@ void solver<S,I,O,T>::SOR( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
 			c0 = -1.0 / m_scheme.ccoeff(),
 			h2 = h * h;
-	MeshvarBnd<T> uold(*u);
+	MeshvarBnd<real_t> uold(*u);
 	double
 			alpha = 1.2,
@ -171,27 +171,23 @@ void solver<S,I,O,T>::SOR( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
 				if ((ix + iy + iz) % 2 == 0)
 					(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(uold, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 #pragma omp parallel for
 	for (int ix = 0; ix < nx; ++ix)
 		for (int iy = 0; iy < ny; ++iy)
 			for (int iz = 0; iz < nz; ++iz)
 				if ((ix + iy + iz) % 2 != 0)
 					(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(*u, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 }
-template< class S, class I, class O, typename T >
+template <class S, class I, class O>
-void solver<S,I,O,T>::GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f )
+void solver<S, I, O>::GaussSeidel(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 {
 	int
 			nx = u->size(0),
 			ny = u->size(1),
 			nz = u->size(2);
-	T
+	real_t
 			c0 = -1.0 / m_scheme.ccoeff(),
 			h2 = h * h;
@ -202,15 +198,12 @@ void solver<S,I,O,T>::GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f
 				for (int iz = 0; iz < nz; ++iz)
 					if ((ix + iy + iz) % 2 == color)
 						(*u)(ix, iy, iz) = (m_scheme.rhs(*u, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 }
-
+template <class S, class I, class O>
-template< class S, class I, class O, typename T >
+void solver<S, I, O>::twoGrid(unsigned ilevel)
 void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 {
-	MeshvarBnd<T> *uf, *uc, *ff, *fc;
+	MeshvarBnd<real_t> *uf, *uc, *ff, *fc;
 	double
 			h = 1.0 / (1 << ilevel),
@ -223,7 +216,6 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 	uc = m_pu->get_grid(ilevel - 1);
 	fc = m_pf->get_grid(ilevel - 1);
 	int
 			nx = uf->size(0),
 			ny = uf->size(1),
@ -235,7 +227,8 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 		setBC(ilevel);
 	//... do smoothing sweeps with specified solver
-	for( unsigned i=0; i<m_npresmooth; ++i ){
+	for (unsigned i = 0; i < m_npresmooth; ++i)
 	{
 		if (ilevel > m_ilevelmin)
 			interp().interp_coarse_fine(ilevel, *uc, *uf);
@ -253,7 +246,6 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 			make_periodic(uf);
 	}
 	m_gridop.restrict(*uf, *uc);
 	//... essential!!
@ -262,7 +254,6 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 	else if (ilevel > m_ilevelmin)
 		interp().interp_coarse_fine(ilevel, *uc, *uf);
 	//....................................................................
 	//... we now use hard-coded restriction+operatore app, see below
 	/*meshvar_bnd Lu(*uf,false);
@ -294,18 +285,8 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 		int iix = 2 * ix;
 		for (int iy = 0, iiy = 0; iy < ny / 2; ++iy, iiy += 2)
 			for (int iz = 0, iiz = 0; iz < nz / 2; ++iz, iiz += 2)
-				tLu(ix+oxp,iy+oyp,iz+ozp) = 0.125 * (
+				tLu(ix + oxp, iy + oyp, iz + ozp) = 0.125 * (m_scheme.apply((*uf), iix, iiy, iiz) + m_scheme.apply((*uf), iix, iiy, iiz + 1) + m_scheme.apply((*uf), iix, iiy + 1, iiz) + m_scheme.apply((*uf), iix, iiy + 1, iiz + 1) + m_scheme.apply((*uf), iix + 1, iiy, iiz) + m_scheme.apply((*uf), iix + 1, iiy, iiz + 1) + m_scheme.apply((*uf), iix + 1, iiy + 1, iiz) + m_scheme.apply((*uf), iix + 1, iiy + 1, iiz + 1)) / h2;
 							 m_scheme.apply( (*uf), iix, iiy, iiz )
 							+m_scheme.apply( (*uf), iix, iiy, iiz+1 )
 							+m_scheme.apply( (*uf), iix, iiy+1, iiz )
 							+m_scheme.apply( (*uf), iix, iiy+1, iiz+1 )
 							+m_scheme.apply( (*uf), iix+1, iiy, iiz )
 							+m_scheme.apply( (*uf), iix+1, iiy, iiz+1 )
 							+m_scheme.apply( (*uf), iix+1, iiy+1, iiz )
 							+m_scheme.apply( (*uf), iix+1, iiy+1, iiz+1 )
 						)/h2;
 	}
 	//... restrict source term
@ -337,7 +318,6 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 	meshvar_bnd cc(*uc, false);
 //... compute correction on coarse grid
 #pragma omp parallel for
 	for (int ix = 0; ix < (int)cc.size(0); ++ix)
@ -359,7 +339,8 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 		setBC(ilevel);
 	//... do smoothing sweeps with specified solver
-	for( unsigned i=0; i<m_npostsmooth; ++i ){
+	for (unsigned i = 0; i < m_npostsmooth; ++i)
 	{
 		if (ilevel > m_ilevelmin)
 			interp().interp_coarse_fine(ilevel, *uc, *uf);
@ -375,13 +356,11 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 		if (m_bperiodic && ilevel <= m_ilevelmin)
 			make_periodic(uf);
-
+	}
 }
-}
+template <class S, class I, class O>
-
+double solver<S, I, O>::compute_error(const MeshvarBnd<real_t> &u, const MeshvarBnd<real_t> &f, int ilevel)
 template< class S, class I, class O, typename T >
 double solver<S,I,O,T>::compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<T>& f, int ilevel )
 {
 	int
 			nx = u.size(0),
@ -393,7 +372,8 @@ double solver<S,I,O,T>::compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<
 	double h = 1.0 / (1ul << ilevel), h2 = h * h;
-	#pragma omp parallel for reduction(+:err,count)
+#pragma omp parallel for reduction(+ \
 																	 : err, count)
 	for (int ix = 0; ix < nx; ++ix)
 		for (int iy = 0; iy < ny; ++iy)
 			for (int iz = 0; iz < nz; ++iz)
@ -413,8 +393,8 @@ double solver<S,I,O,T>::compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<
 	return err;
 }
-template< class S, class I, class O, typename T >
+template <class S, class I, class O>
-double solver<S,I,O,T>::compute_error( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose )
+double solver<S, I, O>::compute_error(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose)
 {
 	double maxerr = 0.0;
@ -430,7 +410,8 @@ double solver<S,I,O,T>::compute_error( const GridHierarchy<T>& uh, const GridHie
 		double h = 1.0 / (1ul << ilevel), h2 = h * h;
-                #pragma omp parallel for reduction(+:err,count)
+#pragma omp parallel for reduction(+ \
 																	 : err, count)
 		for (int ix = 0; ix < nx; ++ix)
 			for (int iy = 0; iy < ny; ++iy)
 				for (int iz = 0; iz < nz; ++iz)
@ -457,13 +438,12 @@ double solver<S,I,O,T>::compute_error( const GridHierarchy<T>& uh, const GridHie
 		music::dlog.Print("[mg]      level %3d,  residual %g,  rel. error %g", ilevel, mean_res, err);
 		maxerr = std::max(maxerr, err);
 	}
 	return maxerr;
 }
-template< class S, class I, class O, typename T >
+template <class S, class I, class O>
-double solver<S,I,O,T>::compute_RMS_resid( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose )
+double solver<S, I, O>::compute_RMS_resid(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose)
 {
 	if (m_is_ini)
 		m_residu_ini.assign(uh.levelmax() + 1, 0.0);
@ -481,7 +461,8 @@ double solver<S,I,O,T>::compute_RMS_resid( const GridHierarchy<T>& uh, const Gri
 		double sum = 0.0, sumd2 = 0.0;
 		size_t count = 0;
-		#pragma omp parallel for reduction(+:sum,sumd2,count)
+#pragma omp parallel for reduction(+ \
 																	 : sum, sumd2, count)
 		for (int ix = 0; ix < nx; ++ix)
 			for (int iy = 0; iy < ny; ++iy)
 				for (int iz = 0; iz < nz; ++iz)
@ -508,7 +489,6 @@ double solver<S,I,O,T>::compute_RMS_resid( const GridHierarchy<T>& uh, const Gri
 		if (err_rel > maxerr)
 			maxerr = err_rel;
 	}
 	if (m_is_ini)
@ -517,9 +497,8 @@ double solver<S,I,O,T>::compute_RMS_resid( const GridHierarchy<T>& uh, const Gri
 	return maxerr;
 }
-
+template <class S, class I, class O>
-template< class S, class I, class O, typename T >
+double solver<S, I, O>::solve(GridHierarchy<real_t> &uh, double acc, double h, bool verbose)
 double solver<S,I,O,T>::solve( GridHierarchy<T>& uh, double acc, double h, bool verbose )
 {
 	double err, maxerr = 1e30;
@ -542,7 +521,8 @@ double solver<S,I,O,T>::solve( GridHierarchy<T>& uh, double acc, double h, bool
 		err = compute_error(*m_pu, *m_pf, fullverbose);
 		++niter;
-		if( fullverbose ){
+		if (fullverbose)
 		{
 			music::ulog.Print("  multigrid iteration %3d, maximum RMS residual = %g", niter, err);
 			std::cout << "   - Step No. " << std::setw(3) << niter << ", Max Err = " << err << std::endl;
 			std::cout << "     ---------------------------------------------------\n";
@ -566,25 +546,21 @@ double solver<S,I,O,T>::solve( GridHierarchy<T>& uh, double acc, double h, bool
 		music::ulog.Print("Poisson solver converged to max. error of %g in %d steps.", err, niter);
 	}
 	//.. make sure that the RHS does not contain the FAS corrections any more
 	for (int i = m_pf->levelmax(); i > 0; --i)
 		m_gridop.restrict(*m_pf->get_grid(i), *m_pf->get_grid(i - 1));
 	return err;
 }
 // TODO: this only works for 2nd order! (but actually not needed)
-template< class S, class I, class O, typename T >
+template <class S, class I, class O>
-void solver<S,I,O,T>::setBC( unsigned ilevel )
+void solver<S, I, O>::setBC(unsigned ilevel)
 {
 	//... set only on level before additional refinement starts
 	if (ilevel == m_ilevelmin)
 	{
-		MeshvarBnd<T> *u = m_pu->get_grid(ilevel);
+		MeshvarBnd<real_t> *u = m_pu->get_grid(ilevel);
 		int
 				nx = u->size(0),
 				ny = u->size(1),
@ -594,7 +570,8 @@ void solver<S,I,O,T>::setBC( unsigned ilevel )
 			for (int iz = 0; iz < nz; ++iz)
 			{
 				(*u)(-1, iy, iz) = 2.0 * (*m_pubnd)(-1, iy, iz) - (*u)(0, iy, iz);
-				(*u)(nx,iy,iz) = 2.0*(*m_pubnd)(nx,iy,iz) - (*u)(nx-1,iy,iz);;
+				(*u)(nx, iy, iz) = 2.0 * (*m_pubnd)(nx, iy, iz) - (*u)(nx - 1, iy, iz);
 				;
 			}
 		for (int ix = 0; ix < nx; ++ix)
@ -610,27 +587,20 @@ void solver<S,I,O,T>::setBC( unsigned ilevel )
 				(*u)(ix, iy, -1) = 2.0 * (*m_pubnd)(ix, iy, -1) - (*u)(ix, iy, 0);
 				(*u)(ix, iy, nz) = 2.0 * (*m_pubnd)(ix, iy, nz) - (*u)(ix, iy, nz - 1);
 			}
 	}
 }
 //... enforce periodic boundary conditions
-template< class S, class I, class O, typename T >
+template <class S, class I, class O>
-void solver<S,I,O,T>::make_periodic( MeshvarBnd<T> *u )
+void solver<S, I, O>::make_periodic(MeshvarBnd<real_t> *u)
 {
 	int
 			nx = u->size(0),
 			ny = u->size(1),
 			nz = u->size(2);
 	int nb = u->m_nbnd;
 	// if( u->offset(0) == 0 )
 	for (int iy = -nb; iy < ny + nb; ++iy)
 		for (int iz = -nb; iz < nz + nb; ++iz)
@ -642,7 +612,6 @@ void solver<S,I,O,T>::make_periodic( MeshvarBnd<T> *u )
 				(*u)(i, iy, iz) = (*u)(nx + i, iiy, iiz);
 				(*u)(nx - 1 - i, iy, iz) = (*u)(-1 - i, iiy, iiz);
 			}
 		}
 	// if( u->offset(1) == 0 )
@ -670,10 +639,8 @@ void solver<S,I,O,T>::make_periodic( MeshvarBnd<T> *u )
 				(*u)(ix, iy, nz - 1 - i) = (*u)(iix, iiy, -1 - i);
 			}
 		}
 }
 END_MULTIGRID_NAMESPACE
-#endif
+
--- a/src/plugins/output_enzo.cc
+++ b/src/plugins/output_enzo.cc
@ -230,19 +230,11 @@ protected:
 			HDFCreateFile(filename);
 			write_sim_header(filename, the_sim_header);
 #ifdef SINGLE_PRECISION
 			//... create full array in file
-			HDFHyperslabWriter3Ds<float> *slab_writer = new HDFHyperslabWriter3Ds<float>(filename, enzoname, nsz);
+			HDFHyperslabWriter3Ds<real_t> *slab_writer = new HDFHyperslabWriter3Ds<real_t>(filename, enzoname, nsz);
 			//... create buffer
-			float *data_buf = new float[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]];
+			real_t *data_buf = new real_t[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]];
 #else
 			//... create full array in file
 			HDFHyperslabWriter3Ds<double> *slab_writer = new HDFHyperslabWriter3Ds<double>(filename, enzoname, nsz);
 			//... create buffer
 			double *data_buf = new double[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]];
 #endif
 			//... write slice by slice
 			size_t slices_written = 0;
--- a/src/plugins/output_gadget2.cc
+++ b/src/plugins/output_gadget2.cc
@ -1390,7 +1390,5 @@ public:
 namespace
 {
 	output_plugin_creator_concrete<gadget2_output_plugin<float>> creator1("gadget2");
 #ifndef SINGLE_PRECISION
 	output_plugin_creator_concrete<gadget2_output_plugin<double>> creator2("gadget2_double");
 #endif
 }
--- a/src/plugins/output_gadget2_2comp.cc
+++ b/src/plugins/output_gadget2_2comp.cc
@ -18,11 +18,9 @@ template< typename T_store=float >
 class gadget2_2comp_output_plugin : public output_plugin
 {
 protected:
 	std::ofstream ofs_;
 	bool bmultimass_;
 	typedef struct io_header
 	{
 		unsigned int npart[6];
@ -47,7 +45,6 @@ protected:
 		char fill[52];
 	} header;
 	header header_;
 	std::string fname;
@ -55,8 +52,15 @@ protected:
 	bool do_glass_;
 	std::string fname_glass_baryon_, fname_glass_cdm_;
-	enum iofields {
+	enum iofields
-		id_dm_mass, id_dm_vel, id_dm_pos, id_gas_vel, id_gas_rho, id_gas_temp, id_gas_pos
+	{
 		id_dm_mass,
 		id_dm_vel,
 		id_dm_pos,
 		id_gas_vel,
 		id_gas_rho,
 		id_gas_temp,
 		id_gas_pos
 	};
 	size_t np_fine_gas_, np_fine_dm_, np_coarse_dm_;
@ -90,8 +94,9 @@ protected:
 				nf_dm_assigned += nfdm_pf[i];
 				nfgas_pf[i] = std::min(nnominal / 2ul, nfine_gas - nf_gas_assigned);
 				nf_gas_assigned += nfgas_pf[i];
-                
+			}
-            }else{
+			else
 			{
 				nfdm_pf[i] = std::min(nnominal, nfine_dm - nf_dm_assigned);
 				nf_dm_assigned += nfdm_pf[i];
 			}
@ -102,14 +107,12 @@ protected:
 				nc_pf[i] = std::min(nnominal - (size_t)(nfdm_pf[i] + nfgas_pf[i]), ncoarse - nc_assigned);
 				nc_assigned += nc_pf[i];
 			}
 		}
 		// make sure all particles are assigned
 		nfdm_pf[nfiles - 1] += nfine_dm - nf_dm_assigned;
 		nfgas_pf[nfiles - 1] += nfine_gas - nf_gas_assigned;
 		nc_pf[nfiles - 1] += ncoarse - nc_assigned;
 	}
 	std::ifstream &open_and_check(std::string ffname, size_t npart)
@ -156,7 +159,6 @@ protected:
 		pistream()
 		{
 		}
 		void open(std::string fname, size_t npart, size_t offset = 0)
@ -212,7 +214,6 @@ protected:
 		postream()
 		{
 		}
 		void open(std::string fname, size_t npart, size_t offset = 0)
@ -246,7 +247,6 @@ protected:
 	void assemble_gadget_file(void)
 	{
 		//............................................................................
 		//... copy from the temporary files, interleave the data and save ............
@ -311,11 +311,12 @@ protected:
 		distribute_particles(nfiles_, np_fine_dm_, np_fine_gas_, np_coarse_dm_,
 												 nfdm_per_file, nfgas_per_file, nc_per_file);
 		if (nfiles_ > 1)
 		{
 			std::cout << " - Gadget2 : distributing particles to " << nfiles_ << " files\n"
-			<< "                 " << std::setw(12) << "type 1" << "," << std::setw(12) << "type 2" << "," << std::setw(12) << "type 5" << std::endl;
+								<< "                 " << std::setw(12) << "type 1"
 								<< "," << std::setw(12) << "type 2"
 								<< "," << std::setw(12) << "type 5" << std::endl;
 			for (unsigned i = 0; i < nfiles_; ++i)
 			{
 				std::cout << "      file " << std::setw(3) << i << " : "
@ -325,7 +326,6 @@ protected:
 			}
 		}
 		size_t curr_block_buf_size = block_buf_size_;
 		size_t idcount = 0;
@ -336,7 +336,6 @@ protected:
 			music::wlog.Print("Need long particle IDs, make sure to enable in Gadget!");
 		}
 		for (unsigned ifile = 0; ifile < nfiles_; ++ifile)
 		{
@ -345,11 +344,12 @@ protected:
 				char ffname[256];
 				sprintf(ffname, "%s.%d", fname_.c_str(), ifile);
 				ofs_.open(ffname, std::ios::binary | std::ios::trunc);
-			}else{
+			}
 			else
 			{
 				ofs_.open(fname_.c_str(), std::ios::binary | std::ios::trunc);
 			}
 			size_t np_this_file = nfgas_per_file[ifile] + nfdm_per_file[ifile] + nc_per_file[ifile];
 			int blksize = sizeof(header);
@ -361,12 +361,10 @@ protected:
 			this_header.npart[2] = nfgas_per_file[ifile];
 			this_header.npart[5] = nc_per_file[ifile];
 			ofs_.write((char *)&blksize, sizeof(int));
 			ofs_.write((char *)&this_header, sizeof(header));
 			ofs_.write((char *)&blksize, sizeof(int));
 			//... particle positions ..................................................
 			blksize = 3ul * np_this_file * sizeof(T_store);
 			ofs_.write((char *)&blksize, sizeof(int));
@ -431,15 +429,10 @@ protected:
 				iffs1.close();
 				iffs2.close();
 				iffs3.close();
 			}
 			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
 			//... particle velocities ..................................................
 			blksize = 3ul * np_this_file * sizeof(T_store);
 			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
@ -505,8 +498,6 @@ protected:
 				iffs1.close();
 				iffs2.close();
 				iffs3.close();
 			}
 			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
@ -527,7 +518,6 @@ protected:
 			if (bneed_long_ids)
 				blksize = sizeof(size_t) * np_this_file;
 			//... generate contiguous IDs and store in file ..
 			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
 			while (n2read > 0ul)
@ -537,7 +527,9 @@ protected:
 					for (size_t i = 0; i < n2read; ++i)
 						long_ids[i] = idcount++;
 					ofs_.write(reinterpret_cast<char *>(&long_ids[0]), n2read * sizeof(size_t));
-                }else{
+				}
 				else
 				{
 					for (size_t i = 0; i < n2read; ++i)
 						short_ids[i] = idcount++;
 					ofs_.write(reinterpret_cast<char *>(&short_ids[0]), n2read * sizeof(unsigned));
@ -550,19 +542,15 @@ protected:
 			std::vector<unsigned>().swap(short_ids);
 			std::vector<size_t>().swap(long_ids);
 			//... particle masses .......................................................
 			// multi-mass not supported here
 			ofs_.flush();
 			ofs_.close();
 			wrote_gas += nfgas_per_file[ifile];
 			wrote_dm += nfdm_per_file[ifile] + nc_per_file[ifile];
 			wrote_coarse += nc_per_file[ifile];
 		}
 		delete[] tmp1;
@ -640,12 +628,9 @@ protected:
 			disp += f7 * (*gh.get_grid(levelmax_))(i1, j1, k);
 			disp += f8 * (*gh.get_grid(levelmax_))(i1, j1, k1);
 			disp = fmodf((1.0f + disp) * header_.BoxSize, header_.BoxSize);
 			valp[ip] = disp;
 		}
 	}
@ -708,13 +693,10 @@ protected:
 			vel *= vfac;
 			valp[ip] = vel;
 		}
 	}
 public:
 	bool do_baryons_;
 	double omegab_, omegac_, omegam_;
 	double gamma_;
@ -732,8 +714,6 @@ public:
 		nfiles_ = cf.get_value_safe<unsigned>("output", "gadget_num_files", 1);
 		/****************************************/
 		if (nfiles_ > 1)
 		{
@ -749,7 +729,9 @@ public:
 				}
 				ofs_.close();
 			}
-		}else{
+		}
 		else
 		{
 			ofs_.open(fname_.c_str(), std::ios::binary | std::ios::trunc);
 			if (!ofs_.good())
 			{
@ -767,7 +749,6 @@ public:
 		if (levelmax_ == levelmin_)
 			bmultimass_ = false;
 		for (int i = 0; i < 6; ++i)
 		{
 			header_.npart[i] = 0;
@ -796,7 +777,6 @@ public:
 				fname_glass_baryon_ = fname_glass_cdm_; // cf.get_value<std::string>("output","glass_file_baryon");
 		}
 		//... set time ......................................................
 		header_.redshift = cf.get_value<double>("setup", "zstart");
 		header_.time = 1.0 / (1.0 + header_.redshift);
@ -819,13 +799,9 @@ public:
 		header_.flag_stellarage = 0;
 		header_.flag_metals = 0;
 		header_.flag_entropy_instead_u = 0;
-#ifdef SINGLE_PRECISION
+
-                header_.flag_doubleprecision = 0;
+		header_.flag_doubleprecision = typeid(real_t) == typeid(double) ? 1 : 0;
 #else
                header_.flag_doubleprecision = 1; 
 #endif
 		std::cout << "header_.flag_doubleprecision " << header_.flag_doubleprecision << "\n";
 		header_.flag_ic_info = 0;
@ -833,7 +809,6 @@ public:
 			header_.BoxSize *= 1000.0;
 	}
 	void write_dm_mass(const grid_hierarchy &gh)
 	{
 		double rhoc = 27.7519737; // in h^2 1e10 M_sol / Mpc^3
@ -904,7 +879,6 @@ public:
 			if (ofs_temp.bad())
 				throw std::runtime_error("I/O error while writing temporary file for masses");
 		}
 		else if (gh.levelmax() != gh.levelmin())
 		{
@ -912,7 +886,6 @@ public:
 		}
 	}
 	void write_dm_position(int coord, const grid_hierarchy &gh)
 	{
 		//... count number of leaf cells ...//
@ -922,7 +895,6 @@ public:
 		if (bmultimass_)
 			npcoarse = gh.count_leaf_cells(gh.levelmin(), gh.levelmax() - 1);
 		np_fine_dm_ = npfine;
 		np_fine_gas_ = do_baryons_ ? npfine : 0ul;
 		np_coarse_dm_ = npcoarse;
@ -971,7 +943,6 @@ public:
 		if (do_baryons_ && !do_glass_)
 		{
 			double h = 1. / (1 << gh.levelmax());
 			if (shift == NULL)
@ -980,7 +951,6 @@ public:
 				shift[0] = 0.0;
 				shift[1] = 0.0;
 				shift[2] = 0.0;
 			}
 			shift[0] -= 0.5 * h * omegab_ / omegam_;
 			shift[1] -= 0.5 * h * omegab_ / omegam_;
@ -1044,10 +1014,8 @@ public:
 			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
 			// assert( blksz == glasshead.npart[1]*sizeof(float)*3 );
 			float lglass = glasshead.BoxSize;
 			blksize = sizeof(T_store) * glasshead.npart[1];
 			// ofs_temp.write( (char *)&blksize, sizeof(int) );
 			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
@ -1089,7 +1057,6 @@ public:
 				npartdone += npart2read;
 				nwritten += npart2read;
 			}
 			delete[] pos_tmp;
@ -1119,11 +1086,8 @@ public:
 									temp_data.push_back(xx[coord]);
 								}
 							}
 		}
 		if (temp_data.size() > 0)
 		{
 			ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * temp_data.size());
@ -1143,7 +1107,6 @@ public:
 		if (shift != NULL)
 			delete[] shift;
 	}
 	void write_dm_velocity(int coord, const grid_hierarchy &gh)
@ -1181,7 +1144,6 @@ public:
 		sprintf(temp_fname, "___ic_temp_%05d.bin", 100 * id_dm_vel + coord);
 		std::ofstream ofs_temp(temp_fname, std::ios::binary | std::ios::trunc);
 		if (!do_glass_)
 		{
@ -1203,7 +1165,6 @@ public:
 									temp_data.clear();
 									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
 								}
 							}
 		}
 		else
@ -1251,7 +1212,6 @@ public:
 				npartdone += npart2read;
 				nwritten += npart2read;
 			}
 			delete[] pos_tmp;
@ -1272,10 +1232,7 @@ public:
 									temp_data.clear();
 									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
 								}
 							}
 		}
 		if (temp_data.size() > 0)
@ -1301,12 +1258,12 @@ public:
 	}
 	void write_dm_potential(const grid_hierarchy &gh)
-	{ }
+	{
 	}
 	void write_gas_potential(const grid_hierarchy &gh)
-	{ }
+	{
-	
+	}
 	//... write data for gas -- don't do this
 	void write_gas_velocity(int coord, const grid_hierarchy &gh)
@ -1340,13 +1297,11 @@ public:
 		unsigned long long blksize;
 		if (!do_glass_)
 		{
 			blksize = sizeof(T_store) * npart;
 			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
 			const unsigned ilevel = gh.levelmax();
 			const unsigned
 					nx = gh.get_grid(ilevel)->size(0),
@ -1368,9 +1323,10 @@ public:
 							temp_data.clear();
 							temp_data.push_back(v * vfac);
 						}
 					}
-		}else{
+		}
 		else
 		{
 			std::ifstream ofg(fname_glass_baryon_.c_str(), std::ios::binary);
@ -1408,7 +1364,6 @@ public:
 			size_t npartdone = 0;
 			size_t npinter = (size_t)glasshead.npart[2];
 			blksize = sizeof(T_store) * npinter;
 			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
@ -1445,7 +1400,6 @@ public:
 									temp_data.clear();
 									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
 								}
 							}
 		}
@ -1466,7 +1420,6 @@ public:
 		ofs_temp.close();
 	}
 	//... write only for fine level
 	void write_gas_position(int coord, const grid_hierarchy &gh)
 	{
@ -1505,7 +1458,6 @@ public:
 		std::vector<T_store> temp_data;
 		temp_data.reserve(block_buf_size_);
 		char temp_fname[256];
 		sprintf(temp_fname, "___ic_temp_%05d.bin", 100 * id_gas_pos + coord);
 		std::ofstream ofs_temp(temp_fname, std::ios::binary | std::ios::trunc);
@ -1526,7 +1478,6 @@ public:
 				shift[0] = 0.0;
 				shift[1] = 0.0;
 				shift[2] = 0.0;
 			}
 			shift[0] += 0.5 * h * omegac_ / omegam_;
 			shift[1] += 0.5 * h * omegac_ / omegam_;
@ -1568,9 +1519,10 @@ public:
 							temp_data.clear();
 							temp_data.push_back(xx[coord]);
 						}
 					}
-		}else{
+		}
 		else
 		{
 			std::ifstream ofg(fname_glass_baryon_.c_str(), std::ios::binary);
@ -1599,7 +1551,6 @@ public:
 			float lglass = glasshead.BoxSize;
 			blksize = sizeof(T_store) * glasshead.npart[1];
 			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
@ -1633,7 +1584,6 @@ public:
 				npartdone += npart2read;
 				nwritten += npart2read;
 			}
 			delete[] pos_tmp;
@ -1678,12 +1628,8 @@ public:
 	}
 };
-
+namespace
-
+{
 namespace{
 	output_plugin_creator_concrete<gadget2_2comp_output_plugin<float>> creator1("gadget2_2c");
 #ifndef SINGLE_PRECISION
 	output_plugin_creator_concrete<gadget2_2comp_output_plugin<double>> creator2("gadget2_2c_double");
 #endif
 }
--- a/src/plugins/output_gadget_tetmesh.cc
+++ b/src/plugins/output_gadget_tetmesh.cc
@ -1573,8 +1573,6 @@ public:
 namespace{
 	output_plugin_creator_concrete< gadget_tetmesh_output_plugin<float> > creator1("gadget_tetmesh");
 #ifndef SINGLE_PRECISION
 	output_plugin_creator_concrete< gadget_tetmesh_output_plugin<double> > creator2("gadget_tetmesh_double");
 #endif
 }
--- a/src/plugins/output_tipsy.cc
+++ b/src/plugins/output_tipsy.cc
@ -1107,9 +1107,7 @@ int tipsy_output_plugin<double>::xdr_dump( XDR *xdrs, double*p )
 namespace{
    output_plugin_creator_concrete< tipsy_output_plugin<float> > creator1("tipsy");
    //#ifndef SINGLE_PRECISION
    output_plugin_creator_concrete< tipsy_output_plugin<double> > creator2("tipsy_double");
    //#endif
 }
--- a/src/plugins/output_tipsy_resample.cc
+++ b/src/plugins/output_tipsy_resample.cc
@ -1396,9 +1396,7 @@ int tipsy_output_plugin_res < double >::xdr_dump (XDR * xdrs, double *p)
 namespace
 {
  output_plugin_creator_concrete< tipsy_output_plugin_res<float> >creator1 ("tipsy_resample");
 #ifndef SINGLE_PRECISION
  output_plugin_creator_concrete< tipsy_output_plugin_res<double> >creator2 ("tipsy_double_resample");
 #endif
 }
 #endif // defined(HAVE_TIRPC) 
--- a/src/plugins/random_music_wnoise_generator.cc
+++ b/src/plugins/random_music_wnoise_generator.cc
@ -40,8 +40,8 @@ void rapid_proto_ngenic_rng(size_t res, long baseseed, music_wnoise_generator<T>
 			seedtable[(res - 1 - j) * res + (res - 1 - i)] = 0x7fffffff * gsl_rng_uniform(random_generator);
 	}
-	fftw_real *rnoise = new fftw_real[res * res * (res + 2)];
+	real_t *rnoise = new real_t[res * res * (res + 2)];
-	fftw_complex *knoise = reinterpret_cast<fftw_complex *>(rnoise);
+	complex_t *knoise = reinterpret_cast<complex_t *>(rnoise);
 	double fnorm = 1. / sqrt(res * res * res);
@ -126,26 +126,9 @@ void rapid_proto_ngenic_rng(size_t res, long baseseed, music_wnoise_generator<T>
 	delete[] seedtable;
 	//... perform FT to real space
-
+	fftw_plan_t plan = FFTW_API(plan_dft_c2r_3d)(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
-#ifdef FFTW3
+	FFTW_API(execute)(plan);
-#ifdef SINGLE_PRECISION
+	FFTW_API(destroy_plan)(plan);
 	fftwf_plan plan = fftwf_plan_dft_c2r_3d(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
 	fftwf_execute(plan);
 	fftwf_destroy_plan(plan);
 #else
 	fftw_plan plan = fftw_plan_dft_c2r_3d(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
 	fftw_execute(plan);
 	fftw_destroy_plan(plan);
 #endif
 #else
 	rfftwnd_plan plan = rfftw3d_create_plan(res, res, res, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), plan, knoise, NULL);
 #else
 	rfftwnd_one_complex_to_real(plan, knoise, NULL);
 #endif
 	rfftwnd_destroy_plan(plan);
 #endif
 	// copy to array that holds the random numbers
@ -443,37 +426,25 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
  ncubes_ = 1;
  baseseed_ = -2;
-  if (sizeof(fftw_real) != sizeof(T))
+  if (sizeof(real_t) != sizeof(T))
  {
-    music::elog.Print("type mismatch with fftw_real in k-space averaging");
+    music::elog.Print("type mismatch with real_t in k-space averaging");
-    throw std::runtime_error("type mismatch with fftw_real in k-space averaging");
+    throw std::runtime_error("type mismatch with real_t in k-space averaging");
  }
-  fftw_real
+  real_t
-      *rfine = new fftw_real[(size_t)rc.res_ * (size_t)rc.res_ * 2 * ((size_t)rc.res_ / 2 + 1)],
+      *rfine = new real_t[(size_t)rc.res_ * (size_t)rc.res_ * 2 * ((size_t)rc.res_ / 2 + 1)],
-      *rcoarse = new fftw_real[(size_t)res_ * (size_t)res_ * 2 * ((size_t)res_ / 2 + 1)];
+      *rcoarse = new real_t[(size_t)res_ * (size_t)res_ * 2 * ((size_t)res_ / 2 + 1)];
-  fftw_complex
+  complex_t
-      *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse),
+      *ccoarse = reinterpret_cast<complex_t *>(rcoarse),
-      *cfine = reinterpret_cast<fftw_complex *>(rfine);
+      *cfine = reinterpret_cast<complex_t *>(rfine);
  int nx(rc.res_), ny(rc.res_), nz(rc.res_), nxc(res_), nyc(res_), nzc(res_);
 #ifdef FFTW3
 #ifdef SINGLE_PRECISION
  fftwf_plan
      pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
      ipc = fftwf_plan_dft_c2r_3d(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
 #else
  fftw_plan
      pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
      ipc = fftw_plan_dft_c2r_3d(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
 #endif
-#else
+  fftw_plan_t
-  rfftwnd_plan
+      pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
-      pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
+      ipc = FFTW_API(plan_dft_c2r_3d)(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
      ipc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #endif
 #pragma omp parallel for
  for (int i = 0; i < nx; i++)
@ -484,19 +455,7 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
        rfine[q] = rc(i, j, k);
      }
-#ifdef FFTW3
+  FFTW_API(execute)(pf);
 #ifdef SINGLE_PRECISION
  fftwf_execute(pf);
 #else
  fftw_execute(pf);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
  rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
 #else
  rfftwnd_one_real_to_complex(pf, rfine, NULL);
 #endif
 #endif
  double fftnorm = 1.0 / ((double)nxc * (double)nyc * (double)nzc);
@ -532,19 +491,9 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
      }
  delete[] rfine;
-#ifdef FFTW3
+
-#ifdef SINGLE_PRECISION
+  FFTW_API(execute)(ipc);
-  fftwf_execute(ipc);
+
 #else
  fftw_execute(ipc);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
  rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipc, ccoarse, NULL);
 #else
  rfftwnd_one_complex_to_real(ipc, ccoarse, NULL);
 #endif
 #endif
  rnums_.push_back(new Meshvar<T>(res_, 0, 0, 0));
  cubemap_[0] = 0; // map all to single array
@ -563,18 +512,8 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
  delete[] rcoarse;
-#ifdef FFTW3
+  FFTW_API(destroy_plan)(pf);
-#ifdef SINGLE_PRECISION
+  FFTW_API(destroy_plan)(ipc);
  fftwf_destroy_plan(pf);
  fftwf_destroy_plan(ipc);
 #else
  fftw_destroy_plan(pf);
  fftw_destroy_plan(ipc);
 #endif
 #else
  rfftwnd_destroy_plan(pf);
  rfftwnd_destroy_plan(ipc);
 #endif
  double rmean, rvar;
  rmean = sum / count;
@ -617,24 +556,12 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
  size_t nx = lx[0], ny = lx[1], nz = lx[2],
          nxc = lx[0] / 2, nyc = lx[1] / 2, nzc = lx[2] / 2;
-  fftw_real *rfine = new fftw_real[nx * ny * (nz + 2l)];
+  real_t *rfine = new real_t[nx * ny * (nz + 2l)];
-  fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine);
+  complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
-#ifdef FFTW3
+  fftw_plan_t
-#ifdef SINGLE_PRECISION
+      pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
-  fftwf_plan
+      ipf = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
      pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
      ipf = fftwf_plan_dft_c2r_3d(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
 #else
  fftw_plan
      pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
      ipf = fftw_plan_dft_c2r_3d(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
 #endif
 #else
  rfftwnd_plan
      pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
      ipf = rfftw3d_create_plan(nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #endif
 #pragma omp parallel for
  for (int i = 0; i < (int)nx; i++)
@ -646,18 +573,10 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
      }
  // this->free_all_mem();	// temporarily free memory, allocate again later
-  fftw_real *rcoarse = new fftw_real[nxc * nyc * (nzc + 2)];
+  real_t *rcoarse = new real_t[nxc * nyc * (nzc + 2)];
-  fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse);
+  complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
-#ifdef FFTW3
+  fftw_plan pc = FFTW_API(plan_dft_r2c_3d)(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE);
 #ifdef SINGLE_PRECISION
  fftwf_plan pc = fftwf_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE);
 #else
  fftw_plan pc = fftw_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE);
 #endif
 #else
  rfftwnd_plan pc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #endif
 #pragma omp parallel for
  for (int i = 0; i < (int)nxc; i++)
@ -667,23 +586,9 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
        size_t q = ((size_t)i * (size_t)nyc + (size_t)j) * (size_t)(nzc + 2) + (size_t)k;
        rcoarse[q] = rc(x0[0] / 2 + i, x0[1] / 2 + j, x0[2] / 2 + k);
      }
-#ifdef FFTW3
+
-#ifdef SINGLE_PRECISION
+  FFTW_API(execute)(pc);
-  fftwf_execute(pc);
+  FFTW_API(execute)(pf);
  fftwf_execute(pf);
 #else
  fftw_execute(pc);
  fftw_execute(pf);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
  rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pc, rcoarse, NULL);
  rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
 #else
  rfftwnd_one_real_to_complex(pc, rcoarse, NULL);
  rfftwnd_one_real_to_complex(pf, rfine, NULL);
 #endif
 #endif
  double fftnorm = 1.0 / ((double)nx * (double)ny * (double)nz);
  double sqrt8 = sqrt(8.0);
@ -747,19 +652,8 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
          IM(cfine[q]) *= fftnorm;
        }
-#ifdef FFTW3
+
-#ifdef SINGLE_PRECISION
+    FFTW_API(execute)(ipf);
    fftwf_execute(ipf);
 #else
    fftw_execute(ipf);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
    rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipf, cfine, NULL);
 #else
    rfftwnd_one_complex_to_real(ipf, cfine, NULL);
 #endif
 #endif
 #pragma omp parallel for
    for (int i = 0; i < (int)nx; i++)
@ -772,21 +666,9 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
    delete[] rfine;
-#ifdef FFTW3
+    FFTW_API(destroy_plan)(pf);
-#ifdef SINGLE_PRECISION
+    FFTW_API(destroy_plan)(pc);
-    fftwf_destroy_plan(pf);
+    FFTW_API(destroy_plan)(ipf);
    fftwf_destroy_plan(pc);
    fftwf_destroy_plan(ipf);
 #else
    fftw_destroy_plan(pf);
    fftw_destroy_plan(pc);
    fftw_destroy_plan(ipf);
 #endif
 #else
    fftwnd_destroy_plan(pf);
    fftwnd_destroy_plan(pc);
    fftwnd_destroy_plan(ipf);
 #endif
 }
--- a/src/plugins/random_panphasia.cc
+++ b/src/plugins/random_panphasia.cc
@ -238,63 +238,25 @@ public:
 void RNG_panphasia::forward_transform_field(real_t *field, int nx, int ny, int nz)
 {
-  fftw_real *rfield = reinterpret_cast<fftw_real *>(field);
+  real_t *rfield = reinterpret_cast<real_t *>(field);
-  fftw_complex *cfield = reinterpret_cast<fftw_complex *>(field);
+  complex_t *cfield = reinterpret_cast<complex_t *>(field);
-#ifdef FFTW3
+  fftw_plan_t pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
 #ifdef SINGLE_PRECISION
  fftwf_plan pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
 #else
  fftw_plan pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
 #endif
 #else
  rfftwnd_plan pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #endif
-#ifdef FFTW3
+  FFTW_API(execute)(pf);
-#ifdef SINGLE_PRECISION
+
-  fftwf_execute(pf);
+  FFTW_API(destroy_plan)(pf);
 #else
  fftw_execute(pf);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
  rfftwnd_threads_one_real_to_complex(num_threads_, pf, rfield, NULL);
 #else
  rfftwnd_one_real_to_complex(pf, rfield, NULL);
 #endif
 #endif
 }
 void RNG_panphasia::backward_transform_field(real_t *field, int nx, int ny, int nz)
 {
-  fftw_real *rfield = reinterpret_cast<fftw_real *>(field);
+  real_t *rfield = reinterpret_cast<real_t *>(field);
-  fftw_complex *cfield = reinterpret_cast<fftw_complex *>(field);
+  complex_t *cfield = reinterpret_cast<complex_t *>(field);
-#ifdef FFTW3
+  fftw_plan_t ipf = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE);
-#ifdef SINGLE_PRECISION
+  FFTW_API(execute)(ipf);
-  fftwf_plan ipf = fftwf_plan_dft_c2r_3d(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE);
+  FFTW_API(destroy_plan(ipf));
 #else
  fftw_plan ipf = fftw_plan_dft_c2r_3d(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE);
 #endif
 #else
  rfftwnd_plan ipf = rfftw3d_create_plan(nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #endif
 #ifdef FFTW3
 #ifdef SINGLE_PRECISION
  fftwf_execute(ipf);
 #else
  fftw_execute(ipf);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
  rfftwnd_threads_one_complex_to_real(num_threads_, ipf, cfield, NULL);
 #else
  rfftwnd_one_complex_to_real(ipf, cfield, NULL);
 #endif
 #endif
 }
 #include <sys/time.h>
@ -309,8 +271,8 @@ inline double get_wtime(void)
 void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
 {
-  fftw_real *pr0, *pr1, *pr2, *pr3, *pr4;
+  real_t *pr0, *pr1, *pr2, *pr3, *pr4;
-  fftw_complex *pc0, *pc1, *pc2, *pc3, *pc4;
+  complex_t *pc0, *pc1, *pc2, *pc3, *pc4;
  // determine resolution and offset so that we can do proper resampling
  int ileft[3], ileft_corner[3], nx[3], nxremap[3];
@ -379,17 +341,17 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
  size_t ngp = nxremap[0] * nxremap[1] * (nxremap[2] + 2);
-  pr0 = new fftw_real[ngp];
+  pr0 = new real_t[ngp];
-  pr1 = new fftw_real[ngp];
+  pr1 = new real_t[ngp];
-  pr2 = new fftw_real[ngp];
+  pr2 = new real_t[ngp];
-  pr3 = new fftw_real[ngp];
+  pr3 = new real_t[ngp];
-  pr4 = new fftw_real[ngp];
+  pr4 = new real_t[ngp];
-  pc0 = reinterpret_cast<fftw_complex *>(pr0);
+  pc0 = reinterpret_cast<complex_t *>(pr0);
-  pc1 = reinterpret_cast<fftw_complex *>(pr1);
+  pc1 = reinterpret_cast<complex_t *>(pr1);
-  pc2 = reinterpret_cast<fftw_complex *>(pr2);
+  pc2 = reinterpret_cast<complex_t *>(pr2);
-  pc3 = reinterpret_cast<fftw_complex *>(pr3);
+  pc3 = reinterpret_cast<complex_t *>(pr3);
-  pc4 = reinterpret_cast<fftw_complex *>(pr4);
+  pc4 = reinterpret_cast<complex_t *>(pr4);
  music::ilog.Print("calculating PANPHASIA random numbers for level %d...", level);
  clear_panphasia_thread_states();
@ -782,7 +744,7 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
  {
    music::ulog.Print("Remapping fields from dimension %d -> %d", nxremap[0], nx_m[0]);
-    memset(pr1, 0, ngp * sizeof(fftw_real));
+    memset(pr1, 0, ngp * sizeof(real_t));
    #pragma omp parallel for
    for (int i = 0; i < nxremap[0]; i++)
@ -812,7 +774,7 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
          }
        }
-    memcpy(pr0, pr1, ngp * sizeof(fftw_real));
+    memcpy(pr0, pr1, ngp * sizeof(real_t));
  }
  // if (level == 9)
--- a/src/poisson.cc
+++ b/src/poisson.cc
@ -10,8 +10,8 @@
 /****** ABSTRACT FACTORY PATTERN IMPLEMENTATION *******/
-#include "poisson.hh"
+#include <poisson.hh>
-#include "Numerics.hh"
+#include <Numerics.hh>
 std::map<std::string, poisson_plugin_creator *> &
 get_poisson_plugin_map()
@ -40,23 +40,18 @@ void print_poisson_plugins()
 /****** CALL IMPLEMENTATIONS OF POISSON SOLVER CLASSES ******/
-#include "mg_solver.hh"
+#include <mg_solver.hh>
-#include "fd_schemes.hh"
+#include <fd_schemes.hh>
-#ifdef SINGLE_PRECISION
+
-typedef multigrid::solver<stencil_7P<float>, interp_O3_fluxcorr, mg_straight, float> poisson_solver_O2;
+typedef multigrid::solver<stencil_7P, interp_O3_fluxcorr, mg_straight> poisson_solver_O2;
-typedef multigrid::solver<stencil_13P<float>, interp_O5_fluxcorr, mg_straight, float> poisson_solver_O4;
+typedef multigrid::solver<stencil_13P, interp_O5_fluxcorr, mg_straight> poisson_solver_O4;
-typedef multigrid::solver<stencil_19P<float>, interp_O7_fluxcorr, mg_straight, float> poisson_solver_O6;
+typedef multigrid::solver<stencil_19P, interp_O7_fluxcorr, mg_straight> poisson_solver_O6;
 #else
 typedef multigrid::solver<stencil_7P<double>, interp_O3_fluxcorr, mg_straight, double> poisson_solver_O2;
 typedef multigrid::solver<stencil_13P<double>, interp_O5_fluxcorr, mg_straight, double> poisson_solver_O4;
 typedef multigrid::solver<stencil_19P<double>, interp_O7_fluxcorr, mg_straight, double> poisson_solver_O6;
 #endif
 /**************************************************************************************/
 /**************************************************************************************/
-double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
+real_t multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 {
 	music::ulog.Print("Initializing multi-grid Poisson solver...");
@ -68,11 +63,11 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 		std::cout << " - Invoking multi-grid Poisson solver..." << std::endl;
 	}
-	double acc = 1e-5, err;
+	real_t acc = 1e-5, err;
 	std::string ps_smoother_name;
 	unsigned ps_presmooth, ps_postsmooth, order;
-	acc = cf_.get_value_safe<double>("poisson", "accuracy", acc);
+	acc = cf_.get_value_safe<real_t>("poisson", "accuracy", acc);
 	ps_presmooth = cf_.get_value_safe<unsigned>("poisson", "pre_smooth", 3);
 	ps_postsmooth = cf_.get_value_safe<unsigned>("poisson", "post_smooth", 3);
 	ps_smoother_name = cf_.get_value_safe<std::string>("poisson", "smoother", "gs");
@ -102,12 +97,12 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 							<< "            reverting to \'gs\' (Gauss-Seidel)" << std::endl;
 	}
-	double tstart, tend;
+	real_t tstart, tend;
 #ifndef SINGLETHREAD_FFTW
 	tstart = omp_get_wtime();
 #else
-	tstart = (double)clock() / CLOCKS_PER_SEC;
+	tstart = (real_t)clock() / CLOCKS_PER_SEC;
 #endif
 	//----- run Poisson solver -----//
@ -142,7 +137,7 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	if (verbosity > 1)
 		std::cout << " - Poisson solver took " << tend - tstart << "s with " << omp_get_max_threads() << " threads." << std::endl;
 #else
-	tend = (double)clock() / CLOCKS_PER_SEC;
+	tend = (real_t)clock() / CLOCKS_PER_SEC;
 	if (verbosity > 1)
 		std::cout << " - Poisson solver took " << tend - tstart << "s." << std::endl;
@ -151,7 +146,7 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	return err;
 }
-double multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
+real_t multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
 {
 	Du = u;
@ -176,7 +171,7 @@ double multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hiera
 	return 0.0;
 }
-double multigrid_poisson_plugin::gradient_add(int dir, grid_hierarchy &u, grid_hierarchy &Du)
+real_t multigrid_poisson_plugin::gradient_add(int dir, grid_hierarchy &u, grid_hierarchy &Du)
 {
 	// Du = u;
@ -207,7 +202,7 @@ void multigrid_poisson_plugin::implementation::gradient_O2(int dir, grid_hierarc
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 		if (dir == 0)
@ -241,7 +236,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O2(int dir, grid_hie
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 		if (dir == 0)
@ -275,7 +270,7 @@ void multigrid_poisson_plugin::implementation::gradient_O4(int dir, grid_hierarc
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 		h /= 12.0;
@ -311,7 +306,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O4(int dir, grid_hie
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 		h /= 12.0;
@ -347,7 +342,7 @@ void multigrid_poisson_plugin::implementation::gradient_O6(int dir, grid_hierarc
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 		h /= 60.;
@ -385,7 +380,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O6(int dir, grid_hie
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 		h /= 60.;
@ -421,7 +416,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O6(int dir, grid_hie
 /**************************************************************************************/
 #include "general.hh"
-double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
+real_t fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 {
 	music::ulog.Print("Entering k-space Poisson solver...");
@ -446,8 +441,8 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	nzp = 2 * (nz / 2 + 1);
 	//... copy data ..................................................
-	fftw_real *data = new fftw_real[(size_t)nx * (size_t)ny * (size_t)nzp];
+	real_t *data = new real_t[(size_t)nx * (size_t)ny * (size_t)nzp];
-	fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data);
+	complex_t *cdata = reinterpret_cast<complex_t *>(data);
 #pragma omp parallel for
 	for (int i = 0; i < nx; ++i)
@ -461,37 +456,14 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	//... perform FFT and Poisson solve................................
 	music::ulog.Print("Performing forward transform.");
-#ifdef FFTW3
+	fftw_plan_t
-#ifdef SINGLE_PRECISION
+			plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
-	fftwf_plan
+			iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
 			plan = fftwf_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
 			iplan = fftwf_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
-	fftwf_execute(plan);
+	FFTW_API(execute)(plan);
 #else
 	fftw_plan
 			plan = fftw_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
 			iplan = fftw_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
-	fftw_execute(plan);
+	real_t kfac = 2.0 * M_PI;
-#endif
+	real_t fac = -1.0 / (real_t)((size_t)nx * (size_t)ny * (size_t)nz);
 #else
 	rfftwnd_plan
 			plan = rfftw3d_create_plan(nx, ny, nz,
 																 FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
 			iplan = rfftw3d_create_plan(nx, ny, nz,
 																	FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
 #else
 	rfftwnd_one_real_to_complex(plan, data, NULL);
 #endif
 #endif
 	double kfac = 2.0 * M_PI;
 	double fac = -1.0 / (double)((size_t)nx * (size_t)ny * (size_t)nz);
 #pragma omp parallel for
 	for (int i = 0; i < nx; ++i)
@ -504,11 +476,11 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 				int jj = j;
 				if (jj > ny / 2)
 					jj -= ny;
-				double ki = (double)ii;
+				real_t ki = (real_t)ii;
-				double kj = (double)jj;
+				real_t kj = (real_t)jj;
-				double kk = (double)k;
+				real_t kk = (real_t)k;
-				double kk2 = kfac * kfac * (ki * ki + kj * kj + kk * kk);
+				real_t kk2 = kfac * kfac * (ki * ki + kj * kj + kk * kk);
 				size_t idx = (size_t)(i * ny + j) * (size_t)(nzp / 2) + (size_t)k;
@ -521,26 +493,9 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	music::ulog.Print("Performing backward transform.");
-#ifdef FFTW3
+	FFTW_API(execute)(iplan);
-#ifdef SINGLE_PRECISION
+	FFTW_API(destroy_plan)(plan);
-	fftwf_execute(iplan);
+	FFTW_API(destroy_plan)(iplan);
 	fftwf_destroy_plan(plan);
 	fftwf_destroy_plan(iplan);
 #else
 	fftw_execute(iplan);
 	fftw_destroy_plan(plan);
 	fftw_destroy_plan(iplan);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
 #else
 	rfftwnd_one_complex_to_real(iplan, cdata, NULL);
 #endif
 	rfftwnd_destroy_plan(plan);
 	rfftwnd_destroy_plan(iplan);
 #endif
 //... copy data ..........................................
 #pragma omp parallel for
@ -596,7 +551,7 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	return 0.0;
 }
-double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
+real_t fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
 {
 	music::ulog.Print("Computing a gradient in k-space...\n");
@ -612,8 +567,8 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
 	nzp = 2 * (nz / 2 + 1);
 	//... copy data ..................................................
-	fftw_real *data = new fftw_real[(size_t)nx * (size_t)ny * (size_t)nzp];
+	real_t *data = new real_t[(size_t)nx * (size_t)ny * (size_t)nzp];
-	fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data);
+	complex_t *cdata = reinterpret_cast<complex_t *>(data);
 #pragma omp parallel for
 	for (int i = 0; i < nx; ++i)
@ -625,38 +580,14 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
 			}
 			//... perform FFT and Poisson solve................................
 	fftw_plan_t
 			plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
 			iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
-#ifdef FFTW3
+	FFTW_API(execute)(plan);
 #ifdef SINGLE_PRECISION
 	fftwf_plan
 			plan = fftwf_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
 			iplan = fftwf_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
-	fftwf_execute(plan);
+	real_t fac = -1.0 / (real_t)((size_t)nx * (size_t)ny * (size_t)nz);
-#else
+	real_t kfac = 2.0 * M_PI;
 	fftw_plan
 			plan = fftw_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
 			iplan = fftw_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
 	fftw_execute(plan);
 #endif
 #else
 	rfftwnd_plan
 			plan = rfftw3d_create_plan(nx, ny, nz,
 																 FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
 			iplan = rfftw3d_create_plan(nx, ny, nz,
 																	FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
 #else
 	rfftwnd_one_real_to_complex(plan, data, NULL);
 #endif
 #endif
 	double fac = -1.0 / (double)((size_t)nx * (size_t)ny * (size_t)nz);
 	double kfac = 2.0 * M_PI;
 	bool do_glass = cf_.get_value_safe<bool>("output", "glass", false);
 	bool deconvolve_cic = do_glass | cf_.get_value_safe<bool>("output", "glass_cicdeconvolve", false);
@ -671,98 +602,54 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
 			{
 				size_t idx = (size_t)(i * ny + j) * (size_t)(nzp / 2) + (size_t)k;
 				int ii = i;
-				if (ii > nx / 2)
+				if (ii > nx / 2) ii -= nx;
 					ii -= nx;
 				int jj = j;
-				if (jj > ny / 2)
+				if (jj > ny / 2) jj -= ny;
 					jj -= ny;
 				const double ki = (double)ii;
 				const double kj = (double)jj;
 				const double kk = (double)k;
-				const double kkdir[3] = {kfac * ki, kfac * kj, kfac * kk};
+				const real_t ki{(real_t)ii};
-				const double kdir = kkdir[dir];
+				const real_t kj{(real_t)jj};
 				const real_t kk{(real_t)k};
 				const real_t kkdir[3] = {kfac * ki, kfac * kj, kfac * kk};
 				const real_t kdir = kkdir[dir];
-				double re = RE(cdata[idx]);
+				real_t re = RE(cdata[idx]);
-				double im = IM(cdata[idx]);
+				real_t im = IM(cdata[idx]);
 				RE(cdata[idx]) = fac * im * kdir;
 				IM(cdata[idx]) = -fac * re * kdir;
 #ifdef FFTW3
 				if (deconvolve_cic)
 				{
-					double dfx, dfy, dfz;
+					real_t dfx, dfy, dfz;
-					dfx = M_PI * ki / (double)nx;
+					dfx = M_PI * ki / (real_t)nx;
-					dfx = (i != 0) ? sin(dfx) / dfx : 1.0;
+					dfx = (i != 0) ? std::sin(dfx) / dfx : 1.0;
-					dfy = M_PI * kj / (double)ny;
+					dfy = M_PI * kj / (real_t)ny;
-					dfy = (j != 0) ? sin(dfy) / dfy : 1.0;
+					dfy = (j != 0) ? std::sin(dfy) / dfy : 1.0;
-					dfz = M_PI * kk / (double)nz;
+					dfz = M_PI * kk / (real_t)nz;
-					dfz = (k != 0) ? sin(dfz) / dfz : 1.0;
+					dfz = (k != 0) ? std::sin(dfz) / dfz : 1.0;
 					dfx = 1.0 / (dfx * dfy * dfz);
 					dfx = dfx * dfx;
-					cdata[idx][0] *= dfx;
+					RE(cdata[idx]) *= dfx;
-					cdata[idx][1] *= dfx;
+					IM(cdata[idx]) *= dfx;
 				}
 #else
 				if (deconvolve_cic)
 				{
 					double dfx, dfy, dfz;
 					dfx = M_PI * ki / (double)nx;
 					dfx = (i != 0) ? sin(dfx) / dfx : 1.0;
 					dfy = M_PI * kj / (double)ny;
 					dfy = (j != 0) ? sin(dfy) / dfy : 1.0;
 					dfz = M_PI * kk / (double)nz;
 					dfz = (k != 0) ? sin(dfz) / dfz : 1.0;
 					dfx = 1.0 / (dfx * dfy * dfz);
 					dfx = dfx * dfx;
 					cdata[idx].re *= dfx;
 					cdata[idx].im *= dfx;
 				}
 #endif
 				if( (dir == 0 && i==nx/2) || (dir == 1 && j==ny/2) || (dir == 2 && k==nz/2) )
 				{
-#ifdef FFTW3
+					RE(cdata[idx]) = 0.0;
-					cdata[idx][0] = 0.0;
+					IM(cdata[idx]) = 0.0;
 					cdata[idx][1] = 0.0;
 #else
 					cdata[idx].re = 0.0;
 					cdata[idx].im = 0.0;
 #endif
 				}
 			}
 	RE(cdata[0]) = 0.0;
 	IM(cdata[0]) = 0.0;
-#ifdef FFTW3
+	FFTW_API(execute)(iplan);
-#ifdef SINGLE_PRECISION
+	FFTW_API(destroy_plan)(plan);
-	fftwf_execute(iplan);
+	FFTW_API(destroy_plan)(iplan);
 	fftwf_destroy_plan(plan);
 	fftwf_destroy_plan(iplan);
 #else
 	fftw_execute(iplan);
 	fftw_destroy_plan(plan);
 	fftw_destroy_plan(iplan);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
 #else
 	rfftwnd_one_complex_to_real(iplan, cdata, NULL);
 #endif
 	rfftwnd_destroy_plan(plan);
 	rfftwnd_destroy_plan(iplan);
 #endif
 	//... copy data ..........................................
-	double dmax = 0.0;
+	real_t dmax = 0.0;
 	for (int i = 0; i < nx; ++i)
 		for (int j = 0; j < ny; ++j)
 			for (int k = 0; k < nz; ++k)
@ -784,35 +671,35 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
 /**************************************************************************************/
 template <int order>
-double poisson_hybrid_kernel(int idir, int i, int j, int k, int n)
+real_t poisson_hybrid_kernel(int idir, int i, int j, int k, int n)
 {
 	return 1.0;
 }
 template <>
-inline double poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n)
+inline real_t poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n)
 {
 	if (i == 0 && j == 0 && k == 0)
 		return 0.0;
-	double
+	real_t
-			ki(M_PI * (double)i / (double)n),
+			ki(M_PI * (real_t)i / (real_t)n),
-			kj(M_PI * (double)j / (double)n),
+			kj(M_PI * (real_t)j / (real_t)n),
-			kk(M_PI * (double)k / (double)n),
+			kk(M_PI * (real_t)k / (real_t)n),
 			kr(sqrt(ki * ki + kj * kj + kk * kk));
-	double grad = 1.0, laplace = 1.0;
+	real_t grad = 1.0, laplace = 1.0;
 	if (idir == 0)
-		grad = sin(ki);
+		grad = std::sin(ki);
 	else if (idir == 1)
-		grad = sin(kj);
+		grad = std::sin(kj);
 	else
-		grad = sin(kk);
+		grad = std::sin(kk);
-	laplace = 2.0 * ((-cos(ki) + 1.0) + (-cos(kj) + 1.0) + (-cos(kk) + 1.0));
+	laplace = 2.0 * ((-std::cos(ki) + 1.0) + (-std::cos(kj) + 1.0) + (-std::cos(kk) + 1.0));
-	double kgrad = 1.0;
+	real_t kgrad = 1.0;
 	if (idir == 0)
 		kgrad = ki;
 	else if (idir == 1)
@ -824,30 +711,30 @@ inline double poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n)
 }
 template <>
-inline double poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n)
+inline real_t poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n)
 {
 	if (i == 0 && j == 0 && k == 0)
 		return 0.0;
-	double
+	real_t
-			ki(M_PI * (double)i / (double)n),
+			ki(M_PI * (real_t)i / (real_t)n),
-			kj(M_PI * (double)j / (double)n),
+			kj(M_PI * (real_t)j / (real_t)n),
-			kk(M_PI * (double)k / (double)n),
+			kk(M_PI * (real_t)k / (real_t)n),
 			kr(sqrt(ki * ki + kj * kj + kk * kk));
-	double grad = 1.0, laplace = 1.0;
+	real_t grad = 1.0, laplace = 1.0;
 	if (idir == 0)
-		grad = 0.166666666667 * (-sin(2. * ki) + 8. * sin(ki));
+		grad = 0.166666666667 * (-std::sin(2. * ki) + 8. * std::sin(ki));
 	else if (idir == 1)
-		grad = 0.166666666667 * (-sin(2. * kj) + 8. * sin(kj));
+		grad = 0.166666666667 * (-std::sin(2. * kj) + 8. * std::sin(kj));
 	else if (idir == 2)
-		grad = 0.166666666667 * (-sin(2. * kk) + 8. * sin(kk));
+		grad = 0.166666666667 * (-std::sin(2. * kk) + 8. * std::sin(kk));
-	laplace = 0.1666666667 * ((cos(2 * ki) - 16. * cos(ki) + 15.) + (cos(2 * kj) - 16. * cos(kj) + 15.) + (cos(2 * kk) - 16. * cos(kk) + 15.));
+	laplace = 0.1666666667 * ((std::cos(2 * ki) - 16. * std::cos(ki) + 15.) + (std::cos(2 * kj) - 16. * std::cos(kj) + 15.) + (std::cos(2 * kk) - 16. * std::cos(kk) + 15.));
-	double kgrad = 1.0;
+	real_t kgrad = 1.0;
 	if (idir == 0)
 		kgrad = ki;
 	else if (idir == 1)
@ -859,29 +746,29 @@ inline double poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n)
 }
 template <>
-inline double poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n)
+inline real_t poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n)
 {
-	double
+	real_t
-			ki(M_PI * (double)i / (double)n),
+			ki(M_PI * (real_t)i / (real_t)n),
-			kj(M_PI * (double)j / (double)n),
+			kj(M_PI * (real_t)j / (real_t)n),
-			kk(M_PI * (double)k / (double)n),
+			kk(M_PI * (real_t)k / (real_t)n),
 			kr(sqrt(ki * ki + kj * kj + kk * kk));
 	if (i == 0 && j == 0 && k == 0)
 		return 0.0;
-	double grad = 1.0, laplace = 1.0;
+	real_t grad = 1.0, laplace = 1.0;
 	if (idir == 0)
-		grad = 0.0333333333333 * (sin(3. * ki) - 9. * sin(2. * ki) + 45. * sin(ki));
+		grad = 0.0333333333333 * (std::sin(3. * ki) - 9. * std::sin(2. * ki) + 45. * std::sin(ki));
 	else if (idir == 1)
-		grad = 0.0333333333333 * (sin(3. * kj) - 9. * sin(2. * kj) + 45. * sin(kj));
+		grad = 0.0333333333333 * (std::sin(3. * kj) - 9. * std::sin(2. * kj) + 45. * std::sin(kj));
 	else if (idir == 2)
-		grad = 0.0333333333333 * (sin(3. * kk) - 9. * sin(2. * kk) + 45. * sin(kk));
+		grad = 0.0333333333333 * (std::sin(3. * kk) - 9. * std::sin(2. * kk) + 45. * std::sin(kk));
-	laplace = 0.01111111111111 * ((-2. * cos(3.0 * ki) + 27. * cos(2. * ki) - 270. * cos(ki) + 245.) + (-2. * cos(3.0 * kj) + 27. * cos(2. * kj) - 270. * cos(kj) + 245.) + (-2. * cos(3.0 * kk) + 27. * cos(2. * kk) - 270. * cos(kk) + 245.));
+	laplace = 0.01111111111111 * ((-2. * std::cos(3.0 * ki) + 27. * std::cos(2. * ki) - 270. * std::cos(ki) + 245.) + (-2. * std::cos(3.0 * kj) + 27. * std::cos(2. * kj) - 270. * std::cos(kj) + 245.) + (-2. * std::cos(3.0 * kk) + 27. * std::cos(2. * kk) - 270. * std::cos(kk) + 245.));
-	double kgrad = 1.0;
+	real_t kgrad = 1.0;
 	if (idir == 0)
 		kgrad = ki;
 	else if (idir == 1)
@ -896,42 +783,19 @@ inline double poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n)
 }
 template <int order>
-void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, bool periodic, bool deconvolve_cic)
+void do_poisson_hybrid(real_t *data, int idir, int nxp, int nyp, int nzp, bool periodic, bool deconvolve_cic)
 {
-	double fftnorm = 1.0 / ((double)nxp * (double)nyp * (double)nzp);
+	real_t fftnorm = 1.0 / ((real_t)nxp * (real_t)nyp * (real_t)nzp);
-	fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data);
+	complex_t *cdata = reinterpret_cast<complex_t *>(data);
 	if (deconvolve_cic)
 		music::ilog.Print("CIC deconvolution step is enabled.");
-#ifdef FFTW3
+	fftw_plan_t iplan, plan;
-#ifdef SINGLE_PRECISION
+	plan = FFTW_API(plan_dft_r2c_3d)(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE);
-	fftwf_plan iplan, plan;
+	iplan = FFTW_API(plan_dft_c2r_3d)(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
-	plan = fftwf_plan_dft_r2c_3d(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE);
+	FFTW_API(execute)(plan);
 	iplan = fftwf_plan_dft_c2r_3d(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
 	fftwf_execute(plan);
 #else
 	fftw_plan iplan, plan;
 	plan = fftw_plan_dft_r2c_3d(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE);
 	iplan = fftw_plan_dft_c2r_3d(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
 	fftw_execute(plan);
 #endif
 #else
 	rfftwnd_plan iplan, plan;
 	plan = rfftw3d_create_plan(nxp, nyp, nzp,
 														 FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
 	iplan = rfftw3d_create_plan(nxp, nyp, nzp,
 															FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
 #else
 	rfftwnd_one_real_to_complex(plan, data, NULL);
 #endif
 #endif
 #pragma omp parallel for
 	for (int i = 0; i < nxp; ++i)
@ -948,22 +812,22 @@ void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, boo
 					kj -= nyp;
 				//... apply hybrid correction
-				double dk = poisson_hybrid_kernel<order>(idir, ki, kj, k, nxp / 2);
+				real_t dk = poisson_hybrid_kernel<order>(idir, ki, kj, k, nxp / 2);
-				fftw_real re = RE(cdata[ii]), im = IM(cdata[ii]);
+				real_t re = RE(cdata[ii]), im = IM(cdata[ii]);
 				RE(cdata[ii]) = -im * dk * fftnorm;
 				IM(cdata[ii]) = re * dk * fftnorm;
 				if (deconvolve_cic)
 				{
-					double dfx, dfy, dfz;
+					real_t dfx, dfy, dfz;
-					dfx = M_PI * ki / (double)nxp;
+					dfx = M_PI * ki / (real_t)nxp;
-					dfx = (i != 0) ? sin(dfx) / dfx : 1.0;
+					dfx = (i != 0) ? std::sin(dfx) / dfx : 1.0;
-					dfy = M_PI * kj / (double)nyp;
+					dfy = M_PI * kj / (real_t)nyp;
-					dfy = (j != 0) ? sin(dfy) / dfy : 1.0;
+					dfy = (j != 0) ? std::sin(dfy) / dfy : 1.0;
-					dfz = M_PI * kk / (double)nzp;
+					dfz = M_PI * kk / (real_t)nzp;
-					dfz = (k != 0) ? sin(dfz) / dfz : 1.0;
+					dfz = (k != 0) ? std::sin(dfz) / dfz : 1.0;
 					dfx = 1.0 / (dfx * dfy * dfz);
 					dfx = dfx * dfx;
@ -981,26 +845,9 @@ void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, boo
 	RE(cdata[0]) = 0.0;
 	IM(cdata[0]) = 0.0;
-#ifdef FFTW3
+	FFTW_API(execute)(iplan);
-#ifdef SINGLE_PRECISION
+	FFTW_API(destroy_plan)(plan);
-	fftwf_execute(iplan);
+	FFTW_API(destroy_plan)(iplan);
 	fftwf_destroy_plan(plan);
 	fftwf_destroy_plan(iplan);
 #else
 	fftw_execute(iplan);
 	fftw_destroy_plan(plan);
 	fftw_destroy_plan(iplan);
 #endif
 #else
 #ifndef SINGLETHREAD_FFTW
 	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
 #else
 	rfftwnd_one_complex_to_real(iplan, cdata, NULL);
 #endif
 	rfftwnd_destroy_plan(plan);
 	rfftwnd_destroy_plan(iplan);
 #endif
 }
 template <typename T>
@ -1008,7 +855,7 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
 {
 	int nx = f.size(0), ny = f.size(1), nz = f.size(2), nxp, nyp, nzp;
-	fftw_real *data;
+	real_t *data;
 	int xo = 0, yo = 0, zo = 0;
 	int nmax = std::max(nx, std::max(ny, nz));
@ -1018,12 +865,12 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
 	if (!periodic)
 	{
-		nxp = nmax + 2 * boundary; // 2*nmax;
+		nxp = nmax + 2 * boundary; 
-		nyp = nmax + 2 * boundary; // 2*nmax;
+		nyp = nmax + 2 * boundary; 
-		nzp = nmax + 2 * boundary; // 2*nmax;
+		nzp = nmax + 2 * boundary; 
-		xo = boundary;						 // nmax/2;
+		xo = boundary;						 
-		yo = boundary;						 // nmax/2;
+		yo = boundary;						 
-		zo = boundary;						 // nmax/2;
+		zo = boundary;						 
 	}
 	else
 	{
@ -1032,17 +879,11 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
 		nzp = nmax;
 	}
-	data = new fftw_real[(size_t)nxp * (size_t)nyp * (size_t)(nzp + 2)];
+	data = new real_t[(size_t)nxp * (size_t)nyp * (size_t)(nzp + 2)];
 	if (idir == 0)
 		std::cout << "   - Performing hybrid Poisson step... (" << nxp << ", " << nyp << ", " << nzp << ")\n";
 		// size_t N = (size_t)nxp*(size_t)nyp*2*((size_t)nzp/2+1);
 		// #pragma omp parallel for
 		// for( size_t i=0; i<N; ++i )
 		//	data[i]=0.0;
 #pragma omp parallel for
 	for (int i = 0; i < nxp; ++i)
 		for (int j = 0; j < nyp; ++j)
@ -1097,7 +938,7 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
 /**************************************************************************************/
 /**************************************************************************************/
-template void poisson_hybrid<MeshvarBnd<double>>(MeshvarBnd<double> &f, int idir, int order, bool periodic, bool deconvolve_cic);
+template void poisson_hybrid<MeshvarBnd<real_t>>(MeshvarBnd<real_t> &f, int idir, int order, bool periodic, bool deconvolve_cic);
 template void poisson_hybrid<MeshvarBnd<float>>(MeshvarBnd<float> &f, int idir, int order, bool periodic, bool deconvolve_cic);
 namespace
--- a/src/solver.hh
+++ b/src/solver.hh
--- a/src/system_stat.hh
+++ b/src/system_stat.hh
@ -0,0 +1,194 @@
 // This file is part of MUSIC2
 // A software package to generate ICs for cosmological simulations
 // Copyright (C) 2020-23 by Oliver Hahn
 // 
 // MUSIC2 is free software: you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation, either version 3 of the License, or
 // (at your option) any later version.
 // 
 // MUSIC2 is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
 // 
 // You should have received a copy of the GNU General Public License
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #pragma once
 #ifdef __APPLE__
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #include <mach/mach.h>
 #include <mach/vm_statistics.h>
 #include <mach/mach_types.h>
 #include <mach/mach_init.h>
 #include <mach/mach_host.h>
 #include <unistd.h>
 #elif __linux__
 #include <cstring>
 #include <cstdio>
 #include <strings.h>
 #endif
 #include <string>
 namespace SystemStat
 {
 class Cpu
 {
 public:
    Cpu() {}
    std::string get_CPUstring() const
    {
 #ifdef __APPLE__
        char buffer[1024];
        size_t size = sizeof(buffer);
        if (sysctlbyname("machdep.cpu.brand_string", &buffer, &size, NULL, 0) < 0)
        {
            return "";
        }
        return std::string(buffer);
 #elif __linux__
        std::string str = "";
        FILE *cpuinfo = fopen("/proc/cpuinfo", "rb");
        char *arg = 0;
        size_t size = 0;
        while (getdelim(&arg, &size, '\n', cpuinfo) != -1)
        {
            if (strncmp(arg, "model name", 10) == 0)
            {
                str = std::string(arg + 13);
                break;
            }
        }
        free(arg);
        fclose(cpuinfo);
        //remove newline characters from string
        str.erase(std::remove(str.begin(), str.end(), '\n'), str.end());
        return str;
 #endif
    }
 };
 class Memory
 {
 private:
    size_t total;
    size_t avail;
    size_t used;
 public:
    Memory()
        : total(0), avail(0), used(0)
    {
        this->get_statistics();
    }
    size_t get_TotalMem() const { return this->total; }
    size_t get_AvailMem() const { return this->avail; }
    size_t get_UsedMem() const { return this->used; }
    void update() { this->get_statistics(); }
 protected:
    int get_statistics(void)
    {
 #ifdef __APPLE__
        int64_t pagesize = int64_t(getpagesize());
        int mib[2] = {CTL_HW, HW_MEMSIZE};
        size_t length = sizeof(size_t);
        sysctl(mib, 2, &this->total, &length, nullptr, 0);
        vm_statistics64 vmstat;
        natural_t mcount = HOST_VM_INFO64_COUNT;
        if (host_statistics64(mach_host_self(), HOST_VM_INFO64, reinterpret_cast<host_info64_t>(&vmstat), &mcount) == KERN_SUCCESS)
        {
 #if 1 // count inactive as available
            this->avail = (int64_t(vmstat.free_count) +
                           int64_t(vmstat.inactive_count)) *
                          pagesize;
            this->used = (int64_t(vmstat.active_count) +
                          int64_t(vmstat.wire_count)) *
                         pagesize;
 #else // count inactive as unavailable
            this->avail = int64_t(vmstat.free_count) * pagesize;
            this->used = (int64_t(vmstat.active_count) +
                          int64_t(vmstat.inactive_count) +
                          int64_t(vmstat.wire_count)) *
                         pagesize;
 #endif
        }
 #elif __linux__
        FILE *fd;
        char buf[1024];
        if ((fd = fopen("/proc/meminfo", "r")))
        {
            while (1)
            {
                if (fgets(buf, 500, fd) != buf)
                    break;
                if (bcmp(buf, "MemTotal", 8) == 0)
                {
                    this->total = atoll(buf + 10) * 1024; // in Mb
                }
                if (strncmp(buf, "Committed_AS", 12) == 0)
                {
                    this->used = atoll(buf + 14) * 1024; // in Mb
                }
                // if(strncmp(buf, "SwapTotal", 9) == 0)
                // {
                //     *SwapTotal = atoll(buf + 11);
                // }
                // if(strncmp(buf, "SwapFree", 8) == 0)
                // {
                //     *SwapFree = atoll(buf + 10);
                // }
            }
            fclose(fd);
        }
        this->avail = this->total - this->used;
 #endif
        return 0;
    }
 };
 #include <cstdlib>
 #include <string>
 #include <sys/utsname.h>
 class Kernel
 {
 public:
    struct info_t
    {
        std::string kernel;
        std::uint32_t major;
        std::uint32_t minor;
        std::uint32_t patch;
        std::uint32_t build_number;
    };
    Kernel() {}
    info_t get_kernel_info()
    {
        utsname uts;
        uname(&uts);
        char *marker = uts.release;
        const std::uint32_t major = std::strtoul(marker, &marker, 10);
        const std::uint32_t minor = std::strtoul(marker + 1, &marker, 10);
        const std::uint32_t patch = std::strtoul(marker + 1, &marker, 10);
        const std::uint32_t build_number = std::strtoul(marker + 1, nullptr, 10);
        std::string kernel = uts.sysname;
        return {kernel, major, minor, patch, build_number};
    }
 };
 } /* namespace SystemStat */
--- a/src/transfer_function.hh
+++ b/src/transfer_function.hh
@ -294,10 +294,10 @@ protected:
 		double fftnorm = 1.0/N;
-		fftw_complex *in, *out;
+		complex_t *in, *out;
-		in = new fftw_complex[N];
+		in = new complex_t[N];
-		out = new fftw_complex[N];
+		out = new complex_t[N];
 		//... perform anti-ringing correction from Hamilton (2000)
 		k0r0 = krgood( mu, q, dlnr, k0r0 );
@ -341,24 +341,10 @@ protected:
 		}
 		ofsk.close();
-#ifdef FFTW3
+		fftw_plan_t p,ip;
-	#ifdef SINGLE_PRECISION
+		p = FFTW_API(plan_dft_1d)(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
-		fftwf_plan p,ip;
+		ip = FFTW_API(plan_dft_1d)(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
-		p = fftwf_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+		FFTW_API(execute)(p);
 		ip = fftwf_plan_dft_1d(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
 		fftwf_execute(p);
 	#else
 		fftw_plan p,ip;
 		p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
 		ip = fftw_plan_dft_1d(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
 		fftw_execute(p);
 	#endif
 #else
 		fftw_plan p,ip;
 		p = fftw_create_plan(N, FFTW_FORWARD, FFTW_ESTIMATE);
 		ip = fftw_create_plan(N, FFTW_BACKWARD, FFTW_ESTIMATE);
 		fftw_one(p, in, out);
 #endif
 		//... compute the Hankel transform by convolution with the Bessel function
 		for( unsigned i=0; i<N; ++i )
@ -430,15 +416,7 @@ protected:
 		}
-#ifdef FFTW3
+		FFTW_API(execute)(ip);
 	#ifdef SINGLE_PRECISION
 		fftwf_execute(ip);
 	#else
 		fftw_execute(ip);
 	#endif
 #else
 		fftw_one(ip, out, in);
 #endif
 		rr.assign(N,0.0);
 		TT.assign(N,0.0);
@ -482,13 +460,8 @@ protected:
 		delete[] in;
 		delete[] out;
-#if defined(FFTW3) && defined(SINGLE_PRECISION)
+		FFTW_API(destroy_plan)(p);
-		fftwf_destroy_plan(p);
+		FFTW_API(destroy_plan)(ip);
 		fftwf_destroy_plan(ip);
 #else
 		fftw_destroy_plan(p);
 		fftw_destroy_plan(ip);
 #endif
 	}
 	std::vector<real_t> m_xtable,m_ytable,m_dytable;
 	double m_xmin, m_xmax, m_dx, m_rdx;