From 6b32a7d6e46e839199f4d846a08c5325d22d7471 Mon Sep 17 00:00:00 2001
From: Oliver Hahn <oliverjhahn@gmail.com>
Date: Tue, 14 Feb 2023 17:12:19 -0800
Subject: [PATCH] WIP refactoring. Got rid of FFTW2, some of the old
 single/double precision templating,...

---
 CMakeLists.txt                               |  129 +-
 FindFFTW3.cmake                              |   11 +
 src/TransferFunction.hh                      |    2 +-
 src/cmake_config.hh.in                       |   25 +
 src/constraints.cc                           |    4 +-
 src/constraints.hh                           |  523 ++--
 src/convolution_kernel.cc                    |   88 +-
 src/convolution_kernel.hh                    |    1 -
 src/cosmology.cc                             |  815 +++----
 src/densities.cc                             |  183 +-
 src/fd_schemes.hh                            |   47 +-
 src/general.hh                               |   98 +-
 src/main.cc                                  |  143 +-
 src/mg_interp.hh                             |  180 +-
 src/mg_solver.hh                             |  957 ++++----
 src/plugins/output_enzo.cc                   |   12 +-
 src/plugins/output_gadget2.cc                |    2 -
 src/plugins/output_gadget2_2comp.cc          | 2272 +++++++++---------
 src/plugins/output_gadget_tetmesh.cc         |    2 -
 src/plugins/output_tipsy.cc                  |    2 -
 src/plugins/output_tipsy_resample.cc         |    2 -
 src/plugins/random_music_wnoise_generator.cc |  196 +-
 src/plugins/random_panphasia.cc              |   88 +-
 src/poisson.cc                               |  425 +---
 src/solver.hh                                | 1602 ++++++------
 src/system_stat.hh                           |  194 ++
 src/transfer_function.hh                     |   51 +-
 27 files changed, 3654 insertions(+), 4400 deletions(-)
 create mode 100644 src/cmake_config.hh.in
 create mode 100644 src/system_stat.hh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10485a1..bd95567 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,21 @@
-cmake_minimum_required(VERSION 3.9)
+# This file is part of MUSIC2
+# A software package to generate ICs for cosmological simulations
+# Copyright (C) 2023 by Oliver Hahn
+# 
+# MUSIC2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# MUSIC2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+cmake_minimum_required(VERSION 3.11)
 set(PRGNAME MUSIC)
 project(MUSIC)
 
@@ -27,12 +44,10 @@ mark_as_advanced(CMAKE_CXX_FLAGS_DEBUGSANADD CMAKE_CXX_FLAGS_DEBUGSANUNDEF)
 mark_as_advanced(CMAKE_C_FLAGS_DEBUGSANADD CMAKE_C_FLAGS_DEBUGSANUNDEF)
 mark_as_advanced(CMAKE_EXECUTABLE_FORMAT CMAKE_OSX_ARCHITECTURES CMAKE_OSX_DEPLOYMENT_TARGET CMAKE_OSX_SYSROOT)
 
-# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native -Wall -pedantic -DCMAKE_BUILD")
 find_package(PkgConfig REQUIRED)
 
 set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}")
 
-option(MUSIC_ENABLE_SINGLE_PRECISION "Enable Single Precision Mode" OFF)
 
 ########################################################################################################################
 # OpenMP
@@ -52,12 +67,44 @@ find_package(Threads REQUIRED)
 if(POLICY CMP0074)
     cmake_policy(SET CMP0074 NEW)
 endif()
-find_package(FFTW3 COMPONENTS SINGLE DOUBLE THREADS)
+if(ENABLE_MPI)
+  find_package(FFTW3 COMPONENTS SINGLE DOUBLE LONGDOUBLE OPENMP THREADS MPI)
+else()
+  find_package(FFTW3 COMPONENTS SINGLE DOUBLE LONGDOUBLE OPENMP THREADS)
+endif(ENABLE_MPI)
+mark_as_advanced(FFTW3_SINGLE_MPI_LIBRARY FFTW3_SINGLE_OPENMP_LIBRARY FFTW3_SINGLE_SERIAL_LIBRARY FFTW3_SINGLE_THREADS_LIBRARY)
+mark_as_advanced(FFTW3_DOUBLE_MPI_LIBRARY FFTW3_DOUBLE_OPENMP_LIBRARY FFTW3_DOUBLE_SERIAL_LIBRARY FFTW3_DOUBLE_THREADS_LIBRARY)
+mark_as_advanced(FFTW3_LONGDOUBLE_MPI_LIBRARY FFTW3_LONGDOUBLE_OPENMP_LIBRARY FFTW3_LONGDOUBLE_SERIAL_LIBRARY FFTW3_LONGDOUBLE_THREADS_LIBRARY)
+mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_MPI_INCLUDE_DIR)
+mark_as_advanced(pkgcfg_lib_PC_FFTW_fftw3)
 
 ########################################################################################################################
 # TIRPC, needed only for Tipsy format
 find_package(TIRPC)
 
+########################################################################################################################
+# GSL
+find_package(GSL REQUIRED)
+mark_as_advanced(pkgcfg_lib_GSL_gsl pkgcfg_lib_GSL_gslcblas pkgcfg_lib_GSL_m)
+
+########################################################################################################################
+# HDF5
+find_package(HDF5)
+if( HDF5_FOUND )
+  mark_as_advanced(HDF5_C_LIBRARY_dl HDF5_C_LIBRARY_hdf5 HDF5_C_LIBRARY_m HDF5_C_LIBRARY_pthread HDF5_C_LIBRARY_z HDF5_C_LIBRARY_sz)
+endif()
+
+########################################################################################################################
+# floating point precision
+set (
+  CODE_PRECISION "DOUBLE"
+  CACHE STRING "Floating point type used for internal computations and FFTs"
+)
+set_property (
+  CACHE CODE_PRECISION
+  PROPERTY STRINGS FLOAT DOUBLE LONGDOUBLE
+)
+
 ########################################################################################################################
 # Add a custom command that produces version.cc, plus
 # a dummy output that's not actually produced, in order
@@ -68,16 +115,6 @@ ADD_CUSTOM_COMMAND(
     COMMAND ${CMAKE_COMMAND} -P
             ${CMAKE_CURRENT_SOURCE_DIR}/version.cmake)
 
-
-
-########################################################################################################################
-# GSL
-find_package(GSL REQUIRED)
-
-########################################################################################################################
-# HDF5
-find_package(HDF5)
-
 ########################################################################################################################
 # INCLUDES
 include_directories(${PROJECT_SOURCE_DIR}/src)
@@ -112,40 +149,41 @@ list (APPEND SOURCES
 # target_include_directories(${PRGNAME} PRIVATE ${PROJECT_SOURCE_DIR}/external/panphasia_ho)
 endif(ENABLE_PANPHASIA)
 
+# project configuration header
+configure_file(
+  ${PROJECT_SOURCE_DIR}/src/cmake_config.hh.in
+  ${PROJECT_SOURCE_DIR}/src/cmake_config.hh
+)
+
 add_executable(${PRGNAME} ${SOURCES} ${PLUGINS})
 
-set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 11)
+set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 14)
 
-if(FFTW3_FOUND)
-  target_compile_options(${PRGNAME} PRIVATE "-DFFTW3")
-
-  if( MUSIC_ENABLE_SINGLE_PRECISION )
-    target_compile_options(${PRGNAME} PRIVATE "-DSINGLE_PRECISION")
-    if (FFTW3_SINGLE_THREADS_FOUND)
-      target_link_libraries(${PRGNAME} ${FFTW3_SINGLE_THREADS_LIBRARY})
-      target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS")
-    elseif(FFTW3_SINGLE_SERIAL_FOUND)
-      target_link_libraries(${PRGNAME} ${FFTW3_SINGLE_SERIAL_LIBRARY})
-      message( WARNING "using serial version of FFTW3 -- this will most likely cause a very slow version of MUSIC. Rec: install FFTW3 with thread support")
-    else()  
-      message( FATAL "chose compilation in single precision, but FFTW3 not found for single precision")
-    endif()
-  else(MUSIC_ENABLE_SINGLE_PRECISION)
-    if (FFTW3_DOUBLE_THREADS_FOUND)
-      target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_THREADS_LIBRARY})
-      target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS")
-    elseif(FFTW3_DOUBLE_SERIAL_FOUND)
-      target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_SERIAL_LIBRARY})
-      message( WARNING "using serial version of FFTW3 -- this will most likely cause a very slow version of MUSIC. Rec: install FFTW3 with thread support")
-    else()  
-      message( FATAL "chose compilation in double precision, but FFTW3 not found for double precision")
-    endif()
-  endif(MUSIC_ENABLE_SINGLE_PRECISION)
-endif(FFTW3_FOUND)
+# link the FFTW3 flavour selected by CODE_PRECISION; the threaded library is added when it was found
+if(CODE_PRECISION STREQUAL "FLOAT")
+  if(FFTW3_SINGLE_THREADS_FOUND)
+    target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_SINGLE_THREADS)
+    target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
+  endif()
+  target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_SINGLE_SERIAL)
+elseif(CODE_PRECISION STREQUAL "DOUBLE")
+  if(FFTW3_DOUBLE_THREADS_FOUND)
+    target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_DOUBLE_THREADS)
+    target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
+  endif()
+  target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_DOUBLE_SERIAL)
+elseif(CODE_PRECISION STREQUAL "LONGDOUBLE")
+  if(FFTW3_LONGDOUBLE_THREADS_FOUND)
+    target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_LONGDOUBLE_THREADS)
+    target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
+  endif()
+  target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_LONGDOUBLE_SERIAL)
+else()
+  message(FATAL_ERROR "Unsupported CODE_PRECISION '${CODE_PRECISION}' (expected FLOAT, DOUBLE or LONGDOUBLE)")
+endif()
 
 if(HDF5_FOUND)
-  # target_link_libraries(${PRGNAME} ${HDF5_C_LIBRARY_DIRS})
-  target_link_libraries(${PRGNAME} ${HDF5_LIBRARIES})
+  target_link_libraries(${PRGNAME} PRIVATE ${HDF5_LIBRARIES})
   target_include_directories(${PRGNAME} PRIVATE ${HDF5_INCLUDE_DIRS})
   target_compile_options(${PRGNAME} PRIVATE "-DHAVE_HDF5")
   target_compile_options(${PRGNAME} PRIVATE "-DH5_USE_16_API")
@@ -161,11 +199,5 @@ if(ENABLE_PANPHASIA)
   target_compile_options(${PRGNAME} PRIVATE "-DHAVE_PANPHASIA")
 endif(ENABLE_PANPHASIA)
 
-target_link_libraries(${PRGNAME} ${FFTW3_LIBRARIES})
-target_include_directories(${PRGNAME} PRIVATE ${FFTW3_INCLUDE_DIRS})
+target_link_libraries(${PRGNAME} PRIVATE GSL::gsl)
 
-target_link_libraries(${PRGNAME} ${GSL_LIBRARIES})
-target_include_directories(${PRGNAME} PRIVATE ${GSL_INCLUDE_DIR})
-
-target_link_libraries(${PRGNAME} ${HDF5_LIBRARIES})
-target_include_directories(${PRGNAME} PRIVATE ${HDF5_INCLUDE_DIR})
diff --git a/FindFFTW3.cmake b/FindFFTW3.cmake
index 0c65570..b69d975 100644
--- a/FindFFTW3.cmake
+++ b/FindFFTW3.cmake
@@ -230,3 +230,16 @@ find_package_handle_standard_args(FFTW3
     VERSION_VAR FFTW3_VERSION_STRING
     HANDLE_COMPONENTS
 )
+
+# Expose each entry of FFTW3_FIND_COMPONENTS as an imported target FFTW3::FFTW3_<component>
+# that carries the component's library location and the FFTW3 include directory.
+if(FFTW3_FOUND)
+  foreach(component ${FFTW3_FIND_COMPONENTS})
+    if(NOT TARGET FFTW3::FFTW3_${component})
+      add_library(FFTW3::FFTW3_${component} UNKNOWN IMPORTED)
+      set_target_properties(FFTW3::FFTW3_${component} PROPERTIES
+        IMPORTED_LOCATION "${FFTW3_${component}_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${FFTW3_INCLUDE_DIR}")
+    endif()
+  endforeach()
+endif()
diff --git a/src/TransferFunction.hh b/src/TransferFunction.hh
index 1cce3fb..919ee86 100644
--- a/src/TransferFunction.hh
+++ b/src/TransferFunction.hh
@@ -130,7 +130,7 @@ protected:
 		
 		double fftnorm = 1.0/N;
 		
-		fftw_complex in[N], out[N];
+		complex_t in[N], out[N];
 		fftw_plan p,ip;
 		
 		//... perform anti-ringing correction from Hamilton (2000)
diff --git a/src/cmake_config.hh.in b/src/cmake_config.hh.in
new file mode 100644
index 0000000..5162967
--- /dev/null
+++ b/src/cmake_config.hh.in
@@ -0,0 +1,27 @@
+#pragma once
+
+// floating point precision chosen at configure time through the CODE_PRECISION cache variable
+#define USE_PRECISION_${CODE_PRECISION}
+
+#ifdef __cplusplus
+constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
+#if defined(USE_PRECISION_FLOAT)
+  constexpr char CMAKE_PRECISION_STR[] = "single";
+#elif defined(USE_PRECISION_DOUBLE)
+  constexpr char CMAKE_PRECISION_STR[] = "double";
+#elif defined(USE_PRECISION_LONGDOUBLE)
+  constexpr char CMAKE_PRECISION_STR[] = "long double";
+#else
+#  error "CODE_PRECISION must be FLOAT, DOUBLE or LONGDOUBLE"
+#endif
+
+// These variables are autogenerated and compiled
+// into the library by the version.cmake script. do not touch!
+extern "C"
+{
+  extern const char *GIT_TAG;
+  extern const char *GIT_REV;
+  extern const char *GIT_BRANCH;
+
+}
+#endif // __cplusplus
diff --git a/src/constraints.cc b/src/constraints.cc
index 479e282..a07d4d3 100644
--- a/src/constraints.cc
+++ b/src/constraints.cc
@@ -267,7 +267,7 @@ constraint_set::constraint_set( config_file& cf, transfer_function *ptf )
 }
 
 
-void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftw_complex* cw )
+void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, complex_t* cw )
 {
 	double lsub = nx*dx;
 	double dk = 2.0*M_PI/lsub, d3k=dk*dk*dk;
@@ -374,7 +374,7 @@ void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t
 
 
 
-void constraint_set::wnoise_constr_corr( double dx, fftw_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 )
+void constraint_set::wnoise_constr_corr( double dx, complex_t* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 )
 {
 	size_t nconstr = cset_.size();
 	size_t nzp=nz/2+1;
diff --git a/src/constraints.hh b/src/constraints.hh
index a5029f3..66f2871 100644
--- a/src/constraints.hh
+++ b/src/constraints.hh
@@ -1,118 +1,121 @@
 /*
- 
- constraints.hh - This file is part of MUSIC -
- a code to generate multi-scale initial conditions 
- for cosmological simulations 
- 
- Copyright (C) 2010  Oliver Hahn
- 
- */
 
-#ifndef __CONSTRAINTS_HH
-#define __CONSTRAINTS_HH
+ constraints.hh - This file is part of MUSIC -
+ a code to generate multi-scale initial conditions
+ for cosmological simulations
+
+ Copyright (C) 2010  Oliver Hahn
+
+ */
+#pragma once
 
 #include <vector>
 #include <complex>
 
 #include <gsl/gsl_linalg.h>
 
-#include "general.hh"
-#include "config_file.hh"
-#include "transfer_function.hh"
-#include "cosmology.hh"
-
+#include <general.hh>
+#include <config_file.hh>
+#include <transfer_function.hh>
+#include <cosmology.hh>
 
 //! matrix class serving as a gsl wrapper
 class matrix
 {
 protected:
-	gsl_matrix * m_;
-	//double *data_;
+	gsl_matrix *m_;
+	// double *data_;
 	size_t M_, N_;
-	
+
 public:
-	matrix( size_t M, size_t N )
-	: M_(M), N_(N)
+	matrix(size_t M, size_t N)
+			: M_(M), N_(N)
 	{
-		m_ = gsl_matrix_alloc(M_,N_);
+		m_ = gsl_matrix_alloc(M_, N_);
 	}
-	
-	matrix( size_t N )
-	: M_(N), N_(N)
+
+	matrix(size_t N)
+			: M_(N), N_(N)
 	{
-		m_ = gsl_matrix_alloc(M_,N_);
+		m_ = gsl_matrix_alloc(M_, N_);
 	}
-	
-	matrix( const matrix& o )
+
+	matrix(const matrix &o)
 	{
 		M_ = o.M_;
 		N_ = o.N_;
-		m_ = gsl_matrix_alloc(M_,N_);
-		gsl_matrix_memcpy(m_, o.m_ );
+		m_ = gsl_matrix_alloc(M_, N_);
+		gsl_matrix_memcpy(m_, o.m_);
 	}
-	
+
 	~matrix()
 	{
-		gsl_matrix_free( m_ );
+		gsl_matrix_free(m_);
 	}
-	
-	double& operator()( size_t i, size_t j )
-	{	return *gsl_matrix_ptr( m_, i, j );	}
-	
-	const double& operator()( size_t i, size_t j ) const
-	{	return *gsl_matrix_const_ptr( m_, i, j );	}
-	
-	matrix& operator=( const matrix& o )
+
+	double &operator()(size_t i, size_t j)
 	{
-		gsl_matrix_free( m_ );
-		
+		return *gsl_matrix_ptr(m_, i, j);
+	}
+
+	const double &operator()(size_t i, size_t j) const
+	{
+		return *gsl_matrix_const_ptr(m_, i, j);
+	}
+
+	matrix &operator=(const matrix &o)
+	{
+		gsl_matrix *copy = gsl_matrix_alloc(o.M_, o.N_); // duplicate o before releasing m_, so self-assignment never reads freed memory
+		gsl_matrix_memcpy(copy, o.m_);
 		M_ = o.M_;
 		N_ = o.N_;
-		m_ = gsl_matrix_alloc(M_,N_);
-		gsl_matrix_memcpy(m_, o.m_ );
+		gsl_matrix_free(m_);
+		m_ = copy;
 		return *this;
 	}
-	
-	
-	matrix& invert()
+
+	matrix &invert()
 	{
-		if( M_!=N_ )
+		if (M_ != N_)
 			throw std::runtime_error("Attempt to invert a non-square matrix!");
-		
+
 		int s;
-		gsl_matrix* im = gsl_matrix_alloc(M_,N_);
-		
-		gsl_permutation * p = gsl_permutation_alloc (M_);
-		gsl_linalg_LU_decomp( m_, p, &s );
-		gsl_linalg_LU_invert( m_, p, im );
-		
+		gsl_matrix *im = gsl_matrix_alloc(M_, N_);
+
+		gsl_permutation *p = gsl_permutation_alloc(M_);
+		gsl_linalg_LU_decomp(m_, p, &s);
+		gsl_linalg_LU_invert(m_, p, im);
+
 		gsl_matrix_memcpy(m_, im);
-		
+
 		gsl_permutation_free(p);
 		gsl_matrix_free(im);
 		return *this;
 	}
 };
 
-
 //! class to impose constraints on the white noise field (van de Weygaert & Bertschinger 1996)
 class constraint_set
 {
-	
+
 public:
-	enum constr_type{ halo, peak };
-	
+	enum constr_type
+	{
+		halo,
+		peak
+	};
+
 protected:
-	
-	struct constraint{
+	struct constraint
+	{
 		constr_type type;
-		double x,y,z;
-		double gx,gy,gz;
+		double x, y, z;
+		double gx, gy, gz;
 		double Rg, Rg2;
 		double gRg, gRg2;
 		double sigma;
 	};
-	
+
 	config_file *pcf_;
 	std::vector<constraint> cset_;
 	transfer_function *ptf_;
@@ -120,303 +123,185 @@ protected:
 	Cosmology *pcosmo_;
 	double dplus0_;
 	unsigned constr_level_;
-	
-	
-	inline std::complex<double> eval_constr( size_t icon, double kx, double ky, double kz )
+
+	inline std::complex<double> eval_constr(size_t icon, double kx, double ky, double kz)
 	{
 		double re, im, kdotx, k2;
-		
-		kdotx = cset_[icon].gx*kx+cset_[icon].gy*ky+cset_[icon].gz*kz;
-		k2    = kx*kx+ky*ky+kz*kz;
-		
-		re  = im = exp(-k2*cset_[icon].gRg2/2.0);
-		re *= cos( kdotx );
-		im *= sin( kdotx );
-		
-		return std::complex<double>(re,im);
+
+		kdotx = cset_[icon].gx * kx + cset_[icon].gy * ky + cset_[icon].gz * kz;
+		k2 = kx * kx + ky * ky + kz * kz;
+
+		re = im = exp(-k2 * cset_[icon].gRg2 / 2.0);
+		re *= cos(kdotx);
+		im *= sin(kdotx);
+
+		return std::complex<double>(re, im);
 	}
-	
-	
-#if defined(FFTW3) && defined(SINGLE_PRECISION)
-	
 	//! apply constraints to the white noise
-	void wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftwf_complex* cw );
-	
+	void wnoise_constr_corr(double dx, size_t nx, size_t ny, size_t nz, std::vector<double> &g0, matrix &cinv, complex_t *cw);
+
 	//! measure sigma for each constraint in the unconstrained noise
-	void wnoise_constr_corr( double dx, fftwf_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 );
-	
-#else
-	//! apply constraints to the white noise
-	void wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftw_complex* cw );
-	
-	//! measure sigma for each constraint in the unconstrained noise
-	void wnoise_constr_corr( double dx, fftw_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 );
-	
-#endif
-	
+	void wnoise_constr_corr(double dx, complex_t *cw, size_t nx, size_t ny, size_t nz, std::vector<double> &g0);
+
 	//! compute the covariance between the constraints
-	void icov_constr( double dx, size_t nx, size_t ny, size_t nz, matrix& cij );
-	
-	
+	void icov_constr(double dx, size_t nx, size_t ny, size_t nz, matrix &cij);
+
 public:
-	
-	
-	//! constructor 
-	constraint_set( config_file& cf, transfer_function *ptf );
-	
+	//! constructor
+	constraint_set(config_file &cf, transfer_function *ptf);
+
 	//! destructor
 	~constraint_set()
 	{
 		delete pccalc_;
 		delete pcosmo_;
 	}
-	
-	
-	template< typename rng >
-	void apply( unsigned ilevel, int x0[], int lx[], rng* wnoise )
+
+	template <typename rng>
+	void apply(unsigned ilevel, int x0[], int lx[], rng *wnoise)
 	{
-		if( cset_.size() == 0 || constr_level_ != ilevel )
+		if (cset_.size() == 0 || constr_level_ != ilevel)
 			return;
-		
-		unsigned nlvl = 1<<ilevel;
-		double boxlength = pcf_->get_value<double>("setup","boxlength");
-		
+
+		unsigned nlvl = 1 << ilevel;
+		double boxlength = pcf_->get_value<double>("setup", "boxlength");
+
 		//... compute constraint coordinates for grid
-		for( size_t i=0; i<cset_.size(); ++i )
+		for (size_t i = 0; i < cset_.size(); ++i)
 		{
 			cset_[i].gx = cset_[i].x * (double)nlvl;
 			cset_[i].gy = cset_[i].y * (double)nlvl;
 			cset_[i].gz = cset_[i].z * (double)nlvl;
-			cset_[i].gRg = cset_[i].Rg/boxlength * (double)nlvl;
-			cset_[i].gRg2 = cset_[i].gRg*cset_[i].gRg;
-			
-			if(cset_[i].gRg > 0.5*lx[0])
-				music::wlog.Print("Constraint %d appears to be too large scale",i);
-		}
-		
-		
-		std::vector<double> g0;
-		
-//		unsigned levelmax = pcf_->get_value<unsigned>("setup","levelmax");
-		unsigned levelmin = pcf_->get_value<unsigned>("setup","levelmin_TF");
-		
-		bool bperiodic = ilevel==levelmin;
-		double dx = pcf_->get_value<double>("setup","boxlength")/(1<<ilevel);
-		
+			cset_[i].gRg = cset_[i].Rg / boxlength * (double)nlvl;
+			cset_[i].gRg2 = cset_[i].gRg * cset_[i].gRg;
 
-		music::ilog.Print("Computing constrained realization...");		
-		
-		if( bperiodic )
+			if (cset_[i].gRg > 0.5 * lx[0])
+				music::wlog.Print("Constraint %d appears to be too large scale", i);
+		}
+
+		std::vector<double> g0;
+
+		//		unsigned levelmax = pcf_->get_value<unsigned>("setup","levelmax");
+		unsigned levelmin = pcf_->get_value<unsigned>("setup", "levelmin_TF");
+
+		bool bperiodic = ilevel == levelmin;
+		double dx = pcf_->get_value<double>("setup", "boxlength") / (1 << ilevel);
+
+		music::ilog.Print("Computing constrained realization...");
+
+		if (bperiodic)
 		{
 			//... we are operating on the periodic coarse grid
-			size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz+2;
-			fftw_real * w = new fftw_real[nx*ny*nzp];
-			
-			
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-			fftwf_complex * cw = reinterpret_cast<fftwf_complex*> (w);
-			fftwf_plan	p  = fftwf_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
-						ip = fftwf_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
-	#else
-			fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
-			fftw_plan	p  = fftw_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
-						ip = fftw_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
-	#endif
-#else
-			fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
-			rfftwnd_plan p	= rfftw3d_create_plan( nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
-						 ip = rfftw3d_create_plan( nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
-#endif
-			
-			double fftnorm = 1.0/sqrt(nx*ny*nz);
-			
-			#pragma omp parallel for
-			for( int i=0; i<(int)nx; i++ )
-				for( int j=0; j<(int)ny; j++ )
-					for( int k=0; k<(int)nz; k++ )
+			size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz + 2;
+			real_t *w = new real_t[nx * ny * nzp];
+
+			complex_t *cw = reinterpret_cast<complex_t *>(w);
+			fftw_plan_t p = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, w, cw, FFTW_ESTIMATE),
+									ip = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cw, w, FFTW_ESTIMATE);
+
+			double fftnorm = 1.0 / sqrt(nx * ny * nz);
+
+#pragma omp parallel for
+			for (int i = 0; i < (int)nx; i++)
+				for (int j = 0; j < (int)ny; j++)
+					for (int k = 0; k < (int)nz; k++)
 					{
-						size_t q = ((size_t)i*ny+(size_t)j)*nzp+(size_t)k;
-						w[q] = (*wnoise)((x0[0]+i)%nx,(x0[1]+j)%ny,(x0[2]+k)%nz)*fftnorm;
+						size_t q = ((size_t)i * ny + (size_t)j) * nzp + (size_t)k;
+						w[q] = (*wnoise)((x0[0] + i) % nx, (x0[1] + j) % ny, (x0[2] + k) % nz) * fftnorm;
 					}
-			
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-			fftwf_execute( p );
-	#else
-			fftw_execute( p );
-	#endif
-#else
-#ifndef SINGLETHREAD_FFTW		
-			rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), p, w, NULL );
-#else
-			rfftwnd_one_real_to_complex( p, w, NULL );
-#endif
-#endif
-			wnoise_constr_corr( dx, cw, nx, ny, nz, g0 );
-			
-			matrix c(2,2);
-			icov_constr( dx, nx, ny, nz, c );
-			
-			
-			wnoise_constr_corr( dx, nx, ny, nz, g0, c, cw );
-			
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-			fftwf_execute( ip );
-	#else
-			fftw_execute( ip );
-	#endif
-#else
-#ifndef SINGLETHREAD_FFTW		
-			rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), ip, cw, NULL );
-#else
-			rfftwnd_one_complex_to_real( ip, cw, NULL );
-#endif
-#endif
-			
-			#pragma omp parallel for
-			for( int i=0; i<(int)nx; i++ )
-				for( int j=0; j<(int)ny; j++ )
-					for( int k=0; k<(int)nz; k++ )
+
+			FFTW_API(execute)(p);
+			wnoise_constr_corr(dx, cw, nx, ny, nz, g0);
+
+			matrix c(2, 2);
+			icov_constr(dx, nx, ny, nz, c);
+
+			wnoise_constr_corr(dx, nx, ny, nz, g0, c, cw);
+
+			FFTW_API(execute)(ip);
+
+#pragma omp parallel for
+			for (int i = 0; i < (int)nx; i++)
+				for (int j = 0; j < (int)ny; j++)
+					for (int k = 0; k < (int)nz; k++)
 					{
-						size_t q = ((size_t)i*ny+(size_t)j)*nzp+(size_t)k;
-						(*wnoise)((x0[0]+i),(x0[1]+j),(x0[2]+k)) = w[q]*fftnorm;
+						size_t q = ((size_t)i * ny + (size_t)j) * nzp + (size_t)k;
+						(*wnoise)((x0[0] + i), (x0[1] + j), (x0[2] + k)) = w[q] * fftnorm;
 					}
-			
-			music::ilog.Print("Applied constraints to level %d.",ilevel);
-			
-						
+
+			music::ilog.Print("Applied constraints to level %d.", ilevel);
+
 			delete[] w;
-			
-			
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-			fftwf_destroy_plan(p);
-	#else
-			fftw_destroy_plan(p);
-	#endif
-#else
-			fftwnd_destroy_plan(p);
-#endif
-		}else{
-			
+
+			FFTW_API(destroy_plan)(p);
+			FFTW_API(destroy_plan)(ip);
+		}
+		else
+		{
+
 			//... we are operating on a refinement grid, not necessarily the finest
-			
-			size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz+2;
-			fftw_real * w = new fftw_real[nx*ny*nzp];
-			
-			
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-			fftwf_complex * cw = reinterpret_cast<fftwf_complex*> (w);
-			fftwf_plan	p  = fftwf_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
-						ip = fftwf_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
-	#else
-			fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
-			fftw_plan	p  = fftw_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
-						ip = fftw_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
-	#endif
-#else
-			fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
-			rfftwnd_plan p	= rfftw3d_create_plan( nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
-			ip = rfftw3d_create_plan( nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
-#endif
-			
-			double fftnorm = 1.0/sqrt(nx*ny*nz);
-			
-			int il = nx/4, ir = 3*nx/4, jl=ny/4, jr = 3*ny/4, kl = nz/4, kr = 3*nz/4;
-			
-			#pragma omp parallel for
-			for( int i=0; i<(int)nx; i++ )
-				for( int j=0; j<(int)ny; j++ )
-					for( int k=0; k<(int)nz; k++ )
+
+			size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz + 2;
+			real_t *w = new real_t[nx * ny * nzp];
+
+			complex_t *cw = reinterpret_cast<complex_t *>(w);
+			fftw_plan_t p = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, w, cw, FFTW_ESTIMATE),
+									ip = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cw, w, FFTW_ESTIMATE);
+
+			double fftnorm = 1.0 / sqrt(nx * ny * nz);
+
+			int il = nx / 4, ir = 3 * nx / 4, jl = ny / 4, jr = 3 * ny / 4, kl = nz / 4, kr = 3 * nz / 4;
+
+#pragma omp parallel for
+			for (int i = 0; i < (int)nx; i++)
+				for (int j = 0; j < (int)ny; j++)
+					for (int k = 0; k < (int)nz; k++)
 					{
-						size_t q = ((size_t)i*ny+(size_t)j)*nzp+(size_t)k;
-						
-						if( i>=il && i<ir && j>=jl && j<jr && k>=kl && k<kr )
-							w[q] = (*wnoise)((x0[0]+i),(x0[1]+j),(x0[2]+k))*fftnorm;
+						size_t q = ((size_t)i * ny + (size_t)j) * nzp + (size_t)k;
+
+						if (i >= il && i < ir && j >= jl && j < jr && k >= kl && k < kr)
+							w[q] = (*wnoise)((x0[0] + i), (x0[1] + j), (x0[2] + k)) * fftnorm;
 						else
 							w[q] = 0.0;
-
 					}
-			
-			int nlvl05 = 1<<(ilevel-1);
-			int xs = nlvl05-x0[0], ys = nlvl05-x0[1], zs = nlvl05-x0[2];
-			
-			for( size_t i=0; i<cset_.size(); ++i )
+
+			int nlvl05 = 1 << (ilevel - 1);
+			int xs = nlvl05 - x0[0], ys = nlvl05 - x0[1], zs = nlvl05 - x0[2];
+
+			for (size_t i = 0; i < cset_.size(); ++i)
 			{
 				cset_[i].gx -= xs;
 				cset_[i].gy -= ys;
 				cset_[i].gz -= zs;
 			}
-			
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-			fftwf_execute( p );
-	#else
-			fftw_execute( p );
-	#endif
-#else
-#ifndef SINGLETHREAD_FFTW		
-			rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), p, w, NULL );
-#else
-			rfftwnd_one_real_to_complex( p, w, NULL );
-#endif
-#endif
-			wnoise_constr_corr( dx, cw, nx, ny, nz, g0 );
-			
-			matrix c(2,2);
-			icov_constr( dx, nx, ny, nz, c );
-			
-			
-			wnoise_constr_corr( dx, nx, ny, nz, g0, c, cw );
-			
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-			fftwf_execute( ip );
-	#else
-			fftw_execute( ip );
-	#endif
-#else
-#ifndef SINGLETHREAD_FFTW		
-			rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), ip, cw, NULL );
-#else
-			rfftwnd_one_complex_to_real( ip, cw, NULL );
-#endif
-#endif
-			
-			#pragma omp parallel for
-			for( int i=0; i<(int)nx; i++ )
-				for( int j=0; j<(int)ny; j++ )
-					for( int k=0; k<(int)nz; k++ )
+
+			FFTW_API(execute)(p);
+
+			wnoise_constr_corr(dx, cw, nx, ny, nz, g0);
+
+			matrix c(2, 2);
+			icov_constr(dx, nx, ny, nz, c);
+
+			wnoise_constr_corr(dx, nx, ny, nz, g0, c, cw);
+
+			FFTW_API(execute)(ip);
+
+#pragma omp parallel for
+			for (int i = 0; i < (int)nx; i++)
+				for (int j = 0; j < (int)ny; j++)
+					for (int k = 0; k < (int)nz; k++)
 					{
-						size_t q = ((size_t)i*ny+(size_t)j)*nzp+(size_t)k;
-						if( i>=il && i<ir && j>=jl && j<jr && k>=kl && k<kr )
-							(*wnoise)((x0[0]+i),(x0[1]+j),(x0[2]+k)) = w[q]*fftnorm;
+						size_t q = ((size_t)i * ny + (size_t)j) * nzp + (size_t)k;
+						if (i >= il && i < ir && j >= jl && j < jr && k >= kl && k < kr)
+							(*wnoise)((x0[0] + i), (x0[1] + j), (x0[2] + k)) = w[q] * fftnorm;
 					}
-			
 
-			music::ilog.Print("Applied constraints to level %d.",ilevel);	
-			
+			music::ilog.Print("Applied constraints to level %d.", ilevel);
+
 			delete[] w;
-			
-			
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-			fftwf_destroy_plan(p);
-	#else
-			fftw_destroy_plan(p);
-	#endif
-#else
-			fftwnd_destroy_plan(p);
-#endif
-			
+
+			FFTW_API(destroy_plan)(p);
+			FFTW_API(destroy_plan)(ip);
 		}
-		
 	}
-	
 };
-
-
-#endif // __CONSTRAINTS_HH
diff --git a/src/convolution_kernel.cc b/src/convolution_kernel.cc
index 8bec532..9b4acb1 100644
--- a/src/convolution_kernel.cc
+++ b/src/convolution_kernel.cc
@@ -7,13 +7,9 @@
  
 */
 
-#include "general.hh"
-#include "densities.hh"
-#include "convolution_kernel.hh"
-
-#if defined(FFTW3) && defined(SINGLE_PRECISION)
-typedef fftw_complex fftwf_complex;
-#endif
+#include <general.hh>
+#include <densities.hh>
+#include <convolution_kernel.hh>
 
 namespace convolution
 {
@@ -25,7 +21,6 @@ get_kernel_map()
 	return kernel_map;
 }
 
-template <typename real_t>
 void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
 {
 	//return;
@@ -34,49 +29,26 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
 	double fftnormp = 1.0/sqrt((double)cparam_.nx * (double)cparam_.ny * (double)cparam_.nz);
 	double fftnorm = pow(2.0 * M_PI, 1.5) / sqrt(cparam_.lx * cparam_.ly * cparam_.lz) * fftnormp;
 
-	fftw_complex *cdata;
-	[[maybe_unused]] fftw_complex *ckernel;
-	fftw_real *data;
+	complex_t *cdata;
+	[[maybe_unused]] complex_t *ckernel;
+	real_t *data;
 
-	data = reinterpret_cast<fftw_real *>(pd);
-	cdata = reinterpret_cast<fftw_complex *>(data);
-	ckernel = reinterpret_cast<fftw_complex *>(pk->get_ptr());
+	data = reinterpret_cast<real_t *>(pd);
+	cdata = reinterpret_cast<complex_t *>(data);
+	ckernel = reinterpret_cast<complex_t *>(pk->get_ptr());
 
 	std::cout << "   - Performing density convolution... ("
 			  << cparam_.nx << ", " << cparam_.ny << ", " << cparam_.nz << ")\n";
 
 	music::ulog.Print("Performing kernel convolution on (%5d,%5d,%5d) grid", cparam_.nx, cparam_.ny, cparam_.nz);
 	music::ulog.Print("Performing forward FFT...");
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_plan plan, iplan;
-	plan = fftwf_plan_dft_r2c_3d(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
-	iplan = fftwf_plan_dft_c2r_3d(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
 
-	fftwf_execute(plan);
-#else
-	fftw_plan plan, iplan;
-	plan = fftw_plan_dft_r2c_3d(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
-	iplan = fftw_plan_dft_c2r_3d(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
+	fftw_plan_t plan, iplan;
+	plan = FFTW_API(plan_dft_r2c_3d)(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
+	iplan = FFTW_API(plan_dft_c2r_3d)(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
 
-	fftw_execute(plan);
-#endif
-#else
-	rfftwnd_plan iplan, plan;
+	FFTW_API(execute)(plan);
 
-	plan = rfftw3d_create_plan(cparam_.nx, cparam_.ny, cparam_.nz,
-							   FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
-
-	iplan = rfftw3d_create_plan(cparam_.nx, cparam_.ny, cparam_.nz,
-								FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
-#else
-	rfftwnd_one_real_to_complex(plan, data, NULL);
-#endif
-
-#endif
 	//..... need a phase shift for baryons for SPH
 	double dstag = 0.0;
 
@@ -163,27 +135,9 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
 
 	music::ulog.Print("Performing backward FFT...");
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_execute(iplan);
-	fftwf_destroy_plan(plan);
-	fftwf_destroy_plan(iplan);
-#else
-	fftw_execute(iplan);
-	fftw_destroy_plan(plan);
-	fftw_destroy_plan(iplan);
-
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
-#else
-	rfftwnd_one_complex_to_real(iplan, cdata, NULL);
-#endif
-
-	rfftwnd_destroy_plan(plan);
-	rfftwnd_destroy_plan(iplan);
-#endif
+	FFTW_API(execute)(iplan);
+	FFTW_API(destroy_plan)(plan);
+	FFTW_API(destroy_plan)(iplan);
 
 	// set the DC mode here to avoid a possible truncation error in single precision
 	{
@@ -196,14 +150,12 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
 	}
 }
 
-template void perform<double>(kernel *pk, void *pd, bool shift, bool fix, bool flip);
-template void perform<float>(kernel *pk, void *pd, bool shift, bool fix, bool flip);
+// perform() is an ordinary (non-template) function now, so no explicit instantiations are needed here.
 
 /*****************************************************************************************/
 /***    SPECIFIC KERNEL IMPLEMENTATIONS      *********************************************/
 /*****************************************************************************************/
 
-template <typename real_t>
 class kernel_k : public kernel
 {
 protected:
@@ -298,8 +250,4 @@ public:
 /**************************************************************************************/
 /**************************************************************************************/
 
-namespace
-{
-convolution::kernel_creator_concrete<convolution::kernel_k<double>> creator_kd("tf_kernel_k_double");
-convolution::kernel_creator_concrete<convolution::kernel_k<float>> creator_kf("tf_kernel_k_float");
-} // namespace
+namespace { convolution::kernel_creator_concrete<convolution::kernel_k> creator_kd("tf_kernel_k"); } // namespace
diff --git a/src/convolution_kernel.hh b/src/convolution_kernel.hh
index 4fe13cd..30bb851 100644
--- a/src/convolution_kernel.hh
+++ b/src/convolution_kernel.hh
@@ -112,7 +112,6 @@ struct kernel_creator_concrete : public kernel_creator
 };
 
 //! actual implementation of the FFT convolution (independent of the actual kernel)
-template <typename real_t>
 void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip);
 
 } //namespace convolution
diff --git a/src/cosmology.cc b/src/cosmology.cc
index 7286733..73e6a5f 100644
--- a/src/cosmology.cc
+++ b/src/cosmology.cc
@@ -1,11 +1,11 @@
 /*
- 
+
  cosmology.cc - This file is part of MUSIC -
- a code to generate multi-scale initial conditions 
- for cosmological simulations 
- 
+ a code to generate multi-scale initial conditions
+ for cosmological simulations
+
  Copyright (C) 2010  Oliver Hahn
- 
+
  */
 
 #include "cosmology.hh"
@@ -13,301 +13,252 @@
 #include "mg_operators.hh"
 #include "general.hh"
 
-#define ACC(i,j,k) ((*u.get_grid((ilevel)))((i),(j),(k)))
-#define SQR(x)	((x)*(x))
+#define ACC(i, j, k) ((*u.get_grid((ilevel)))((i), (j), (k)))
+#define SQR(x) ((x) * (x))
 
-#if defined(FFTW3) && defined(SINGLE_PRECISION)
-#define fftw_complex fftwf_complex
-#endif
-
-
-void compute_LLA_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigned order )
+void compute_LLA_density(const grid_hierarchy &u, grid_hierarchy &fnew, unsigned order)
 {
 	fnew = u;
-	
-	for( unsigned ilevel=u.levelmin(); ilevel<=u.levelmax(); ++ilevel )
-	{
-		double h = pow(2.0,ilevel), h2 = h*h, h2_4 = 0.25*h2;
-		meshvar_bnd *pvar = fnew.get_grid(ilevel);
-		
-		
-		if( order == 2 )
-		{
-			#pragma omp parallel for //reduction(+:sum_corr,sum,sum2)
-			for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix )
-				for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy )
-					for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz )
-					{
-						double D[3][3];
-						
-						D[0][0] = (ACC(ix-1,iy,iz)-2.0*ACC(ix,iy,iz)+ACC(ix+1,iy,iz)) * h2;
-						D[1][1] = (ACC(ix,iy-1,iz)-2.0*ACC(ix,iy,iz)+ACC(ix,iy+1,iz)) * h2;
-						D[2][2] = (ACC(ix,iy,iz-1)-2.0*ACC(ix,iy,iz)+ACC(ix,iy,iz+1)) * h2;
-											
-						D[0][1] = D[1][0] = (ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))*h2_4;
-						D[0][2] = D[2][0] = (ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))*h2_4;
-						D[1][2] = D[2][1] = (ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))*h2_4;
-						
-						D[0][0] += 1.0;
-						D[1][1] += 1.0;
-						D[2][2] += 1.0;
-						
-						double det = D[0][0]*D[1][1]*D[2][2]
-						-	D[0][0]*D[1][2]*D[2][1]
-						-   D[1][0]*D[0][1]*D[2][2]
-						+	D[1][0]*D[0][2]*D[1][2]
-						+	D[2][0]*D[0][1]*D[1][2]
-						-	D[2][0]*D[0][2]*D[1][1];
-						
-						(*pvar)(ix,iy,iz) = 1.0/det-1.0;
-						
-					}
-		}
-		else if ( order == 4 )
-		{
-			#pragma omp parallel for 
-			for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix )
-				for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy )
-					for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz )
-					{
-						double D[3][3];
-						
-						D[0][0] = (-ACC(ix-2,iy,iz)+16.*ACC(ix-1,iy,iz)-30.0*ACC(ix,iy,iz)+16.*ACC(ix+1,iy,iz)-ACC(ix+2,iy,iz)) * h2/12.0;
-						D[1][1] = (-ACC(ix,iy-2,iz)+16.*ACC(ix,iy-1,iz)-30.0*ACC(ix,iy,iz)+16.*ACC(ix,iy+1,iz)-ACC(ix,iy+2,iz)) * h2/12.0;
-						D[2][2] = (-ACC(ix,iy,iz-2)+16.*ACC(ix,iy,iz-1)-30.0*ACC(ix,iy,iz)+16.*ACC(ix,iy,iz+1)-ACC(ix,iy,iz+2)) * h2/12.0;
-						
-						D[0][1] = D[1][0] = (ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))*h2_4;
-						D[0][2] = D[2][0] = (ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))*h2_4;
-						D[1][2] = D[2][1] = (ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))*h2_4;
-						
-						
-						D[0][0] += 1.0;
-						D[1][1] += 1.0;
-						D[2][2] += 1.0;
-						
-						double det = D[0][0]*D[1][1]*D[2][2]
-						-	D[0][0]*D[1][2]*D[2][1]
-						-   D[1][0]*D[0][1]*D[2][2]
-						+	D[1][0]*D[0][2]*D[1][2]
-						+	D[2][0]*D[0][1]*D[1][2]
-						-	D[2][0]*D[0][2]*D[1][1];
-						
-						(*pvar)(ix,iy,iz) = 1.0/det-1.0;
-						
-					}
-		}
-		else if ( order == 6 )
-		{
-			h2_4/=36.;
-			h2/=180.;
-			#pragma omp parallel for 
-			for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix )
-				for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy )
-					for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz )
-					{
-						double D[3][3];
-						
-						D[0][0] = (2.*ACC(ix-3,iy,iz)-27.*ACC(ix-2,iy,iz)+270.*ACC(ix-1,iy,iz)-490.0*ACC(ix,iy,iz)+270.*ACC(ix+1,iy,iz)-27.*ACC(ix+2,iy,iz)+2.*ACC(ix+3,iy,iz)) * h2;
-						D[1][1] = (2.*ACC(ix,iy-3,iz)-27.*ACC(ix,iy-2,iz)+270.*ACC(ix,iy-1,iz)-490.0*ACC(ix,iy,iz)+270.*ACC(ix,iy+1,iz)-27.*ACC(ix,iy+2,iz)+2.*ACC(ix,iy+3,iz)) * h2;
-						D[2][2] = (2.*ACC(ix,iy,iz-3)-27.*ACC(ix,iy,iz-2)+270.*ACC(ix,iy,iz-1)-490.0*ACC(ix,iy,iz)+270.*ACC(ix,iy,iz+1)-27.*ACC(ix,iy,iz+2)+2.*ACC(ix,iy,iz+3)) * h2;
-						
-						//.. this is actually 8th order accurate
-						D[0][1] = D[1][0] = (64.*(ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))
-											 -8.*(ACC(ix-2,iy-1,iz)-ACC(ix+2,iy-1,iz)-ACC(ix-2,iy+1,iz)+ACC(ix+2,iy+1,iz)
-												+ ACC(ix-1,iy-2,iz)-ACC(ix-1,iy+2,iz)-ACC(ix+1,iy-2,iz)+ACC(ix+1,iy+2,iz))
-											 +1.*(ACC(ix-2,iy-2,iz)-ACC(ix-2,iy+2,iz)-ACC(ix+2,iy-2,iz)+ACC(ix+2,iy+2,iz)))*h2_4;
-						D[0][2] = D[2][0] = (64.*(ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))
-											 -8.*(ACC(ix-2,iy,iz-1)-ACC(ix+2,iy,iz-1)-ACC(ix-2,iy,iz+1)+ACC(ix+2,iy,iz+1)
-												+ ACC(ix-1,iy,iz-2)-ACC(ix-1,iy,iz+2)-ACC(ix+1,iy,iz-2)+ACC(ix+1,iy,iz+2))
-											 +1.*(ACC(ix-2,iy,iz-2)-ACC(ix-2,iy,iz+2)-ACC(ix+2,iy,iz-2)+ACC(ix+2,iy,iz+2)))*h2_4;
-						D[1][2] = D[2][1] = (64.*(ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))
-											 -8.*(ACC(ix,iy-2,iz-1)-ACC(ix,iy+2,iz-1)-ACC(ix,iy-2,iz+1)+ACC(ix,iy+2,iz+1)
-												+ ACC(ix,iy-1,iz-2)-ACC(ix,iy-1,iz+2)-ACC(ix,iy+1,iz-2)+ACC(ix,iy+1,iz+2))
-											 +1.*(ACC(ix,iy-2,iz-2)-ACC(ix,iy-2,iz+2)-ACC(ix,iy+2,iz-2)+ACC(ix,iy+2,iz+2)))*h2_4;
-						
-						D[0][0] += 1.0;
-						D[1][1] += 1.0;
-						D[2][2] += 1.0;
-						
-						double det = D[0][0]*D[1][1]*D[2][2]
-						-	D[0][0]*D[1][2]*D[2][1]
-						-   D[1][0]*D[0][1]*D[2][2]
-						+	D[1][0]*D[0][2]*D[1][2]
-						+	D[2][0]*D[0][1]*D[1][2]
-						-	D[2][0]*D[0][2]*D[1][1];
-						
-						(*pvar)(ix,iy,iz) = 1.0/det-1.0;
-						
-					}
-			
-		}else
-			throw std::runtime_error("compute_LLA_density : invalid operator order specified");
 
+	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
+	{
+		double h = pow(2.0, ilevel), h2 = h * h, h2_4 = 0.25 * h2;
+		meshvar_bnd *pvar = fnew.get_grid(ilevel);
+
+		if (order == 2)
+		{
+#pragma omp parallel for
+			for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
+				for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
+					for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
+					{
+						double D[3][3];
+
+						D[0][0] = (ACC(ix - 1, iy, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix + 1, iy, iz)) * h2;
+						D[1][1] = (ACC(ix, iy - 1, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy + 1, iz)) * h2;
+						D[2][2] = (ACC(ix, iy, iz - 1) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy, iz + 1)) * h2;
+
+						D[0][1] = D[1][0] = (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) * h2_4;
+						D[0][2] = D[2][0] = (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) * h2_4;
+						D[1][2] = D[2][1] = (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) * h2_4;
+
+						D[0][0] += 1.0;
+						D[1][1] += 1.0;
+						D[2][2] += 1.0;
+
+						double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
+
+						(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
+					}
+		}
+		else if (order == 4)
+		{
+#pragma omp parallel for
+			for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
+				for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
+					for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
+					{
+						double D[3][3];
+
+						D[0][0] = (-ACC(ix - 2, iy, iz) + 16. * ACC(ix - 1, iy, iz) - 30.0 * ACC(ix, iy, iz) + 16. * ACC(ix + 1, iy, iz) - ACC(ix + 2, iy, iz)) * h2 / 12.0;
+						D[1][1] = (-ACC(ix, iy - 2, iz) + 16. * ACC(ix, iy - 1, iz) - 30.0 * ACC(ix, iy, iz) + 16. * ACC(ix, iy + 1, iz) - ACC(ix, iy + 2, iz)) * h2 / 12.0;
+						D[2][2] = (-ACC(ix, iy, iz - 2) + 16. * ACC(ix, iy, iz - 1) - 30.0 * ACC(ix, iy, iz) + 16. * ACC(ix, iy, iz + 1) - ACC(ix, iy, iz + 2)) * h2 / 12.0;
+
+						D[0][1] = D[1][0] = (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) * h2_4;
+						D[0][2] = D[2][0] = (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) * h2_4;
+						D[1][2] = D[2][1] = (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) * h2_4;
+
+						D[0][0] += 1.0;
+						D[1][1] += 1.0;
+						D[2][2] += 1.0;
+
+						double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
+
+						(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
+					}
+		}
+		else if (order == 6)
+		{
+			h2_4 /= 36.;
+			h2 /= 180.;
+#pragma omp parallel for
+			for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
+				for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
+					for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
+					{
+						double D[3][3];
+
+						D[0][0] = (2. * ACC(ix - 3, iy, iz) - 27. * ACC(ix - 2, iy, iz) + 270. * ACC(ix - 1, iy, iz) - 490.0 * ACC(ix, iy, iz) + 270. * ACC(ix + 1, iy, iz) - 27. * ACC(ix + 2, iy, iz) + 2. * ACC(ix + 3, iy, iz)) * h2;
+						D[1][1] = (2. * ACC(ix, iy - 3, iz) - 27. * ACC(ix, iy - 2, iz) + 270. * ACC(ix, iy - 1, iz) - 490.0 * ACC(ix, iy, iz) + 270. * ACC(ix, iy + 1, iz) - 27. * ACC(ix, iy + 2, iz) + 2. * ACC(ix, iy + 3, iz)) * h2;
+						D[2][2] = (2. * ACC(ix, iy, iz - 3) - 27. * ACC(ix, iy, iz - 2) + 270. * ACC(ix, iy, iz - 1) - 490.0 * ACC(ix, iy, iz) + 270. * ACC(ix, iy, iz + 1) - 27. * ACC(ix, iy, iz + 2) + 2. * ACC(ix, iy, iz + 3)) * h2;
+
+						//.. this is actually 8th order accurate
+						D[0][1] = D[1][0] = (64. * (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) - 8. * (ACC(ix - 2, iy - 1, iz) - ACC(ix + 2, iy - 1, iz) - ACC(ix - 2, iy + 1, iz) + ACC(ix + 2, iy + 1, iz) + ACC(ix - 1, iy - 2, iz) - ACC(ix - 1, iy + 2, iz) - ACC(ix + 1, iy - 2, iz) + ACC(ix + 1, iy + 2, iz)) + 1. * (ACC(ix - 2, iy - 2, iz) - ACC(ix - 2, iy + 2, iz) - ACC(ix + 2, iy - 2, iz) + ACC(ix + 2, iy + 2, iz))) * h2_4;
+						D[0][2] = D[2][0] = (64. * (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) - 8. * (ACC(ix - 2, iy, iz - 1) - ACC(ix + 2, iy, iz - 1) - ACC(ix - 2, iy, iz + 1) + ACC(ix + 2, iy, iz + 1) + ACC(ix - 1, iy, iz - 2) - ACC(ix - 1, iy, iz + 2) - ACC(ix + 1, iy, iz - 2) + ACC(ix + 1, iy, iz + 2)) + 1. * (ACC(ix - 2, iy, iz - 2) - ACC(ix - 2, iy, iz + 2) - ACC(ix + 2, iy, iz - 2) + ACC(ix + 2, iy, iz + 2))) * h2_4;
+						D[1][2] = D[2][1] = (64. * (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) - 8. * (ACC(ix, iy - 2, iz - 1) - ACC(ix, iy + 2, iz - 1) - ACC(ix, iy - 2, iz + 1) + ACC(ix, iy + 2, iz + 1) + ACC(ix, iy - 1, iz - 2) - ACC(ix, iy - 1, iz + 2) - ACC(ix, iy + 1, iz - 2) + ACC(ix, iy + 1, iz + 2)) + 1. * (ACC(ix, iy - 2, iz - 2) - ACC(ix, iy - 2, iz + 2) - ACC(ix, iy + 2, iz - 2) + ACC(ix, iy + 2, iz + 2))) * h2_4;
+
+						D[0][0] += 1.0;
+						D[1][1] += 1.0;
+						D[2][2] += 1.0;
+
+						double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
+
+						(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
+					}
+		}
+		else
+			throw std::runtime_error("compute_LLA_density : invalid operator order specified");
 	}
-	
 }
 
-
-void compute_Lu_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigned order )
+void compute_Lu_density(const grid_hierarchy &u, grid_hierarchy &fnew, unsigned order)
 {
 	fnew = u;
-	
-	for( unsigned ilevel=u.levelmin(); ilevel<=u.levelmax(); ++ilevel )
+
+	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0,ilevel), h2 = h*h;
+		double h = pow(2.0, ilevel), h2 = h * h;
 		meshvar_bnd *pvar = fnew.get_grid(ilevel);
-		
-		#pragma omp parallel for
-		for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix )
-			for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy )
-				for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz )
+
+#pragma omp parallel for
+		for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
+			for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
+				for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
 				{
 					double D[3][3];
-					
-					D[0][0] = 1.0 + (ACC(ix-1,iy,iz)-2.0*ACC(ix,iy,iz)+ACC(ix+1,iy,iz)) * h2;
-					D[1][1] = 1.0 + (ACC(ix,iy-1,iz)-2.0*ACC(ix,iy,iz)+ACC(ix,iy+1,iz)) * h2;
-					D[2][2] = 1.0 + (ACC(ix,iy,iz-1)-2.0*ACC(ix,iy,iz)+ACC(ix,iy,iz+1)) * h2;
-					
-					(*pvar)(ix,iy,iz) = -(D[0][0]+D[1][1]+D[2][2] - 3.0);
-					
+
+					D[0][0] = 1.0 + (ACC(ix - 1, iy, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix + 1, iy, iz)) * h2;
+					D[1][1] = 1.0 + (ACC(ix, iy - 1, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy + 1, iz)) * h2;
+					D[2][2] = 1.0 + (ACC(ix, iy, iz - 1) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy, iz + 1)) * h2;
+
+					(*pvar)(ix, iy, iz) = -(D[0][0] + D[1][1] + D[2][2] - 3.0);
 				}
 	}
-	
 }
 
-
-void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hierarchy& fnew )
+void compute_2LPT_source_FFT(config_file &cf_, const grid_hierarchy &u, grid_hierarchy &fnew)
 {
-	if( u.levelmin() != u.levelmax() )
+	if (u.levelmin() != u.levelmax())
 		throw std::runtime_error("FFT 2LPT can only be run in Unigrid mode!");
-	
+
 	fnew = u;
-	size_t nx,ny,nz,nzp;
+	size_t nx, ny, nz, nzp;
 	nx = u.get_grid(u.levelmax())->size(0);
 	ny = u.get_grid(u.levelmax())->size(1);
 	nz = u.get_grid(u.levelmax())->size(2);
-	nzp = 2*(nz/2+1);
-	
+	nzp = 2 * (nz / 2 + 1);
+
 	//... copy data ..................................................
-	fftw_real *data = new fftw_real[nx*ny*nzp];
-	fftw_complex *cdata = reinterpret_cast<fftw_complex*> (data);
-	
-	fftw_complex	*cdata_11, *cdata_12, *cdata_13, *cdata_22, *cdata_23, *cdata_33;
-	fftw_real		*data_11, *data_12, *data_13, *data_22, *data_23, *data_33;
-	
-	data_11 = new fftw_real[nx*ny*nzp]; cdata_11 = reinterpret_cast<fftw_complex*> (data_11);
-	data_12 = new fftw_real[nx*ny*nzp]; cdata_12 = reinterpret_cast<fftw_complex*> (data_12);
-	data_13 = new fftw_real[nx*ny*nzp]; cdata_13 = reinterpret_cast<fftw_complex*> (data_13);
-	data_22 = new fftw_real[nx*ny*nzp]; cdata_22 = reinterpret_cast<fftw_complex*> (data_22);
-	data_23 = new fftw_real[nx*ny*nzp]; cdata_23 = reinterpret_cast<fftw_complex*> (data_23);
-	data_33 = new fftw_real[nx*ny*nzp]; cdata_33 = reinterpret_cast<fftw_complex*> (data_33);
-	
-	#pragma omp parallel for
-	for( int i=0; i<(int)nx; ++i )
-		for( size_t j=0; j<ny; ++j )	
-			for( size_t k=0; k<nz; ++k )
+	real_t *data = new real_t[nx * ny * nzp];
+	complex_t *cdata = reinterpret_cast<complex_t *>(data);
+
+	complex_t *cdata_11, *cdata_12, *cdata_13, *cdata_22, *cdata_23, *cdata_33;
+	real_t *data_11, *data_12, *data_13, *data_22, *data_23, *data_33;
+
+	data_11 = new real_t[nx * ny * nzp];
+	cdata_11 = reinterpret_cast<complex_t *>(data_11);
+	data_12 = new real_t[nx * ny * nzp];
+	cdata_12 = reinterpret_cast<complex_t *>(data_12);
+	data_13 = new real_t[nx * ny * nzp];
+	cdata_13 = reinterpret_cast<complex_t *>(data_13);
+	data_22 = new real_t[nx * ny * nzp];
+	cdata_22 = reinterpret_cast<complex_t *>(data_22);
+	data_23 = new real_t[nx * ny * nzp];
+	cdata_23 = reinterpret_cast<complex_t *>(data_23);
+	data_33 = new real_t[nx * ny * nzp];
+	cdata_33 = reinterpret_cast<complex_t *>(data_33);
+
+#pragma omp parallel for
+	for (int i = 0; i < (int)nx; ++i)
+		for (size_t j = 0; j < ny; ++j)
+			for (size_t k = 0; k < nz; ++k)
 			{
-				size_t idx = ((size_t)i*ny+j)*nzp+k;
-				data[idx] = (*u.get_grid(u.levelmax()))(i,j,k);
+				size_t idx = ((size_t)i * ny + j) * nzp + k;
+				data[idx] = (*u.get_grid(u.levelmax()))(i, j, k);
 			}
-	
+
 	//... perform FFT and Poisson solve................................
-#ifdef FFTW3
-	
-	#ifdef SINGLE_PRECISION
-	fftwf_plan
-		plan  = fftwf_plan_dft_r2c_3d(nx,ny,nz, data, cdata, FFTW_ESTIMATE),
-		iplan = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata, data, FFTW_ESTIMATE),
-		ip11  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_11, data_11, FFTW_ESTIMATE),
-		ip12  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_12, data_12, FFTW_ESTIMATE),
-		ip13  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_13, data_13, FFTW_ESTIMATE),
-		ip22  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_22, data_22, FFTW_ESTIMATE),
-		ip23  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_23, data_23, FFTW_ESTIMATE),
-		ip33  = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_33, data_33, FFTW_ESTIMATE);
-	
-	fftwf_execute(plan);
-	
-	#else
-	
-	fftw_plan
-		plan  = fftw_plan_dft_r2c_3d(nx,ny,nz, data, cdata, FFTW_ESTIMATE),
-		iplan = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata, data, FFTW_ESTIMATE),
-		ip11  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_11, data_11, FFTW_ESTIMATE),
-		ip12  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_12, data_12, FFTW_ESTIMATE),
-		ip13  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_13, data_13, FFTW_ESTIMATE),
-		ip22  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_22, data_22, FFTW_ESTIMATE),
-		ip23  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_23, data_23, FFTW_ESTIMATE),
-		ip33  = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_33, data_33, FFTW_ESTIMATE);
-	
-	fftw_execute(plan);
-	
-	#endif
-	
-	double kfac = 2.0*M_PI;
-	double norm = 1.0/((double)(nx*ny*nz));
-	
-	#pragma omp parallel for
-	for( int i=0; i<(int)nx; ++i )
-		for( size_t j=0; j<ny; ++j )	
-			for( size_t l=0; l<nz/2+1; ++l )
+
+	fftw_plan_t
+			plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
+			iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE),
+			ip11 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_11, data_11, FFTW_ESTIMATE),
+			ip12 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_12, data_12, FFTW_ESTIMATE),
+			ip13 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_13, data_13, FFTW_ESTIMATE),
+			ip22 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_22, data_22, FFTW_ESTIMATE),
+			ip23 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_23, data_23, FFTW_ESTIMATE),
+			ip33 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_33, data_33, FFTW_ESTIMATE);
+
+	// forward r2c transform of data into cdata; the c2r plans ip11..ip33 are executed after the k-space loop
+	FFTW_API(execute)(plan);
+
+	double kfac = 2.0 * M_PI;
+	double norm = 1.0 / ((double)(nx * ny * nz));
+
+#pragma omp parallel for
+	for (int i = 0; i < (int)nx; ++i)
+		for (size_t j = 0; j < ny; ++j)
+			for (size_t l = 0; l < nz / 2 + 1; ++l)
 			{
-				int ii = i; if(ii>(int)nx/2) ii-=nx;
-				int jj = (int)j; if(jj>(int)ny/2) jj-=ny;
+				int ii = i; // signed mode number along x
+				if (ii > (int)nx / 2)
+					ii -= (int)nx; // cast keeps the subtraction in signed arithmetic (nx is size_t)
+				int jj = (int)j; // signed mode number along y
+				if (jj > (int)ny / 2)
+					jj -= (int)ny; // cast keeps the subtraction in signed arithmetic (ny is size_t)
 				double ki = (double)ii;
 				double kj = (double)jj;
 				double kk = (double)l;
-				
+
 				double k[3];
 				k[0] = (double)ki * kfac;
 				k[1] = (double)kj * kfac;
 				k[2] = (double)kk * kfac;
-				
-				size_t idx = ((size_t)i*ny+j)*nzp/2+l;
-				//double re = cdata[idx][0];
-				//double im = cdata[idx][1];
-				
-				cdata_11[idx][0] = -k[0]*k[0] * cdata[idx][0] * norm;
-				cdata_11[idx][1] = -k[0]*k[0] * cdata[idx][1] * norm;
-				
-				cdata_12[idx][0] = -k[0]*k[1] * cdata[idx][0] * norm;
-				cdata_12[idx][1] = -k[0]*k[1] * cdata[idx][1] * norm;
-				
-				cdata_13[idx][0] = -k[0]*k[2] * cdata[idx][0] * norm;
-				cdata_13[idx][1] = -k[0]*k[2] * cdata[idx][1] * norm;
-				
-				cdata_22[idx][0] = -k[1]*k[1] * cdata[idx][0] * norm;
-				cdata_22[idx][1] = -k[1]*k[1] * cdata[idx][1] * norm;
-				
-				cdata_23[idx][0] = -k[1]*k[2] * cdata[idx][0] * norm;
-				cdata_23[idx][1] = -k[1]*k[2] * cdata[idx][1] * norm;
-				
-				cdata_33[idx][0] = -k[2]*k[2] * cdata[idx][0] * norm;
-				cdata_33[idx][1] = -k[2]*k[2] * cdata[idx][1] * norm;
-				
-				
-				if( i==(int)nx/2||j==ny/2||l==nz/2)
+
+				size_t idx = ((size_t)i * ny + j) * nzp / 2 + l;
+				// double re = cdata[idx][0];
+				// double im = cdata[idx][1];
+
+				cdata_11[idx][0] = -k[0] * k[0] * cdata[idx][0] * norm;
+				cdata_11[idx][1] = -k[0] * k[0] * cdata[idx][1] * norm;
+
+				cdata_12[idx][0] = -k[0] * k[1] * cdata[idx][0] * norm;
+				cdata_12[idx][1] = -k[0] * k[1] * cdata[idx][1] * norm;
+
+				cdata_13[idx][0] = -k[0] * k[2] * cdata[idx][0] * norm;
+				cdata_13[idx][1] = -k[0] * k[2] * cdata[idx][1] * norm;
+
+				cdata_22[idx][0] = -k[1] * k[1] * cdata[idx][0] * norm;
+				cdata_22[idx][1] = -k[1] * k[1] * cdata[idx][1] * norm;
+
+				cdata_23[idx][0] = -k[1] * k[2] * cdata[idx][0] * norm;
+				cdata_23[idx][1] = -k[1] * k[2] * cdata[idx][1] * norm;
+
+				cdata_33[idx][0] = -k[2] * k[2] * cdata[idx][0] * norm;
+				cdata_33[idx][1] = -k[2] * k[2] * cdata[idx][1] * norm;
+
+				if (i == (int)nx / 2 || j == ny / 2 || l == nz / 2)
 				{
 					cdata_11[idx][0] = 0.0;
 					cdata_11[idx][1] = 0.0;
-					
+
 					cdata_12[idx][0] = 0.0;
 					cdata_12[idx][1] = 0.0;
-					
+
 					cdata_13[idx][0] = 0.0;
 					cdata_13[idx][1] = 0.0;
-					
+
 					cdata_22[idx][0] = 0.0;
 					cdata_22[idx][1] = 0.0;
-					
+
 					cdata_23[idx][0] = 0.0;
 					cdata_23[idx][1] = 0.0;
-					
+
 					cdata_33[idx][0] = 0.0;
 					cdata_33[idx][1] = 0.0;
 				}
-				
 			}
-	
+
 	delete[] data;
 	/*cdata_11[0][0]	= 0.0; cdata_11[0][1]	= 0.0;
 	 cdata_12[0][0]	= 0.0; cdata_12[0][1]	= 0.0;
@@ -315,175 +266,38 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 	 cdata_22[0][0]	= 0.0; cdata_22[0][1]	= 0.0;
 	 cdata_23[0][0]	= 0.0; cdata_23[0][1]	= 0.0;
 	 cdata_33[0][0]	= 0.0; cdata_33[0][1]	= 0.0;*/
-	
-	
-#ifdef SINGLE_PRECISION
-	fftwf_execute(ip11);
-	fftwf_execute(ip12);
-	fftwf_execute(ip13);
-	fftwf_execute(ip22);
-	fftwf_execute(ip23);
-	fftwf_execute(ip33);
-	
-	fftwf_destroy_plan(plan);
-	fftwf_destroy_plan(iplan);
-	fftwf_destroy_plan(ip11);
-	fftwf_destroy_plan(ip12);
-	fftwf_destroy_plan(ip13);
-	fftwf_destroy_plan(ip22);
-	fftwf_destroy_plan(ip23);
-	fftwf_destroy_plan(ip33);
-#else
-	fftw_execute(ip11);
-	fftw_execute(ip12);
-	fftw_execute(ip13);
-	fftw_execute(ip22);
-	fftw_execute(ip23);
-	fftw_execute(ip33);
-	
-	fftw_destroy_plan(plan);
-	fftw_destroy_plan(iplan);
-	fftw_destroy_plan(ip11);
-	fftw_destroy_plan(ip12);
-	fftw_destroy_plan(ip13);
-	fftw_destroy_plan(ip22);
-	fftw_destroy_plan(ip23);
-	fftw_destroy_plan(ip33);
 
-#endif
-//#endif
-	
-	
-#else
-	rfftwnd_plan 
-		plan = rfftw3d_create_plan( nx,ny,nz,
-								   FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
-		iplan = rfftw3d_create_plan( nx,ny,nz,
-									FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
-	
-	
-	#ifndef SINGLETHREAD_FFTW		
-	rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), plan, data, NULL );
-	#else
-	rfftwnd_one_real_to_complex( plan, data, NULL );
-	#endif
-//#endif
-	//double fac = -1.0/(nx*ny*nz);
-	double kfac = 2.0*M_PI;
-	double norm = 1.0/((double)(nx*ny*nz));
-	
-	#pragma omp parallel for
-	for( int i=0; i<(int)nx; ++i )
-		for( size_t j=0; j<ny; ++j )	
-			for( size_t l=0; l<nz/2+1; ++l )
+	FFTW_API(execute)(ip11);
+	FFTW_API(execute)(ip12);
+	FFTW_API(execute)(ip13);
+	FFTW_API(execute)(ip22);
+	FFTW_API(execute)(ip23);
+	FFTW_API(execute)(ip33);
+
+	FFTW_API(destroy_plan)(plan);
+	FFTW_API(destroy_plan)(iplan);
+	FFTW_API(destroy_plan)(ip11);
+	FFTW_API(destroy_plan)(ip12);
+	FFTW_API(destroy_plan)(ip13);
+	FFTW_API(destroy_plan)(ip22);
+	FFTW_API(destroy_plan)(ip23);
+	FFTW_API(destroy_plan)(ip33);
+
+//... copy data ..........................................
+#pragma omp parallel for
+	for (int i = 0; i < (int)nx; ++i)
+		for (size_t j = 0; j < ny; ++j)
+			for (size_t k = 0; k < nz; ++k)
 			{
-				int ii = (int)i; if(ii>(int)(nx/2)) ii-=(int)nx;
-				int jj = (int)j; if(jj>(int)(ny/2)) jj-=(int)ny;
-				double ki = (double)ii;
-				double kj = (double)jj;
-				double kk = (double)l;
-				
-				double k[3];
-				k[0] = (double)ki * kfac;
-				k[1] = (double)kj * kfac;
-				k[2] = (double)kk * kfac;
-				
-				size_t idx = ((size_t)i*ny+j)*nzp/2+l;
-				//double re = cdata[idx].re;
-				//double im = cdata[idx].im;
-				
-				cdata_11[idx].re = -k[0]*k[0] * cdata[idx].re * norm;
-				cdata_11[idx].im = -k[0]*k[0] * cdata[idx].im * norm;
-				
-				cdata_12[idx].re = -k[0]*k[1] * cdata[idx].re * norm;
-				cdata_12[idx].im = -k[0]*k[1] * cdata[idx].im * norm;
-				
-				cdata_13[idx].re = -k[0]*k[2] * cdata[idx].re * norm;
-				cdata_13[idx].im = -k[0]*k[2] * cdata[idx].im * norm;
-				
-				cdata_22[idx].re = -k[1]*k[1] * cdata[idx].re * norm;
-				cdata_22[idx].im = -k[1]*k[1] * cdata[idx].im * norm;
-				
-				cdata_23[idx].re = -k[1]*k[2] * cdata[idx].re * norm;
-				cdata_23[idx].im = -k[1]*k[2] * cdata[idx].im * norm;
-				
-				cdata_33[idx].re = -k[2]*k[2] * cdata[idx].re * norm;
-				cdata_33[idx].im = -k[2]*k[2] * cdata[idx].im * norm;
-				
-				
-				if( i==(int)(nx/2)||j==ny/2||l==nz/2)
-				{
-					cdata_11[idx].re = 0.0;
-					cdata_11[idx].im = 0.0;
-					
-					cdata_12[idx].re = 0.0;
-					cdata_12[idx].im = 0.0;
-					
-					cdata_13[idx].re = 0.0;
-					cdata_13[idx].im = 0.0;
-					
-					cdata_22[idx].re = 0.0;
-					cdata_22[idx].im = 0.0;
-					
-					cdata_23[idx].re = 0.0;
-					cdata_23[idx].im = 0.0;
-					
-					cdata_33[idx].re = 0.0;
-					cdata_33[idx].im = 0.0;
-				}
-				
-			}
-	
-	delete[] data;
-	/*cdata_11[0].re	= 0.0; cdata_11[0].im	= 0.0;
-	cdata_12[0].re	= 0.0; cdata_12[0].im	= 0.0;
-	cdata_13[0].re	= 0.0; cdata_13[0].im	= 0.0;
-	cdata_22[0].re	= 0.0; cdata_22[0].im	= 0.0;
-	cdata_23[0].re	= 0.0; cdata_23[0].im	= 0.0;
-	cdata_33[0].re	= 0.0; cdata_33[0].im	= 0.0;*/
-	
-	
-#ifndef SINGLETHREAD_FFTW		
-	//rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata, NULL );
-	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_11, NULL );
-	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_12, NULL );
-	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_13, NULL );
-	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_22, NULL );
-	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_23, NULL );
-	rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_33, NULL );
-#else
-	//rfftwnd_one_complex_to_real( iplan, cdata, NULL );
-	rfftwnd_one_complex_to_real(iplan, cdata_11, NULL );
-	rfftwnd_one_complex_to_real(iplan, cdata_12, NULL );
-	rfftwnd_one_complex_to_real(iplan, cdata_13, NULL );
-	rfftwnd_one_complex_to_real(iplan, cdata_22, NULL );
-	rfftwnd_one_complex_to_real(iplan, cdata_23, NULL );
-	rfftwnd_one_complex_to_real(iplan, cdata_33, NULL );
-#endif
-	
-	
-	
-	rfftwnd_destroy_plan(plan);
-	rfftwnd_destroy_plan(iplan);
-#endif
+				size_t ii = ((size_t)i * ny + j) * nzp + k;
+				(*fnew.get_grid(u.levelmax()))(i, j, k) = ((data_11[ii] * data_22[ii] - data_12[ii] * data_12[ii]) +
+																									 (data_11[ii] * data_33[ii] - data_13[ii] * data_13[ii]) +
+																									 (data_22[ii] * data_33[ii] - data_23[ii] * data_23[ii]));
 
-
-	//... copy data ..........................................
-	#pragma omp parallel for
-	for( int i=0; i<(int)nx; ++i )
-		for( size_t j=0; j<ny; ++j )	
-			for( size_t k=0; k<nz; ++k )
-			{
-				size_t ii = ((size_t)i*ny+j)*nzp+k;
-				(*fnew.get_grid(u.levelmax()))(i,j,k) = (( data_11[ii]*data_22[ii]-data_12[ii]*data_12[ii] ) +
-														 ( data_11[ii]*data_33[ii]-data_13[ii]*data_13[ii] ) +
-														 ( data_22[ii]*data_33[ii]-data_23[ii]*data_23[ii] ) );
-				
-				//(*fnew.get_grid(u.levelmax()))(i,j,k) = 
-				
+				//(*fnew.get_grid(u.levelmax()))(i,j,k) =
 			}
-	
-	//delete[] data;
+
+	// data itself was already released right after the k-space loop; only the six component buffers remain
 	delete[] data_11;
 	delete[] data_12;
 	delete[] data_13;
@@ -492,131 +306,96 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
 	delete[] data_33;
 }
 
-void compute_2LPT_source( const grid_hierarchy& u, grid_hierarchy& fnew, unsigned order )
+void compute_2LPT_source(const grid_hierarchy &u, grid_hierarchy &fnew, unsigned order)
 {
 	fnew = u;
-    fnew.zero();
-	
-	for( unsigned ilevel=u.levelmin(); ilevel<=u.levelmax(); ++ilevel )
+	fnew.zero();
+
+	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0,ilevel), h2 = h*h, h2_4 = 0.25*h2;
+		double h = pow(2.0, ilevel), h2 = h * h, h2_4 = 0.25 * h2;
 		meshvar_bnd *pvar = fnew.get_grid(ilevel);
-        
-		if ( order == 2 )
+
+		if (order == 2)
 		{
-			
-			#pragma omp parallel for
-			for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix )
-			  for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy )
-			    for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz )
-			      {
-				double D[3][3];
-				
-				D[0][0] = (ACC(ix-2,iy,iz)-2.0*ACC(ix,iy,iz)+ACC(ix+2,iy,iz)) * h2_4;
-				D[1][1] = (ACC(ix,iy-2,iz)-2.0*ACC(ix,iy,iz)+ACC(ix,iy+2,iz)) * h2_4;
-				D[2][2] = (ACC(ix,iy,iz-2)-2.0*ACC(ix,iy,iz)+ACC(ix,iy,iz+2)) * h2_4;
-				
-						
-				D[0][1] = D[1][0] = (ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))*h2_4;
-				D[0][2] = D[2][0] = (ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))*h2_4;
-				D[1][2] = D[2][1] = (ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))*h2_4;
-				
-				(*pvar)(ix,iy,iz) =  ( D[0][0]*D[1][1] - D[0][1]*D[0][1]
-						       + D[0][0]*D[2][2] - D[0][2]*D[0][2]
-						       + D[1][1]*D[2][2] - D[1][2]*D[1][2] );
-						
+
+#pragma omp parallel for
+			for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
+				for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
+					for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
+					{
+						double D[3][3];
+
+						D[0][0] = (ACC(ix - 2, iy, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix + 2, iy, iz)) * h2_4;
+						D[1][1] = (ACC(ix, iy - 2, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy + 2, iz)) * h2_4;
+						D[2][2] = (ACC(ix, iy, iz - 2) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy, iz + 2)) * h2_4;
+
+						D[0][1] = D[1][0] = (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) * h2_4;
+						D[0][2] = D[2][0] = (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) * h2_4;
+						D[1][2] = D[2][1] = (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) * h2_4;
+
+						(*pvar)(ix, iy, iz) = (D[0][0] * D[1][1] - D[0][1] * D[0][1] + D[0][0] * D[2][2] - D[0][2] * D[0][2] + D[1][1] * D[2][2] - D[1][2] * D[1][2]);
 					}
 		}
-		else if ( order == 4 || order == 6 )
+		else if (order == 4 || order == 6)
 		{
 			double h2_144 = h2 / 144.;
-                        #pragma omp parallel for 
-			for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix )
-			  for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy )
-			    for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz )
-			      {
-				//.. this is actually 8th order accurate
-				
-				double D[3][3];
+#pragma omp parallel for
+			for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
+				for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
+					for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
+					{
+						//.. this is actually 8th order accurate
 
-				D[0][0] = ((ACC(ix-4,iy,iz)+ACC(ix+4,iy,iz))
-					   - 16. * (ACC(ix-3,iy,iz)+ACC(ix+3,iy,iz))
-					   + 64. * (ACC(ix-2,iy,iz)+ACC(ix+2,iy,iz))
-					   + 16. * (ACC(ix-1,iy,iz)+ACC(ix+1,iy,iz))
-					   - 130.*  ACC(ix,iy,iz) ) * h2_144;
-				
-				D[1][1] = ((ACC(ix,iy-4,iz)+ACC(ix,iy+4,iz))
-					   - 16. * (ACC(ix,iy-3,iz)+ACC(ix,iy+3,iz))
-					   + 64. * (ACC(ix,iy-2,iz)+ACC(ix,iy+2,iz))
-					   + 16. * (ACC(ix,iy-1,iz)+ACC(ix,iy+1,iz))
-					   - 130.*  ACC(ix,iy,iz) ) * h2_144;
-				
-				D[2][2] = ((ACC(ix,iy,iz-4)+ACC(ix,iy,iz+4))
-					   - 16. * (ACC(ix,iy,iz-3)+ACC(ix,iy,iz+3))
-					   + 64. * (ACC(ix,iy,iz-2)+ACC(ix,iy,iz+2))
-					   + 16. * (ACC(ix,iy,iz-1)+ACC(ix,iy,iz+1))
-					   - 130.*  ACC(ix,iy,iz) ) * h2_144;
-                        
-                        
-				D[0][1] = D[1][0] = (64.*(ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))
-						     -8.*(ACC(ix-2,iy-1,iz)-ACC(ix+2,iy-1,iz)-ACC(ix-2,iy+1,iz)+ACC(ix+2,iy+1,iz)
-							  + ACC(ix-1,iy-2,iz)-ACC(ix-1,iy+2,iz)-ACC(ix+1,iy-2,iz)+ACC(ix+1,iy+2,iz))
-						     +1.*(ACC(ix-2,iy-2,iz)-ACC(ix-2,iy+2,iz)-ACC(ix+2,iy-2,iz)+ACC(ix+2,iy+2,iz)))*h2_144;
-				D[0][2] = D[2][0] = (64.*(ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))
-						     -8.*(ACC(ix-2,iy,iz-1)-ACC(ix+2,iy,iz-1)-ACC(ix-2,iy,iz+1)+ACC(ix+2,iy,iz+1)
-							  + ACC(ix-1,iy,iz-2)-ACC(ix-1,iy,iz+2)-ACC(ix+1,iy,iz-2)+ACC(ix+1,iy,iz+2))
-						     +1.*(ACC(ix-2,iy,iz-2)-ACC(ix-2,iy,iz+2)-ACC(ix+2,iy,iz-2)+ACC(ix+2,iy,iz+2)))*h2_144;
-				D[1][2] = D[2][1] = (64.*(ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))
-						     -8.*(ACC(ix,iy-2,iz-1)-ACC(ix,iy+2,iz-1)-ACC(ix,iy-2,iz+1)+ACC(ix,iy+2,iz+1)
-							  + ACC(ix,iy-1,iz-2)-ACC(ix,iy-1,iz+2)-ACC(ix,iy+1,iz-2)+ACC(ix,iy+1,iz+2))
-						     +1.*(ACC(ix,iy-2,iz-2)-ACC(ix,iy-2,iz+2)-ACC(ix,iy+2,iz-2)+ACC(ix,iy+2,iz+2)))*h2_144;
-				
-				(*pvar)(ix,iy,iz) =  ( D[0][0]*D[1][1] - SQR( D[0][1] )
-						       + D[0][0]*D[2][2] - SQR( D[0][2] )
-						       + D[1][1]*D[2][2] - SQR( D[1][2] ) );
-						
-			      }
-			
-			
+						double D[3][3];
+
+						D[0][0] = ((ACC(ix - 4, iy, iz) + ACC(ix + 4, iy, iz)) - 16. * (ACC(ix - 3, iy, iz) + ACC(ix + 3, iy, iz)) + 64. * (ACC(ix - 2, iy, iz) + ACC(ix + 2, iy, iz)) + 16. * (ACC(ix - 1, iy, iz) + ACC(ix + 1, iy, iz)) - 130. * ACC(ix, iy, iz)) * h2_144;
+
+						D[1][1] = ((ACC(ix, iy - 4, iz) + ACC(ix, iy + 4, iz)) - 16. * (ACC(ix, iy - 3, iz) + ACC(ix, iy + 3, iz)) + 64. * (ACC(ix, iy - 2, iz) + ACC(ix, iy + 2, iz)) + 16. * (ACC(ix, iy - 1, iz) + ACC(ix, iy + 1, iz)) - 130. * ACC(ix, iy, iz)) * h2_144;
+
+						D[2][2] = ((ACC(ix, iy, iz - 4) + ACC(ix, iy, iz + 4)) - 16. * (ACC(ix, iy, iz - 3) + ACC(ix, iy, iz + 3)) + 64. * (ACC(ix, iy, iz - 2) + ACC(ix, iy, iz + 2)) + 16. * (ACC(ix, iy, iz - 1) + ACC(ix, iy, iz + 1)) - 130. * ACC(ix, iy, iz)) * h2_144;
+
+						D[0][1] = D[1][0] = (64. * (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) - 8. * (ACC(ix - 2, iy - 1, iz) - ACC(ix + 2, iy - 1, iz) - ACC(ix - 2, iy + 1, iz) + ACC(ix + 2, iy + 1, iz) + ACC(ix - 1, iy - 2, iz) - ACC(ix - 1, iy + 2, iz) - ACC(ix + 1, iy - 2, iz) + ACC(ix + 1, iy + 2, iz)) + 1. * (ACC(ix - 2, iy - 2, iz) - ACC(ix - 2, iy + 2, iz) - ACC(ix + 2, iy - 2, iz) + ACC(ix + 2, iy + 2, iz))) * h2_144;
+						D[0][2] = D[2][0] = (64. * (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) - 8. * (ACC(ix - 2, iy, iz - 1) - ACC(ix + 2, iy, iz - 1) - ACC(ix - 2, iy, iz + 1) + ACC(ix + 2, iy, iz + 1) + ACC(ix - 1, iy, iz - 2) - ACC(ix - 1, iy, iz + 2) - ACC(ix + 1, iy, iz - 2) + ACC(ix + 1, iy, iz + 2)) + 1. * (ACC(ix - 2, iy, iz - 2) - ACC(ix - 2, iy, iz + 2) - ACC(ix + 2, iy, iz - 2) + ACC(ix + 2, iy, iz + 2))) * h2_144;
+						D[1][2] = D[2][1] = (64. * (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) - 8. * (ACC(ix, iy - 2, iz - 1) - ACC(ix, iy + 2, iz - 1) - ACC(ix, iy - 2, iz + 1) + ACC(ix, iy + 2, iz + 1) + ACC(ix, iy - 1, iz - 2) - ACC(ix, iy - 1, iz + 2) - ACC(ix, iy + 1, iz - 2) + ACC(ix, iy + 1, iz + 2)) + 1. * (ACC(ix, iy - 2, iz - 2) - ACC(ix, iy - 2, iz + 2) - ACC(ix, iy + 2, iz - 2) + ACC(ix, iy + 2, iz + 2))) * h2_144;
+
+						(*pvar)(ix, iy, iz) = (D[0][0] * D[1][1] - SQR(D[0][1]) + D[0][0] * D[2][2] - SQR(D[0][2]) + D[1][1] * D[2][2] - SQR(D[1][2]));
+					}
 		}
 		else
 			throw std::runtime_error("compute_2LPT_source : invalid operator order specified");
-
-
 	}
-	
-    	//.. subtract global mean so the multi-grid poisson solver behaves well
-	
-	for( int i=fnew.levelmax(); i>(int)fnew.levelmin(); --i )
-	  mg_straight().restrict( (*fnew.get_grid(i)), (*fnew.get_grid(i-1)) );
-	
+
+	//.. subtract global mean so the multi-grid poisson solver behaves well
+
+	for (int i = fnew.levelmax(); i > (int)fnew.levelmin(); --i)
+		mg_straight().restrict((*fnew.get_grid(i)), (*fnew.get_grid(i - 1)));
+
 	long double sum = 0.0;
-	int nx,ny,nz;
-	
+	int nx, ny, nz;
+
 	nx = fnew.get_grid(fnew.levelmin())->size(0);
 	ny = fnew.get_grid(fnew.levelmin())->size(1);
 	nz = fnew.get_grid(fnew.levelmin())->size(2);
-	
-	for( int ix=0; ix<nx; ++ix )
-	  for( int iy=0; iy<ny; ++iy )
-	    for( int iz=0; iz<nz; ++iz )
-	      sum += (*fnew.get_grid(fnew.levelmin()))(ix,iy,iz);
-	
-	sum /= (double)((size_t)nx*(size_t)ny*(size_t)nz);
-	
-	for( unsigned i=fnew.levelmin(); i<=fnew.levelmax(); ++i )
-	{		
+
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				sum += (*fnew.get_grid(fnew.levelmin()))(ix, iy, iz);
+
+	sum /= (double)((size_t)nx * (size_t)ny * (size_t)nz);
+
+	for (unsigned i = fnew.levelmin(); i <= fnew.levelmax(); ++i)
+	{
 		nx = fnew.get_grid(i)->size(0);
 		ny = fnew.get_grid(i)->size(1);
 		nz = fnew.get_grid(i)->size(2);
-		
-		for( int ix=0; ix<nx; ++ix )
-		  for( int iy=0; iy<ny; ++iy )
-		    for( int iz=0; iz<nz; ++iz )
-		      (*fnew.get_grid(i))(ix,iy,iz) -= sum;
+
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
+				for (int iz = 0; iz < nz; ++iz)
+					(*fnew.get_grid(i))(ix, iy, iz) -= sum;
 	}
-	
 }
 #undef SQR
 #undef ACC
-
diff --git a/src/densities.cc b/src/densities.cc
index 57651c4..74b1cf6 100644
--- a/src/densities.cc
+++ b/src/densities.cc
@@ -38,28 +38,15 @@ void fft_coarsen(m1 &v, m2 &V)
 	size_t nxf = v.size(0), nyf = v.size(1), nzf = v.size(2), nzfp = nzf + 2;
 	size_t nxF = V.size(0), nyF = V.size(1), nzF = V.size(2), nzFp = nzF + 2;
 
-	fftw_real *rcoarse = new fftw_real[nxF * nyF * nzFp];
-	fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse);
+	real_t *rcoarse = new real_t[nxF * nyF * nzFp];
+	complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
 
-	fftw_real *rfine = new fftw_real[nxf * nyf * nzfp];
-	fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine);
+	real_t *rfine = new real_t[nxf * nyf * nzfp];
+	complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_plan
-		pf = fftwf_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
-		ipc = fftwf_plan_dft_c2r_3d(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
-#else
-	fftw_plan
-		pf = fftw_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
-		ipc = fftw_plan_dft_c2r_3d(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
-#endif
-
-#else
-	rfftwnd_plan
-		pf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
-		ipc = rfftw3d_create_plan(nxF, nyF, nzF, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-#endif
+	fftw_plan_t
+		pf = FFTW_API(plan_dft_r2c_3d)(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
+		ipc = FFTW_API(plan_dft_c2r_3d)(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
 
 #pragma omp parallel for
 	for (int i = 0; i < (int)nxf; i++)
@@ -70,19 +57,7 @@ void fft_coarsen(m1 &v, m2 &V)
 				rfine[q] = v(i, j, k);
 			}
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_execute(pf);
-#else
-	fftw_execute(pf);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
-#else
-	rfftwnd_one_real_to_complex(pf, rfine, NULL);
-#endif
-#endif
+	FFTW_API(execute)(pf);
 
 	double fftnorm = 1.0 / ((double)nxF * (double)nyF * (double)nzF);
 
@@ -125,19 +100,7 @@ void fft_coarsen(m1 &v, m2 &V)
 
 	delete[] rfine;
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_execute(ipc);
-#else
-	fftw_execute(ipc);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipc, ccoarse, NULL);
-#else
-	rfftwnd_one_complex_to_real(ipc, ccoarse, NULL);
-#endif
-#endif
+	FFTW_API(execute)(ipc);
 
 #pragma omp parallel for
 	for (int i = 0; i < (int)nxF; i++)
@@ -150,18 +113,8 @@ void fft_coarsen(m1 &v, m2 &V)
 
 	delete[] rcoarse;
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_destroy_plan(pf);
-	fftwf_destroy_plan(ipc);
-#else
-	fftw_destroy_plan(pf);
-	fftw_destroy_plan(ipc);
-#endif
-#else
-	rfftwnd_destroy_plan(pf);
-	rfftwnd_destroy_plan(ipc);
-#endif
+	FFTW_API(destroy_plan)(pf);
+	FFTW_API(destroy_plan)(ipc);
 }
 
 template <typename m1, typename m2>
@@ -191,14 +144,14 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
 
 	size_t nxc = nxf / 2, nyc = nyf / 2, nzc = nzf / 2, nzcp = nzf / 2 + 2;
 
-	fftw_real *rcoarse = new fftw_real[nxc * nyc * nzcp];
-	fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse);
+	real_t *rcoarse = new real_t[nxc * nyc * nzcp];
+	complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
 
-	fftw_real *rfine = new fftw_real[nxf * nyf * nzfp];
-	fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine);
+	real_t *rfine = new real_t[nxf * nyf * nzfp];
+	complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
 
 	// copy coarse data to rcoarse[.]
-	memset(rcoarse, 0, sizeof(fftw_real) * nxc * nyc * nzcp);
+	memset(rcoarse, 0, sizeof(real_t) * nxc * nyc * nzcp);
 
 #pragma omp parallel for
 	for (int i = 0; i < (int)nxc; ++i)
@@ -221,36 +174,13 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
 				rfine[q] = v(i, j, k);
 			}
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_plan
-		pc = fftwf_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
-		pf = fftwf_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
-		ipf = fftwf_plan_dft_c2r_3d(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
-	fftwf_execute(pc);
-	fftwf_execute(pf);
-#else
-	fftw_plan
-		pc = fftw_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
-		pf = fftw_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
-		ipf = fftw_plan_dft_c2r_3d(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
-	fftw_execute(pc);
-	fftw_execute(pf);
-#endif
-#else
-	rfftwnd_plan
-		pc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
-		pf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
-		ipf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
 
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pc, rcoarse, NULL);
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
-#else
-	rfftwnd_one_real_to_complex(pc, rcoarse, NULL);
-	rfftwnd_one_real_to_complex(pf, rfine, NULL);
-#endif
-#endif
+	fftw_plan_t
+		pc = FFTW_API(plan_dft_r2c_3d)(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
+		pf = FFTW_API(plan_dft_r2c_3d)(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
+		ipf = FFTW_API(plan_dft_c2r_3d)(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
+	FFTW_API(execute)(pc);
+	FFTW_API(execute)(pf);
 
 	/*************************************************/
 	//.. perform actual interpolation
@@ -300,28 +230,11 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
 
 	/*************************************************/
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_execute(ipf);
-	fftwf_destroy_plan(pf);
-	fftwf_destroy_plan(pc);
-	fftwf_destroy_plan(ipf);
-#else
-	fftw_execute(ipf);
-	fftw_destroy_plan(pf);
-	fftw_destroy_plan(pc);
-	fftw_destroy_plan(ipf);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipf, cfine, NULL);
-#else
-	rfftwnd_one_complex_to_real(ipf, cfine, NULL);
-#endif
-	fftwnd_destroy_plan(pf);
-	fftwnd_destroy_plan(pc);
-	fftwnd_destroy_plan(ipf);
-#endif
+	FFTW_API(execute)(ipf);
+
+	FFTW_API(destroy_plan)(pf);
+	FFTW_API(destroy_plan)(pc);
+	FFTW_API(destroy_plan)(ipf);
 
 // copy back and normalize
 #pragma omp parallel for
@@ -349,8 +262,6 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
 	levelmin = cf.get_value_safe<unsigned>("setup", "levelmin_TF", levelminPoisson);
 	levelmax = cf.get_value<unsigned>("setup", "levelmax");
 
-	bool kspace = cf.get_value<bool>("setup", "kspace_TF");
-
 	bool fix  = cf.get_value_safe<bool>("setup","fix_mode_amplitude",false);
 	bool flip = cf.get_value_safe<bool>("setup","flip_mode_amplitude",false);
 
@@ -360,30 +271,10 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
 	music::ulog.Print("Running unigrid density convolution...");
 
 	//... select the transfer function to be used
-	convolution::kernel_creator *the_kernel_creator;
+	convolution::kernel_creator *the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k"];
 
-	if (kspace)
-	{
-		std::cout << " - Using k-space transfer function kernel.\n";
-		music::ulog.Print("Using k-space transfer function kernel.");
-
-#ifdef SINGLE_PRECISION
-		the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_float"];
-#else
-		the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_double"];
-#endif
-	}
-	else
-	{
-		std::cout << " - Using real-space transfer function kernel.\n";
-		music::ulog.Print("Using real-space transfer function kernel.");
-
-#ifdef SINGLE_PRECISION
-		the_kernel_creator = convolution::get_kernel_map()["tf_kernel_real_float"];
-#else
-		the_kernel_creator = convolution::get_kernel_map()["tf_kernel_real_double"];
-#endif
-	}
+	std::cout << " - Using k-space transfer function kernel.\n";
+	music::ulog.Print("Using k-space transfer function kernel.");
 
 	//... initialize convolution kernel
 	convolution::kernel *the_tf_kernel = the_kernel_creator->create(cf, ptf, refh, type);
@@ -402,7 +293,7 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
 	the_tf_kernel->fetch_kernel(levelmin, false);
 
 	//... perform convolution
-	convolution::perform<real_t>(the_tf_kernel, reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
+	convolution::perform(the_tf_kernel, reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
 
 	//... clean up kernel
 	delete the_tf_kernel;
@@ -451,17 +342,11 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
 
 	unsigned nbase = 1 << levelmin;
 
-	convolution::kernel_creator *the_kernel_creator;
+	convolution::kernel_creator *the_kernel_creator  = convolution::get_kernel_map()["tf_kernel_k"];
 
 	std::cout << " - Using k-space transfer function kernel.\n";
 	music::ulog.Print("Using k-space transfer function kernel.");
 
-#ifdef SINGLE_PRECISION
-	the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_float"];
-#else
-	the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_double"];
-#endif
-
 	convolution::kernel *the_tf_kernel = the_kernel_creator->create(cf, ptf, refh, type);
 
 	/***** PERFORM CONVOLUTIONS *****/
@@ -475,7 +360,7 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
 		top = new DensityGrid<real_t>(nbase, nbase, nbase);
 		music::ilog.Print("Performing noise convolution on level %3d", levelmin);
 		rand.load(*top, levelmin);
-		convolution::perform<real_t>(the_tf_kernel->fetch_kernel(levelmin, false), reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
+		convolution::perform(the_tf_kernel->fetch_kernel(levelmin, false), reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
 
 		delta.create_base_hierarchy(levelmin);
 		top->copy(*delta.get_grid(levelmin));
@@ -506,7 +391,7 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
 			// load white noise for patch
 			rand.load(*fine, levelmin + i);
 
-			convolution::perform<real_t>(the_tf_kernel->fetch_kernel(levelmin + i, true),
+			convolution::perform(the_tf_kernel->fetch_kernel(levelmin + i, true),
 										 reinterpret_cast<void *>(fine->get_data_ptr()), shift, fix, flip);
 
 			if( fourier_splicing ){
diff --git a/src/fd_schemes.hh b/src/fd_schemes.hh
index fa4be40..1ad2dd9 100644
--- a/src/fd_schemes.hh
+++ b/src/fd_schemes.hh
@@ -11,12 +11,13 @@
 #ifndef __FD_SCHEMES_HH
 #define __FD_SCHEMES_HH
 
+#include <general.hh>
 #include <vector>
 #include <stdexcept>
 
 
 //! abstract implementation of the Poisson/Force scheme
-template< class L, class G, typename real_t=double >
+template< class L, class G>
 class scheme
 {
 public:
@@ -57,10 +58,9 @@ public:
 };
 
 //! base class for finite difference gradients
-template< int nextent, typename T >
+template< int nextent>
 class gradient
 {
-	typedef T real_t;
 	std::vector<real_t> m_stencil;
 	const unsigned nl;
 public:
@@ -110,20 +110,21 @@ public:
 };
 
 //! base class for finite difference stencils
-template< int nextent, typename real_t >
+template< int nextent>
 class base_stencil
 {
 protected:
-	std::vector<real_t> m_stencil;
-	const unsigned nl;
+	static constexpr size_t nl{2*nextent+1};
+	std::array<real_t,nl*nl*nl> m_stencil;
+
 public:
 	bool m_modsource;
 	
 public:
-	base_stencil( bool amodsource = false )
-	: nl( 2*nextent+1 ), m_modsource( amodsource )
+	explicit base_stencil( bool amodsource = false )
+	: m_modsource( amodsource )
 	{
-		m_stencil.assign(nl*nl*nl,(real_t)0.0);
+		m_stencil.fill( (real_t)0.0 );
 	}
 	
 	real_t& operator()(int i, int j, int k)
@@ -176,8 +177,7 @@ public:
 //... Implementation of the Gradient schemes............................................
 
 
-template< typename real_t >
-class deriv_2P : public gradient<1,real_t>
+class deriv_2P : public gradient<1>
 {
 	
 public:
@@ -194,8 +194,7 @@ public:
 //... Implementation of the Laplacian schemes..........................................
 
 //! 7-point, 2nd order finite difference Laplacian
-template< typename real_t >
-class stencil_7P : public base_stencil<1,real_t>
+class stencil_7P : public base_stencil<1>
 {
 	
 public:
@@ -214,7 +213,7 @@ public:
 	inline real_t apply( const C& c, const int i, const int j, const int k ) const
 	{
 		//return c(i-1,j,k)+c(i+1,j,k)+c(i,j-1,k)+c(i,j+1,k)+c(i,j,k-1)+c(i,j,k+1)-6.0*c(i,j,k);
-		return (double)c(i-1,j,k)+(double)c(i+1,j,k)+(double)c(i,j-1,k)+(double)c(i,j+1,k)+(double)c(i,j,k-1)+(double)c(i,j,k+1)-6.0*(double)c(i,j,k);
+		return (real_t)c(i-1,j,k)+(real_t)c(i+1,j,k)+(real_t)c(i,j-1,k)+(real_t)c(i,j+1,k)+(real_t)c(i,j,k-1)+(real_t)c(i,j,k+1)-6.0*(real_t)c(i,j,k);
 	}
 	
 	template< class C >
@@ -230,8 +229,7 @@ public:
 };
 
 //! 13-point, 4th order finite difference Laplacian
-template< typename real_t >
-class stencil_13P : public base_stencil<2,real_t>
+class stencil_13P : public base_stencil<2>
 {
 	
 public:
@@ -279,8 +277,7 @@ public:
 
 
 //! 19-point, 6th order finite difference Laplacian
-template< typename real_t >
-class stencil_19P : public base_stencil<3,real_t>
+class stencil_19P : public base_stencil<3>
 {
 	
 public:
@@ -339,7 +336,6 @@ public:
 
 
 //! flux operator for the 4th order FD Laplacian
-template< typename real_t >
 class Laplace_flux_O4
 {
 public:
@@ -354,7 +350,7 @@ public:
 	template< class C >
 	inline double apply_x( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/12.0;
+		double fac = -((real_t)idir)/12.0;
 		return fac*(-c(i-2,j,k)+15.0*c(i-1,j,k)-15.0*c(i,j,k)+c(i+1,j,k));
 	}
 	
@@ -369,7 +365,7 @@ public:
 	template< class C >
 	inline double apply_y( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/12.0;
+		double fac = -((real_t)idir)/12.0;
 		return fac*(-c(i,j-2,k)+15.0*c(i,j-1,k)-15.0*c(i,j,k)+c(i,j+1,k));
 	}
 	
@@ -384,7 +380,7 @@ public:
 	template< class C >
 	inline double apply_z( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/12.0;
+		double fac = -((real_t)idir)/12.0;
 		return fac*(-c(i,j,k-2)+15.0*c(i,j,k-1)-15.0*c(i,j,k)+c(i,j,k+1));
 	}
 	
@@ -392,7 +388,6 @@ public:
 
 
 //! flux operator for the 6th order FD Laplacian
-template< typename real_t >
 class Laplace_flux_O6
 {
 public:
@@ -408,7 +403,7 @@ public:
 	template< class C >
 	inline double apply_x( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/180.0;
+		real_t fac = -((real_t)idir)/180.0;
 		return fac*(2.*c(i-3,j,k)-25.*c(i-2,j,k)+245.*c(i-1,j,k)-245.0*c(i,j,k)+25.*c(i+1,j,k)-2.*c(i+2,j,k));
 	}
 	
@@ -423,7 +418,7 @@ public:
 	template< class C >
 	inline double apply_y( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/180.0;
+		real_t fac = -((real_t)idir)/180.0;
 		return fac*(2.*c(i,j-3,k)-25.*c(i,j-2,k)+245.*c(i,j-1,k)-245.0*c(i,j,k)+25.*c(i,j+1,k)-2.*c(i,j+2,k));
 	}
 	
@@ -438,7 +433,7 @@ public:
 	template< class C >
 	inline double apply_z( int idir, const C& c, const int i, const int j, const int k )
 	{
-		double fac = -((double)idir)/180.0;
+		real_t fac = -((real_t)idir)/180.0;
 		return fac*(2.*c(i,j,k-3)-25.*c(i,j,k-2)+245.*c(i,j,k-1)-245.0*c(i,j,k)+25.*c(i,j,k+1)-2.*c(i,j,k+2));
 	}
 	
diff --git a/src/general.hh b/src/general.hh
index 789c35c..104cae6 100644
--- a/src/general.hh
+++ b/src/general.hh
@@ -8,75 +8,56 @@
  
 */
 
-#ifndef __GENERAL_HH
-#define __GENERAL_HH
+#pragma once
 
-#include "logger.hh"
+#include <logger.hh>
+#include <config_file.hh>
 
 #include <cassert>
-#include "omp.h"
+#include <omp.h>
 
-#ifdef WITH_MPI
-  #ifdef MANNO
-    #include <mpi.h>
-  #else
-    #include <mpi++.h>
-  #endif
-#else
-#include <time.h>
+#include <fftw3.h>
+
+#include "cmake_config.hh" // include CMake controlled configuration settings
+#if defined(USE_PRECISION_FLOAT) // single precision: real_t = float, fftwf_* API
+  using real_t = float;
+  using complex_t = fftwf_complex;
+  #define FFTW_PREFIX fftwf
+#elif defined(USE_PRECISION_DOUBLE) // double precision: real_t = double, fftw_* API
+  using real_t = double;
+  using complex_t = fftw_complex;
+  #define FFTW_PREFIX fftw
+#elif defined(USE_PRECISION_LONGDOUBLE) // extended precision: real_t = long double, fftwl_* API
+  using real_t = long double;
+  using complex_t = fftwl_complex;
+  #define FFTW_PREFIX fftwl
+#else // fail early: real_t, complex_t and FFTW_PREFIX would otherwise be left undefined
+  #error "no floating point precision selected: define one of USE_PRECISION_FLOAT, USE_PRECISION_DOUBLE or USE_PRECISION_LONGDOUBLE"
+#endif
 
-#ifdef FFTW3
-	#include <fftw3.h>
-	#if defined(SINGLE_PRECISION)
-	typedef float fftw_real;
-	#else
-	typedef double fftw_real;
-	#endif
+#define FFTW_GEN_NAME_PRIM(a, b) a##_##b
+#define FFTW_GEN_NAME(a, b) FFTW_GEN_NAME_PRIM(a, b)
+#define FFTW_API(x) FFTW_GEN_NAME(FFTW_PREFIX, x)
 
-#else
-	#if defined(SINGLE_PRECISION) and not defined(SINGLETHREAD_FFTW)
-	#include <srfftw.h>
-	#include <srfftw_threads.h>
-	#elif defined(SINGLE_PRECISION) and defined(SINGLETHREAD_FFTW)
-	#include <srfftw.h>
-	#elif not defined(SINGLE_PRECISION) and not defined(SINGLETHREAD_FFTW)
-	#include <drfftw.h>
-	#include <drfftw_threads.h>
-	#elif not defined(SINGLE_PRECISION) and defined(SINGLETHREAD_FFTW)
-	#include <drfftw.h>
-	#endif
-#endif
+using fftw_plan_t = FFTW_GEN_NAME(FFTW_PREFIX, plan);
 
-#ifdef SINGLE_PRECISION
-	typedef float real_t;
-#else
-	typedef double real_t;
-#endif
+#define RE(x) ((x)[0])
+#define IM(x) ((x)[1])
 
+#include <vector>
 #include <array>
 using vec3_t = std::array<real_t,3>;
 
-#ifdef FFTW3
-	#define RE(x) ((x)[0])
-	#define IM(x) ((x)[1])
-#else
-	#define RE(x) ((x).re)
-	#define IM(x) ((x).im)
-#endif
-
-#if defined(FFTW3) && defined(SINGLE_PRECISION)
-#define fftw_complex fftwf_complex
-#endif
-
-
-
-#include <vector>
-
-#include "config_file.hh"
-//#include "mesh.hh"
-
-
+namespace CONFIG // process-wide run-time settings; only declared here (defined in main.cc)
+{
+// extern int MPI_thread_support;
+// extern int MPI_task_rank;
+// extern int MPI_task_size;
+// extern bool MPI_ok;
+// extern bool MPI_threads_ok;
+extern bool FFTW_threads_ok; // result of FFTW_API(init_threads)() as stored by main()
+extern int num_threads; // thread count main() reads from config key "execution"/"NumThreads"
+} // namespace CONFIG
 
 //! compute square of argument
 template< typename T >
@@ -180,6 +161,3 @@ inline bool is_number(const std::string& s)
 	
 	return true;
 }
-
-
-#endif
diff --git a/src/main.cc b/src/main.cc
index ca316a8..b18fced 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -13,6 +13,8 @@
 #include <iomanip>
 #include <math.h>
 
+#include <thread>
+
 #include <gsl/gsl_rng.h>
 #include <gsl/gsl_randist.h>
 #include <gsl/gsl_integration.h>
@@ -26,25 +28,40 @@ extern "C"
 }
 #endif
 
-#include "general.hh"
-#include "defaults.hh"
-#include "output.hh"
+#include <exception>
+#include <cfenv>
 
-#include "config_file.hh"
+#include <general.hh>
+#include <defaults.hh>
+#include <output.hh>
 
-#include "poisson.hh"
-#include "mg_solver.hh"
-#include "fd_schemes.hh"
-#include "random.hh"
-#include "densities.hh"
+#include <config_file.hh>
 
-#include "convolution_kernel.hh"
-#include "cosmology.hh"
-#include "transfer_function.hh"
+#include <poisson.hh>
+#include <mg_solver.hh>
+#include <fd_schemes.hh>
+#include <random.hh>
+#include <densities.hh>
+
+#include <convolution_kernel.hh>
+#include <cosmology.hh>
+#include <transfer_function.hh>
 
 #define THE_CODE_NAME "music!"
 #define THE_CODE_VERSION "2.0a"
 
+// definitions of the CONFIG globals declared in general.hh, initialised with "default" values
+namespace CONFIG{
+// int  MPI_thread_support = -1;
+// int  MPI_task_rank = 0;
+// int  MPI_task_size = 1;
+// bool MPI_ok = false;
+// bool MPI_threads_ok = false;
+bool FFTW_threads_ok = false; // overwritten in main() with the result of FFTW_API(init_threads)()
+int  num_threads = 1; // overwritten in main() from config key "execution"/"NumThreads"
+}
+
+
 namespace music
 {
 
@@ -87,11 +104,6 @@ void splash(void)
 
 #if defined(CMAKE_BUILD)
 	music::ilog.Print("Version built from git rev.: %s, tag: %s, branch: %s", GIT_REV, GIT_TAG, GIT_BRANCH);
-#endif
-#if defined(SINGLE_PRECISION)
-	music::ilog.Print("Version was compiled for single precision.");
-#else
-	music::ilog.Print("Version was compiled for double precision.");
 #endif
 	std::cout << "\n\n";
 }
@@ -294,6 +306,50 @@ void add_constant_value( grid_hierarchy &u, const double val )
 	}
 }
 
+#include <system_stat.hh>
+//! print CPU, thread, memory, OS/kernel and FFTW information to music::ilog
+void output_system_info()
+{
+	std::feclearexcept(FE_ALL_EXCEPT); // clear all floating-point exception flags first
+	//------------------------------------------------------------------------------
+	// Write code configuration to screen
+	//------------------------------------------------------------------------------
+	// hardware related infos
+	music::ilog << std::setw(32) << std::left << "CPU vendor string" << " : " << SystemStat::Cpu().get_CPUstring() << std::endl;
+	
+	// multi-threading related infos
+	music::ilog << std::setw(32) << std::left << "Available HW threads / task" << " : " << std::thread::hardware_concurrency() << " (" << CONFIG::num_threads << " used)" << std::endl;
+
+	// memory related infos
+	SystemStat::Memory mem;
+
+	unsigned availpmem = mem.get_AvailMem()/1024/1024; // scaled by 1/1024^2 and logged as "Mb" (presumably bytes in; TODO confirm)
+	unsigned usedpmem = mem.get_UsedMem()/1024/1024; // same scaling as availpmem
+	unsigned maxpmem = availpmem, minpmem = availpmem; // plain copies: logged "Max" and "Min" are identical
+	unsigned maxupmem = usedpmem, minupmem = usedpmem; // plain copies: logged "Max" and "Min" are identical
+	
+	music::ilog << std::setw(32) << std::left << "Total system memory (phys)" << " : " << mem.get_TotalMem()/1024/1024 << " Mb" << std::endl;
+	music::ilog << std::setw(32) << std::left << "Used system memory (phys)" << " : " << "Max: " << maxupmem << " Mb, Min: " << minupmem << " Mb" << std::endl;
+	music::ilog << std::setw(32) << std::left << "Available system memory (phys)" << " : " <<  "Max: " << maxpmem << " Mb, Min: " << minpmem << " Mb" << std::endl;
+			
+	// Kernel related infos
+	SystemStat::Kernel kern;
+	auto kinfo = kern.get_kernel_info();
+	music::ilog << std::setw(32) << std::left << "OS/Kernel version" << " : " << kinfo.kernel << " version " << kinfo.major << "." << kinfo.minor << " build " << kinfo.build_number << std::endl;
+
+	// FFTW related infos
+	music::ilog << std::setw(32) << std::left << "FFTW version" << " : " << FFTW_API(version) << std::endl;
+	music::ilog << std::setw(32) << std::left << "FFTW supports multi-threading" << " : " << (CONFIG::FFTW_threads_ok? "yes" : "no") << std::endl;
+	music::ilog << std::setw(32) << std::left << "FFTW mode" << " : ";
+#if defined(FFTW_MODE_PATIENT)
+	music::ilog << "FFTW_PATIENT" << std::endl;
+#elif defined(FFTW_MODE_MEASURE)
+    music::ilog << "FFTW_MEASURE" << std::endl;
+#else
+	music::ilog << "FFTW_ESTIMATE" << std::endl;
+#endif
+}
+
 /*****************************************************************************************************/
 /*****************************************************************************************************/
 /*****************************************************************************************************/
@@ -342,25 +398,6 @@ int main(int argc, const char *argv[])
 	music::ulog.Print("Running %s, version %s", THE_CODE_NAME, THE_CODE_VERSION);
 	music::ulog.Print("Log is for run started %s", asctime(localtime(&ltime)));
 
-#ifdef FFTW3
-	music::ulog.Print("Code was compiled using FFTW version 3.x");
-#else
-	music::ulog.Print("Code was compiled using FFTW version 2.x");
-#endif
-
-#ifdef SINGLETHREAD_FFTW
-	music::ulog.Print("Code was compiled for single-threaded FFTW");
-#else
-	music::ulog.Print("Code was compiled for multi-threaded FFTW");
-	music::ulog.Print("Running with a maximum of %d OpenMP threads", omp_get_max_threads());
-#endif
-
-#ifdef SINGLE_PRECISION
-	music::ulog.Print("Code was compiled for single precision.");
-#else
-	music::ulog.Print("Code was compiled for double precision.");
-#endif
-
 	//------------------------------------------------------------------------------
 	//... read and interpret config file
 	//------------------------------------------------------------------------------
@@ -369,6 +406,13 @@ int main(int argc, const char *argv[])
 	bool force_shift(false);
 	double boxlength;
 
+
+	//------------------------------------------------------------------------------
+	//... init multi-threading
+	//------------------------------------------------------------------------------
+	CONFIG::FFTW_threads_ok = FFTW_API(init_threads)(); // FFTW reports failure of its thread setup with a zero return value
+	CONFIG::num_threads = cf.get_value_safe<unsigned>("execution", "NumThreads",std::thread::hardware_concurrency());
+	if (CONFIG::FFTW_threads_ok && CONFIG::num_threads > 0) FFTW_API(plan_with_nthreads)(CONFIG::num_threads); // without this call FFTW keeps planning with one thread; skipped when the count is 0 (hardware_concurrency() may return 0)
 	//------------------------------------------------------------------------------
 	//... initialize some parameters about grid set-up
 	//------------------------------------------------------------------------------
@@ -403,24 +447,6 @@ int main(int argc, const char *argv[])
 	else
 		music::ilog.Print("Using real space sampled transfer functions...");
 
-		//------------------------------------------------------------------------------
-		//... initialize multithread FFTW
-		//------------------------------------------------------------------------------
-
-#if not defined(SINGLETHREAD_FFTW)
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_init_threads();
-	fftwf_plan_with_nthreads(omp_get_max_threads());
-#else
-	fftw_init_threads();
-	fftw_plan_with_nthreads(omp_get_max_threads());
-#endif
-#else
-	fftw_threads_init();
-#endif
-#endif
-
 	//------------------------------------------------------------------------------
 	//... initialize cosmology
 	//------------------------------------------------------------------------------
@@ -1373,13 +1399,8 @@ int main(int argc, const char *argv[])
 	delete the_transfer_function_plugin;
 	delete the_poisson_solver;
 
-#if defined(FFTW3) and not defined(SINGLETHREAD_FFTW)
-#ifdef SINGLE_PRECISION
-	fftwf_cleanup_threads();
-#else
-	fftw_cleanup_threads();
-#endif
-#endif
+	if( CONFIG::FFTW_threads_ok )
+		FFTW_API(cleanup_threads)();
 
 	//------------------------------------------------------------------------------
 	//... we are done !
diff --git a/src/mg_interp.hh b/src/mg_interp.hh
index c90115c..7a601da 100644
--- a/src/mg_interp.hh
+++ b/src/mg_interp.hh
@@ -290,12 +290,12 @@ struct cubic_interp
 							{
 								
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz+1);
 								
-								coarse_flux = Laplace_flux_O4<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
 								fine_flux /= 4.0;
 							
 								dflux = coarse_flux - fine_flux;
@@ -312,12 +312,12 @@ struct cubic_interp
 							{
 								
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz+1);
 								
-								coarse_flux = Laplace_flux_O4<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
 								fine_flux /= 4.0;
 								
 								dflux = coarse_flux - fine_flux;
@@ -338,12 +338,12 @@ struct cubic_interp
 							{
 								
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz+1);
 								
-								coarse_flux = Laplace_flux_O4<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
 								fine_flux /= 4.0;
 								
 								dflux = coarse_flux - fine_flux;
@@ -359,12 +359,12 @@ struct cubic_interp
 							{
 								
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz+1);
 								
-								coarse_flux = Laplace_flux_O4<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
 								fine_flux /= 4.0;
 								
 								dflux = coarse_flux - fine_flux;
@@ -384,12 +384,12 @@ struct cubic_interp
 							{
 								
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy+1,iz+1);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy,iz+1);
+								fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy+1,iz+1);
+								fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy+1,iz+1);
 								
-								coarse_flux = Laplace_flux_O4<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
 								fine_flux /= 4.0;
 								
 								dflux = coarse_flux - fine_flux;
@@ -405,12 +405,12 @@ struct cubic_interp
 							{
 								
 								fine_flux = 0.0;
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy+1,iz);
-								fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy,iz);
+								fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy+1,iz);
+								fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy+1,iz);
 								
-								coarse_flux = Laplace_flux_O4<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
+								coarse_flux = Laplace_flux_O4().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
 								fine_flux /= 4.0;
 								
 								dflux = coarse_flux - fine_flux;
@@ -717,13 +717,13 @@ struct interp_O5_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz+1);
 							fine_flux /= 4.0;
 							
-							coarse_flux = Laplace_flux_O4<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
 							
 							dflux = coarse_flux - fine_flux;
 							
@@ -758,12 +758,12 @@ struct interp_O5_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz+1);
 							
-							coarse_flux = Laplace_flux_O4<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -798,12 +798,12 @@ struct interp_O5_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz+1);
 							
-							coarse_flux = Laplace_flux_O4<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -838,12 +838,12 @@ struct interp_O5_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz+1);
 							
-							coarse_flux = Laplace_flux_O4<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -880,12 +880,12 @@ struct interp_O5_fluxcorr
 
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy+1,iz+1);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy+1,iz+1);
 							
-							coarse_flux = Laplace_flux_O4<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -920,12 +920,12 @@ struct interp_O5_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy+1,iz);
 							
-							coarse_flux = Laplace_flux_O4<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O4().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -1027,13 +1027,13 @@ struct interp_O7_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy+1,iz+1);
 							fine_flux /= 4.0;
 							
-							coarse_flux = Laplace_flux_O6<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
 							
 							dflux = coarse_flux - fine_flux;
 							
@@ -1074,12 +1074,12 @@ struct interp_O7_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy+1,iz+1);
 							
-							coarse_flux = Laplace_flux_O6<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -1119,12 +1119,12 @@ struct interp_O7_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix+1,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix,iy+1,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix+1,iy+1,iz+1);
 							
-							coarse_flux = Laplace_flux_O6<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -1164,12 +1164,12 @@ struct interp_O7_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix+1,iy,iz+1);
 							
-							coarse_flux = Laplace_flux_O6<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -1210,12 +1210,12 @@ struct interp_O7_fluxcorr
 							
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix+1,iy,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix,iy+1,iz+1);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix+1,iy,iz+1);
+							fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix,iy+1,iz+1);
+							fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix+1,iy+1,iz+1);
 							
-							coarse_flux = Laplace_flux_O6<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
@@ -1255,12 +1255,12 @@ struct interp_O7_fluxcorr
 								}
 							
 							fine_flux = 0.0;
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix+1,iy,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix,iy+1,iz);
-							fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix+1,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix+1,iy,iz);
+							fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix,iy+1,iz);
+							fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix+1,iy+1,iz);
 							
-							coarse_flux = Laplace_flux_O6<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
+							coarse_flux = Laplace_flux_O6().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
 							fine_flux /= 4.0;
 							
 							dflux = coarse_flux - fine_flux;
diff --git a/src/mg_solver.hh b/src/mg_solver.hh
index b2d53d2..c39cd23 100644
--- a/src/mg_solver.hh
+++ b/src/mg_solver.hh
@@ -1,37 +1,43 @@
 /*
- 
+
  mg_solver.hh - This file is part of MUSIC -
- a code to generate multi-scale initial conditions 
- for cosmological simulations 
- 
+ a code to generate multi-scale initial conditions
+ for cosmological simulations
+
  Copyright (C) 2010  Oliver Hahn
- 
+
 */
 
-#ifndef __MG_SOLVER_HH
-#define __MG_SOLVER_HH
+#pragma once
 
 #include <cmath>
 #include <iostream>
 
-#include "mg_operators.hh"
-#include "mg_interp.hh"
+#include <mg_operators.hh>
+#include <mg_interp.hh>
 
-#include "mesh.hh"
+#include <mesh.hh>
 
-#define BEGIN_MULTIGRID_NAMESPACE namespace multigrid {
+#define BEGIN_MULTIGRID_NAMESPACE \
+	namespace multigrid             \
+	{
 #define END_MULTIGRID_NAMESPACE }
 
 BEGIN_MULTIGRID_NAMESPACE
-	
+
 //! options for multigrid smoothing operation
-namespace opt {
-	enum smtype { sm_jacobi, sm_gauss_seidel, sm_sor };
+namespace opt
+{
+	enum smtype
+	{
+		sm_jacobi,
+		sm_gauss_seidel,
+		sm_sor
+	};
 }
 
-
 //! actual implementation of FAS adaptive multigrid solver
-template< class S, class I, class O, typename T=double >
+template <class S, class I, class O>
 class solver
 {
 public:
@@ -40,229 +46,214 @@ public:
 	typedef I interp;
 
 protected:
-	scheme				m_scheme;				//!< finite difference scheme
-	mgop				m_gridop;				//!< grid prolongation and restriction operator
-	unsigned			m_npresmooth,			//!< number of pre sweeps
-						m_npostsmooth;			//!< number of post sweeps
-	opt::smtype			m_smoother;				//!< smoothing method to be applied
-	unsigned			m_ilevelmin;			//!< index of the top grid level
-	
-	const static bool	m_bperiodic = true;		//!< flag whether top grid is periodic
-	
-	std::vector<double> m_residu_ini;			//!< vector of initial residuals for each level
-	bool m_is_ini;								//!< bool that is true for first iteration
+	scheme m_scheme;				//!< finite difference scheme
+	mgop m_gridop;					//!< grid prolongation and restriction operator
+	unsigned m_npresmooth,	//!< number of pre sweeps
+			m_npostsmooth;			//!< number of post sweeps
+	opt::smtype m_smoother; //!< smoothing method to be applied
+	unsigned m_ilevelmin;		//!< index of the top grid level
+
+	const static bool m_bperiodic = true; //!< flag whether top grid is periodic
+
+	std::vector<double> m_residu_ini; //!< vector of initial residuals for each level
+	bool m_is_ini;										//!< bool that is true for first iteration
+
+	GridHierarchy<real_t>
+			*m_pu,		 //!< pointer to GridHierarchy for solution u
+			*m_pf,		 //!< pointer to GridHierarchy for right-hand-side
+			*m_pfsave; //!< pointer to saved state of right-hand-side (unused)
+
+	const MeshvarBnd<real_t> *m_pubnd;
 
-	GridHierarchy<T>	*m_pu,					//!< pointer to GridHierarchy for solution u
-						*m_pf,					//!< pointer to GridHierarchy for right-hand-side
-						*m_pfsave;				//!< pointer to saved state of right-hand-side (unused)
-	
-	const MeshvarBnd<T> *m_pubnd;
-	
 	//! compute residual for a level
-  double compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<T>& unew, int ilevel );
-	
+	double compute_error(const MeshvarBnd<real_t> &u, const MeshvarBnd<real_t> &unew, int ilevel);
+
 	//! compute residuals for entire grid hierarchy
-	double compute_error( const GridHierarchy<T>& uh, const GridHierarchy<T>& uhnew, bool verbose );
-	
+	double compute_error(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &uhnew, bool verbose);
+
 	//! compute residuals for entire grid hierarchy
-	double compute_RMS_resid( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose );
+	double compute_RMS_resid(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose);
 
 protected:
-	
-	//! Jacobi smoothing 
-	void Jacobi( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
-	
+	//! Jacobi smoothing
+	void Jacobi(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
+
 	//! Gauss-Seidel smoothing
-	void GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
-	
+	void GaussSeidel(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
+
 	//! Successive-Overrelaxation smoothing
-	void SOR( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
-	
+	void SOR(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
+
 	//! main two-grid (V-cycle) for multi-grid iterations
-	void twoGrid( unsigned ilevel );
-	
+	void twoGrid(unsigned ilevel);
+
 	//! apply boundary conditions
-	void setBC( unsigned ilevel );
-	
+	void setBC(unsigned ilevel);
+
 	//! make top grid periodic boundary conditions
-	void make_periodic( MeshvarBnd<T> *u );
-	
-	//void interp_coarse_fine_cubic( unsigned ilevel, MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine );
-		
+	void make_periodic(MeshvarBnd<real_t> *u);
+
+	// void interp_coarse_fine_cubic( unsigned ilevel, MeshvarBnd<real_t>& coarse, MeshvarBnd<real_t>& fine );
+
 public:
-	
 	//! constructor
-	solver( GridHierarchy<T>& f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth );
-	
+	solver(GridHierarchy<real_t> &f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth);
+
 	//! destructor
 	~solver()
-	{  }
-	
-	//! solve Poisson's equation 
-	double solve( GridHierarchy<T>& u, double accuracy, double h=-1.0, bool verbose=false );
-	
-	//! solve Poisson's equation 
-	double solve( GridHierarchy<T>& u, double accuracy, bool verbose=false )
 	{
-		return this->solve ( u, accuracy, -1.0, verbose );
 	}
-	
-	
-	
+
+	//! solve Poisson's equation
+	double solve(GridHierarchy<real_t> &u, double accuracy, double h = -1.0, bool verbose = false);
+
+	//! solve Poisson's equation
+	double solve(GridHierarchy<real_t> &u, double accuracy, bool verbose = false)
+	{
+		return this->solve(u, accuracy, -1.0, verbose);
+	}
 };
 
-
-template< class S, class I, class O, typename T >
-solver<S,I,O,T>::solver( GridHierarchy<T>& f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth )
-:	m_scheme(), m_gridop(), m_npresmooth( npresmooth ), m_npostsmooth( npostsmooth ), 
-m_smoother( smoother ), m_ilevelmin( f.levelmin() ), m_is_ini( true ), m_pf( &f )
-{ 
+template <class S, class I, class O>
+solver<S, I, O>::solver(GridHierarchy<real_t> &f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth)
+		: m_scheme(), m_gridop(), m_npresmooth(npresmooth), m_npostsmooth(npostsmooth),
+			m_smoother(smoother), m_ilevelmin(f.levelmin()), m_is_ini(true), m_pu(nullptr), m_pf(&f), m_pfsave(nullptr), m_pubnd(nullptr) // pointers that have no target yet start out null instead of indeterminate
+{
 	m_is_ini = true;
 }
 
-
-template< class S, class I, class O, typename T >
-void solver<S,I,O,T>::Jacobi( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
+template <class S, class I, class O>
+void solver<S, I, O>::Jacobi(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 {
 	int
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-	
-	double 
-		c0 = -1.0/m_scheme.ccoeff(),
-		h2 = h*h; 
-	
-	MeshvarBnd<T> uold(*u);
-	
-	double alpha = 0.95, ialpha = 1.0-alpha;
-	
-	#pragma omp parallel for
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( uold, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
-	
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
+
+	double
+			c0 = -1.0 / m_scheme.ccoeff(),
+			h2 = h * h;
+
+	MeshvarBnd<real_t> uold(*u); // copy of u taken before the sweep; the update below reads only from this copy
+
+	double alpha = 0.95, ialpha = 1.0 - alpha; // each cell becomes ialpha * old value + alpha * relaxed value
+
+#pragma omp parallel for
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(uold, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 }
 
-template< class S, class I, class O, typename T >
-void solver<S,I,O,T>::SOR( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
+template <class S, class I, class O>
+void solver<S, I, O>::SOR(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 {
 	int
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
 
-	double 
-		c0 = -1.0/m_scheme.ccoeff(),
-		h2 = h*h; 
-		
-	MeshvarBnd<T> uold(*u);
-	
-	double 
-		alpha = 1.2, 
-	//alpha = 2 / (1 + 4 * atan(1.0) / double(u->size(0)))-1.0, //.. ideal alpha
-		ialpha = 1.0-alpha;
-	
-	#pragma omp parallel for
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				if( (ix+iy+iz)%2==0 )
-					(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( uold, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
-	
-	
-	#pragma omp parallel for
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				if( (ix+iy+iz)%2!=0 )
-					(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( *u, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
-	
-	
-	
+	double
+			c0 = -1.0 / m_scheme.ccoeff(),
+			h2 = h * h;
+
+	MeshvarBnd<real_t> uold(*u);
+
+	double
+			alpha = 1.2,
+			// alpha = 2 / (1 + 4 * atan(1.0) / double(u->size(0)))-1.0, //.. ideal alpha
+			ialpha = 1.0 - alpha;
+
+#pragma omp parallel for
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				if ((ix + iy + iz) % 2 == 0) // first pass: cells with an even index sum, stencil evaluated on the copy uold
+					(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(uold, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
+
+#pragma omp parallel for
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				if ((ix + iy + iz) % 2 != 0) // second pass: cells with an odd index sum, stencil evaluated on *u, which now holds the first-pass results
+					(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(*u, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 }
 
-template< class S, class I, class O, typename T >
-void solver<S,I,O,T>::GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f )
+template <class S, class I, class O>
+void solver<S, I, O>::GaussSeidel(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 {
-	int 
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-	
-	T
-		c0 = -1.0/m_scheme.ccoeff(),
-		h2 = h*h; 
-	
-	for( int color=0; color < 2; ++color )
-		#pragma omp parallel for
-		for( int ix=0; ix<nx; ++ix )
-			for( int iy=0; iy<ny; ++iy )
-				for( int iz=0; iz<nz; ++iz )
-					if( (ix+iy+iz)%2 == color )
-						(*u)(ix,iy,iz) = (m_scheme.rhs( *u, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
-	
+	int
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
+
+	real_t
+			c0 = -1.0 / m_scheme.ccoeff(),
+			h2 = h * h;
+
+	for (int color = 0; color < 2; ++color) // two passes; pass `color` updates the cells whose (ix + iy + iz) % 2 equals color
+#pragma omp parallel for
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
+				for (int iz = 0; iz < nz; ++iz)
+					if ((ix + iy + iz) % 2 == color)
+						(*u)(ix, iy, iz) = (m_scheme.rhs(*u, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0; // written in place: the stencil reads *u directly, no copy is kept
 }
 
-
-template< class S, class I, class O, typename T >
-void solver<S,I,O,T>::twoGrid( unsigned ilevel )
+template <class S, class I, class O>
+void solver<S, I, O>::twoGrid(unsigned ilevel)
 {
-	MeshvarBnd<T> *uf, *uc, *ff, *fc;
-	
-	
-	double 
-		h = 1.0/(1<<ilevel),
-		c0 = -1.0/m_scheme.ccoeff(),
-		h2 = h*h; 
-	
+	MeshvarBnd<real_t> *uf, *uc, *ff, *fc;
+
+	double
+			h = 1.0 / (1 << ilevel),
+			c0 = -1.0 / m_scheme.ccoeff(),
+			h2 = h * h;
+
 	uf = m_pu->get_grid(ilevel);
-	ff = m_pf->get_grid(ilevel);	
-	
-	uc = m_pu->get_grid(ilevel-1);
-	fc = m_pf->get_grid(ilevel-1);	
-	
-	
-	int 
-		nx = uf->size(0), 
-		ny = uf->size(1), 
-		nz = uf->size(2);
-	
-	if( m_bperiodic && ilevel <= m_ilevelmin)
-		make_periodic( uf );
-	else if(!m_bperiodic)
-		setBC( ilevel );
-	
+	ff = m_pf->get_grid(ilevel);
+
+	uc = m_pu->get_grid(ilevel - 1);
+	fc = m_pf->get_grid(ilevel - 1);
+
+	int
+			nx = uf->size(0),
+			ny = uf->size(1),
+			nz = uf->size(2);
+
+	if (m_bperiodic && ilevel <= m_ilevelmin)
+		make_periodic(uf);
+	else if (!m_bperiodic)
+		setBC(ilevel);
+
 	//... do smoothing sweeps with specified solver
-	for( unsigned i=0; i<m_npresmooth; ++i ){
-		
-		if( ilevel > m_ilevelmin )
-			interp().interp_coarse_fine(ilevel,*uc,*uf);
-		
-		if( m_smoother == opt::sm_gauss_seidel )
-			GaussSeidel( h, uf, ff );
-			
-		else if( m_smoother == opt::sm_jacobi )
-			Jacobi( h, uf, ff);		
-			
-		else if( m_smoother == opt::sm_sor )
-			SOR( h, uf, ff );
-		
-		if( m_bperiodic && ilevel <= m_ilevelmin )
-			make_periodic( uf );
+	for (unsigned i = 0; i < m_npresmooth; ++i)
+	{
+
+		if (ilevel > m_ilevelmin)
+			interp().interp_coarse_fine(ilevel, *uc, *uf);
+
+		if (m_smoother == opt::sm_gauss_seidel)
+			GaussSeidel(h, uf, ff);
+
+		else if (m_smoother == opt::sm_jacobi)
+			Jacobi(h, uf, ff);
+
+		else if (m_smoother == opt::sm_sor)
+			SOR(h, uf, ff);
+
+		if (m_bperiodic && ilevel <= m_ilevelmin)
+			make_periodic(uf);
 	}
-			
-	
-	m_gridop.restrict( *uf, *uc );
-	
+
+	m_gridop.restrict(*uf, *uc);
+
 	//... essential!!
-	if( m_bperiodic && ilevel <= m_ilevelmin )
-		make_periodic( uc );
-	else if( ilevel > m_ilevelmin )
-		interp().interp_coarse_fine(ilevel,*uc,*uf);
-		
-	
+	if (m_bperiodic && ilevel <= m_ilevelmin)
+		make_periodic(uc);
+	else if (ilevel > m_ilevelmin)
+		interp().interp_coarse_fine(ilevel, *uc, *uf);
+
 	//....................................................................
 	//... we now use hard-coded restriction+operatore app, see below
 	/*meshvar_bnd Lu(*uf,false);
@@ -273,407 +264,383 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
 		for( int iy=0; iy<ny; ++iy )
 			for( int iz=0; iz<nz; ++iz )
 				Lu(ix,iy,iz) = m_scheme.apply( (*uf), ix, iy, iz )/h2;
-	
+
 	meshvar_bnd tLu(*uc,false);
-	
-	
+
+
 	//... restrict Lu
 	m_gridop.restrict( Lu, tLu );
 	Lu.deallocate();*/
-	//.................................................................... 
-	
-	int 
-		oxp = uf->offset(0),
-		oyp = uf->offset(1),
-		ozp = uf->offset(2);
-	
-	meshvar_bnd tLu(*uc,false);
-	#pragma omp parallel for
-	for( int ix=0; ix<nx/2; ++ix )
-	{	
-		int iix=2*ix;
-		for( int iy=0,iiy=0; iy<ny/2; ++iy,iiy+=2 )
-		
-		
-			for( int iz=0,iiz=0; iz<nz/2; ++iz,iiz+=2 )
-				tLu(ix+oxp,iy+oyp,iz+ozp) = 0.125 * (
-							 m_scheme.apply( (*uf), iix, iiy, iiz )
-							+m_scheme.apply( (*uf), iix, iiy, iiz+1 )
-							+m_scheme.apply( (*uf), iix, iiy+1, iiz )
-							+m_scheme.apply( (*uf), iix, iiy+1, iiz+1 )
-							+m_scheme.apply( (*uf), iix+1, iiy, iiz )
-							+m_scheme.apply( (*uf), iix+1, iiy, iiz+1 )
-							+m_scheme.apply( (*uf), iix+1, iiy+1, iiz )
-							+m_scheme.apply( (*uf), iix+1, iiy+1, iiz+1 )
-						)/h2;
+	//....................................................................
+
+	int
+			oxp = uf->offset(0),
+			oyp = uf->offset(1),
+			ozp = uf->offset(2);
+
+	meshvar_bnd tLu(*uc, false);
+#pragma omp parallel for
+	for (int ix = 0; ix < nx / 2; ++ix)
+	{
+		int iix = 2 * ix;
+		for (int iy = 0, iiy = 0; iy < ny / 2; ++iy, iiy += 2)
+
+			for (int iz = 0, iiz = 0; iz < nz / 2; ++iz, iiz += 2)
+				tLu(ix + oxp, iy + oyp, iz + ozp) = 0.125 * (m_scheme.apply((*uf), iix, iiy, iiz) + m_scheme.apply((*uf), iix, iiy, iiz + 1) + m_scheme.apply((*uf), iix, iiy + 1, iiz) + m_scheme.apply((*uf), iix, iiy + 1, iiz + 1) + m_scheme.apply((*uf), iix + 1, iiy, iiz) + m_scheme.apply((*uf), iix + 1, iiy, iiz + 1) + m_scheme.apply((*uf), iix + 1, iiy + 1, iiz) + m_scheme.apply((*uf), iix + 1, iiy + 1, iiz + 1)) / h2;
 	}
-	
+
 	//... restrict source term
-	m_gridop.restrict( *ff, *fc );
-	
+	m_gridop.restrict(*ff, *fc);
+
 	int oi, oj, ok;
 	oi = ff->offset(0);
 	oj = ff->offset(1);
 	ok = ff->offset(2);
-	
-	#pragma omp parallel for 
-	for( int ix=oi; ix<oi+(int)ff->size(0)/2; ++ix )
-		for( int iy=oj; iy<oj+(int)ff->size(1)/2; ++iy )
-			for( int iz=ok; iz<ok+(int)ff->size(2)/2; ++iz )
-				(*fc)(ix,iy,iz) += ((tLu( ix, iy, iz ) - (m_scheme.apply( *uc, ix, iy, iz )/(4.0*h2))));
-									
+
+#pragma omp parallel for
+	for (int ix = oi; ix < oi + (int)ff->size(0) / 2; ++ix)
+		for (int iy = oj; iy < oj + (int)ff->size(1) / 2; ++iy)
+			for (int iz = ok; iz < ok + (int)ff->size(2) / 2; ++iz)
+				(*fc)(ix, iy, iz) += ((tLu(ix, iy, iz) - (m_scheme.apply(*uc, ix, iy, iz) / (4.0 * h2))));
+
 	tLu.deallocate();
-	
-	meshvar_bnd ucsave(*uc,true);
-						
+
+	meshvar_bnd ucsave(*uc, true);
+
 	//... have we reached the end of the recursion or do we need to go up one level?
-	if( ilevel == 1 )
-		if( m_bperiodic )
-			(*uc)(0,0,0) = 0.0;
-		else 
-			(*uc)(0,0,0) = (m_scheme.rhs( (*uc), 0, 0, 0 ) + 4.0 * h2 * (*fc)(0,0,0))*c0;
+	if (ilevel == 1)
+		if (m_bperiodic)
+			(*uc)(0, 0, 0) = 0.0;
+		else
+			(*uc)(0, 0, 0) = (m_scheme.rhs((*uc), 0, 0, 0) + 4.0 * h2 * (*fc)(0, 0, 0)) * c0;
 	else
-		twoGrid( ilevel-1 );
-	
-	meshvar_bnd cc(*uc,false);
-	
-		
-	//... compute correction on coarse grid
-	#pragma omp parallel for
-	for( int ix=0; ix<(int)cc.size(0); ++ix )
-		for( int iy=0; iy<(int)cc.size(1); ++iy )
-			for( int iz=0; iz<(int)cc.size(2); ++iz )
-				cc(ix,iy,iz) = (*uc)(ix,iy,iz) - ucsave(ix,iy,iz);	
-		
+		twoGrid(ilevel - 1);
+
+	meshvar_bnd cc(*uc, false);
+
+//... compute correction on coarse grid
+#pragma omp parallel for
+	for (int ix = 0; ix < (int)cc.size(0); ++ix)
+		for (int iy = 0; iy < (int)cc.size(1); ++iy)
+			for (int iz = 0; iz < (int)cc.size(2); ++iz)
+				cc(ix, iy, iz) = (*uc)(ix, iy, iz) - ucsave(ix, iy, iz);
+
 	ucsave.deallocate();
 
-	if( m_bperiodic && ilevel <= m_ilevelmin )
-		make_periodic( &cc );
+	if (m_bperiodic && ilevel <= m_ilevelmin)
+		make_periodic(&cc);
+
+	m_gridop.prolong_add(cc, *uf);
 
-	m_gridop.prolong_add( cc, *uf );
-	
 	//... interpolate and apply coarse-fine boundary conditions on fine level
-	if( m_bperiodic && ilevel <= m_ilevelmin )
-		make_periodic( uf );
-	else if(!m_bperiodic)
-		setBC( ilevel );
-	
+	if (m_bperiodic && ilevel <= m_ilevelmin)
+		make_periodic(uf);
+	else if (!m_bperiodic)
+		setBC(ilevel);
+
 	//... do smoothing sweeps with specified solver
-	for( unsigned i=0; i<m_npostsmooth; ++i ){
-		
-		if( ilevel > m_ilevelmin )
-			interp().interp_coarse_fine(ilevel,*uc,*uf);
+	for (unsigned i = 0; i < m_npostsmooth; ++i)
+	{
 
-		if( m_smoother == opt::sm_gauss_seidel )
-			GaussSeidel( h, uf, ff );
-		
-		else if( m_smoother == opt::sm_jacobi )
-			Jacobi( h, uf, ff);		
-		
-		else if( m_smoother == opt::sm_sor )
-			SOR( h, uf, ff );
-		
-		if( m_bperiodic && ilevel <= m_ilevelmin )
-			make_periodic( uf );
+		if (ilevel > m_ilevelmin)
+			interp().interp_coarse_fine(ilevel, *uc, *uf);
 
+		if (m_smoother == opt::sm_gauss_seidel)
+			GaussSeidel(h, uf, ff);
+
+		else if (m_smoother == opt::sm_jacobi)
+			Jacobi(h, uf, ff);
+
+		else if (m_smoother == opt::sm_sor)
+			SOR(h, uf, ff);
+
+		if (m_bperiodic && ilevel <= m_ilevelmin)
+			make_periodic(uf);
 	}
-
 }
 
-template< class S, class I, class O, typename T >
-double solver<S,I,O,T>::compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<T>& f, int ilevel )
+template <class S, class I, class O>
+double solver<S, I, O>::compute_error(const MeshvarBnd<real_t> &u, const MeshvarBnd<real_t> &f, int ilevel)
 {
-	int 
-		nx = u.size(0), 
-		ny = u.size(1), 
-		nz = u.size(2);
-	
+	int
+			nx = u.size(0),
+			ny = u.size(1),
+			nz = u.size(2);
+
 	double err = 0.0, err2 = 0.0;
 	size_t count = 0;
 
-	double h = 1.0/(1ul<<ilevel), h2=h*h;
-	
-	#pragma omp parallel for reduction(+:err,count)
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-			  if( true )//fabs(unew(ix,iy,iz)) > 0.0 )//&& u(ix,iy,iz) != unew(ix,iy,iz) )
-				{
-				  //err += fabs(1.0 - (double)u(ix,iy,iz)/(double)unew(ix,iy,iz));
-				  /*err += fabs(((double)m_scheme.apply( u, ix, iy, iz )/h2 + (double)(f(ix,iy,iz)) ));
-				    err2 += fabs((double)f(ix,iy,iz));*/
+	double h = 1.0 / (1ul << ilevel), h2 = h * h;
 
-				  err += fabs( (double)m_scheme.apply( u, ix, iy, iz )/h2/(double)(f(ix,iy,iz)) + 1.0 );
+	// NOTE(review): the loop below divides by f(ix,iy,iz); a zero source value makes that term inf/NaN - confirm f is nonzero here
+#pragma omp parallel for reduction(+ : err, count)
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				if (true) // fabs(unew(ix,iy,iz)) > 0.0 )//&& u(ix,iy,iz) != unew(ix,iy,iz) )
+				{
+					// err += fabs(1.0 - (double)u(ix,iy,iz)/(double)unew(ix,iy,iz));
+					/*err += fabs(((double)m_scheme.apply( u, ix, iy, iz )/h2 + (double)(f(ix,iy,iz)) ));
+						err2 += fabs((double)f(ix,iy,iz));*/
+
+					err += fabs((double)m_scheme.apply(u, ix, iy, iz) / h2 / (double)(f(ix, iy, iz)) + 1.0);
 					++count;
 				}
-	
-	  if( count != 0 )
-	    err /= count; 
-	  
+
+	if (count != 0)
+		err /= count;
+
 	return err;
 }
 
-template< class S, class I, class O, typename T >
-double solver<S,I,O,T>::compute_error( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose )
+template <class S, class I, class O>
+double solver<S, I, O>::compute_error(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose)
 {
 	double maxerr = 0.0;
 
-	for( unsigned ilevel=uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel )
+	for (unsigned ilevel = uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel)
 	{
-		int 
-		  nx = uh.get_grid(ilevel)->size(0), 
-		  ny = uh.get_grid(ilevel)->size(1), 
-		  nz = uh.get_grid(ilevel)->size(2);
-	
+		int
+				nx = uh.get_grid(ilevel)->size(0),
+				ny = uh.get_grid(ilevel)->size(1),
+				nz = uh.get_grid(ilevel)->size(2);
+
 		double err = 0.0, mean_res = 0.0;
 		size_t count = 0;
 
-		double h = 1.0/(1ul<<ilevel), h2=h*h;
-	
-                #pragma omp parallel for reduction(+:err,count)
-		for( int ix=0; ix<nx; ++ix )
-		  for( int iy=0; iy<ny; ++iy )
-		    for( int iz=0; iz<nz; ++iz )
-			{
-			  double res =  (double)m_scheme.apply( *uh.get_grid(ilevel), ix, iy, iz ) + h2 * (double)((*fh.get_grid(ilevel))(ix,iy,iz));
-			  double val = (*uh.get_grid(ilevel))( ix, iy, iz );
+		double h = 1.0 / (1ul << ilevel), h2 = h * h;
 
-			  if( fabs(val) > 0.0 )
-			    {
-			      err += fabs( res/val );
-			      mean_res += fabs(res);
-			      ++count;
-			    }
-			}
-	
-		if( count != 0 )
-		  {
-		    err /= count; 
-		    mean_res /= count;
-		  }
-		if( verbose )
+		// mean_res is summed inside the loop as well, so it must be part of the reduction (otherwise it is a data race)
+#pragma omp parallel for reduction(+ : err, mean_res, count)
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
+				for (int iz = 0; iz < nz; ++iz)
+				{
+					double res = (double)m_scheme.apply(*uh.get_grid(ilevel), ix, iy, iz) + h2 * (double)((*fh.get_grid(ilevel))(ix, iy, iz));
+					double val = (*uh.get_grid(ilevel))(ix, iy, iz);
+
+					if (fabs(val) > 0.0)
+					{
+						err += fabs(res / val);
+						mean_res += fabs(res);
+						++count;
+					}
+				}
+
+		if (count != 0)
+		{
+			err /= count;
+			mean_res /= count;
+		}
+		if (verbose)
 			std::cout << "      Level " << std::setw(6) << ilevel << ",   Error = " << err << std::endl;
 
-		music::dlog.Print("[mg]      level %3d,  residual %g,  rel. error %g",ilevel, mean_res, err);
-		
-		maxerr = std::max(maxerr,err);
-		
+		music::dlog.Print("[mg]      level %3d,  residual %g,  rel. error %g", ilevel, mean_res, err);
+
+		maxerr = std::max(maxerr, err);
 	}
 	return maxerr;
 }
 
-template< class S, class I, class O, typename T >
-double solver<S,I,O,T>::compute_RMS_resid( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose )
+template <class S, class I, class O>
+double solver<S, I, O>::compute_RMS_resid(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose)
 {
-	if( m_is_ini )
-		m_residu_ini.assign( uh.levelmax()+1, 0.0 );
-	
-	double maxerr=0.0;
-	
-	for( unsigned ilevel=uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel )
+	if (m_is_ini)
+		m_residu_ini.assign(uh.levelmax() + 1, 0.0);
+
+	double maxerr = 0.0;
+
+	for (unsigned ilevel = uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel)
 	{
-		int 
-		nx = uh.get_grid(ilevel)->size(0), 
-		ny = uh.get_grid(ilevel)->size(1), 
-		nz = uh.get_grid(ilevel)->size(2);
-		
-		double h = 1.0/(1<<ilevel), h2=h*h;
+		int
+				nx = uh.get_grid(ilevel)->size(0),
+				ny = uh.get_grid(ilevel)->size(1),
+				nz = uh.get_grid(ilevel)->size(2);
+
+		double h = 1.0 / (1 << ilevel), h2 = h * h;
 		double sum = 0.0, sumd2 = 0.0;
 		size_t count = 0;
-		
-		#pragma omp parallel for reduction(+:sum,sumd2,count)
-		for( int ix=0; ix<nx; ++ix )
-			for( int iy=0; iy<ny; ++iy )
-				for( int iz=0; iz<nz; ++iz )
+
+#pragma omp parallel for reduction(+ \
+																	 : sum, sumd2, count)
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
+				for (int iz = 0; iz < nz; ++iz)
 				{
-					double d = (double)(*fh.get_grid(ilevel))(ix,iy,iz);
-					sumd2 += d*d;
-					
-					double r = ((double)m_scheme.apply( *uh.get_grid(ilevel), ix, iy, iz )/h2 + (double)(*fh.get_grid(ilevel))(ix,iy,iz));
-					sum += r*r;
+					double d = (double)(*fh.get_grid(ilevel))(ix, iy, iz);
+					sumd2 += d * d;
+
+					double r = ((double)m_scheme.apply(*uh.get_grid(ilevel), ix, iy, iz) / h2 + (double)(*fh.get_grid(ilevel))(ix, iy, iz));
+					sum += r * r;
 
 					++count;
 				}
-		
-		if( m_is_ini )
-			m_residu_ini[ilevel] =  sqrt(sum)/count;
-		
-		double err_abs = sqrt(sum/count);
-		double err_rel = err_abs / sqrt(sumd2/count);
-		
-		if( verbose && !m_is_ini )
-			std::cout << "      Level " << std::setw(6) << ilevel << ",   Error = " << err_rel << std::endl;		
-		
-		music::dlog.Print("[mg]      level %3d,  rms residual %g,  rel. error %g",ilevel, err_abs, err_rel);
-		
-		if( err_rel > maxerr )
+
+		if (m_is_ini)
+			m_residu_ini[ilevel] = sqrt(sum) / count;
+
+		double err_abs = sqrt(sum / count);
+		double err_rel = err_abs / sqrt(sumd2 / count);
+
+		if (verbose && !m_is_ini)
+			std::cout << "      Level " << std::setw(6) << ilevel << ",   Error = " << err_rel << std::endl;
+
+		music::dlog.Print("[mg]      level %3d,  rms residual %g,  rel. error %g", ilevel, err_abs, err_rel);
+
+		if (err_rel > maxerr)
 			maxerr = err_rel;
-		
 	}
-	
-	if( m_is_ini )
+
+	if (m_is_ini)
 		m_is_ini = false;
-	
+
 	return maxerr;
 }
 
-
-template< class S, class I, class O, typename T >
-double solver<S,I,O,T>::solve( GridHierarchy<T>& uh, double acc, double h, bool verbose )
+template <class S, class I, class O>
+double solver<S, I, O>::solve(GridHierarchy<real_t> &uh, double acc, double h, bool verbose)
 {
 
 	double err, maxerr = 1e30;
 	unsigned niter = 0;
-	
+
 	bool fullverbose = false;
-	
+
 	m_pu = &uh;
-	
-	//err = compute_RMS_resid( *m_pu, *m_pf, fullverbose );
-	
+
+	// err = compute_RMS_resid( *m_pu, *m_pf, fullverbose );
+
 	//... iterate ...//
 	while (true)
 	{
-		
+
 		music::ulog.Print("Performing multi-grid V-cycle...");
-		twoGrid( uh.levelmax() );
-		
-		//err = compute_RMS_resid( *m_pu, *m_pf, fullverbose );
-		err = compute_error( *m_pu, *m_pf, fullverbose );
+		twoGrid(uh.levelmax());
+
+		// err = compute_RMS_resid( *m_pu, *m_pf, fullverbose );
+		err = compute_error(*m_pu, *m_pf, fullverbose);
 		++niter;
-		
-		if( fullverbose ){
-			music::ulog.Print("  multigrid iteration %3d, maximum RMS residual = %g", niter, err );
+
+		if (fullverbose)
+		{
+			music::ulog.Print("  multigrid iteration %3d, maximum RMS residual = %g", niter, err);
 			std::cout << "   - Step No. " << std::setw(3) << niter << ", Max Err = " << err << std::endl;
 			std::cout << "     ---------------------------------------------------\n";
 		}
-		
-		if( err < maxerr )
+
+		if (err < maxerr)
 			maxerr = err;
-			
-		if( (niter > 1) && ((err < acc) || (niter > 20)) )
+
+		if ((niter > 1) && ((err < acc) || (niter > 20)))
 			break;
-	}		
-	
-	if( err > acc )
-	{	
-		std::cout << "Error : no convergence in Poisson solver" << std::endl;
-		music::elog.Print("No convergence in Poisson solver, final error: %g.",err);
-	}
-	else if( verbose )
-	{	
-		std::cout << " - Converged in " << niter << " steps to " << maxerr << std::endl;
-		music::ulog.Print("Poisson solver converged to max. error of %g in %d steps.",err,niter);
 	}
 
-	
+	if (err > acc)
+	{
+		std::cout << "Error : no convergence in Poisson solver" << std::endl;
+		music::elog.Print("No convergence in Poisson solver, final error: %g.", err);
+	}
+	else if (verbose)
+	{
+		std::cout << " - Converged in " << niter << " steps to " << maxerr << std::endl;
+		music::ulog.Print("Poisson solver converged to max. error of %g in %d steps.", err, niter);
+	}
+
 	//.. make sure that the RHS does not contain the FAS corrections any more
-	for( int i=m_pf->levelmax(); i>0; --i )
-		m_gridop.restrict( *m_pf->get_grid(i), *m_pf->get_grid(i-1) );
-	
-	
+	for (int i = m_pf->levelmax(); i > 0; --i)
+		m_gridop.restrict(*m_pf->get_grid(i), *m_pf->get_grid(i - 1));
+
 	return err;
 }
 
-
-
-//TODO: this only works for 2nd order! (but actually not needed)
-template< class S, class I, class O, typename T >
-void solver<S,I,O,T>::setBC( unsigned ilevel )
+// TODO: this only works for 2nd order! (but actually not needed)
+template <class S, class I, class O>
+void solver<S, I, O>::setBC(unsigned ilevel)
 {
 	//... set only on level before additional refinement starts
-	if( ilevel == m_ilevelmin )
+	if (ilevel == m_ilevelmin)
 	{
-		MeshvarBnd<T> *u = m_pu->get_grid(ilevel);
+		MeshvarBnd<real_t> *u = m_pu->get_grid(ilevel);
 		int
-			nx = u->size(0), 
-			ny = u->size(1), 
-			nz = u->size(2);
-			
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
+				nx = u->size(0),
+				ny = u->size(1),
+				nz = u->size(2);
+
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
 			{
-				(*u)(-1,iy,iz) = 2.0*(*m_pubnd)(-1,iy,iz) - (*u)(0,iy,iz);
-				(*u)(nx,iy,iz) = 2.0*(*m_pubnd)(nx,iy,iz) - (*u)(nx-1,iy,iz);;
+				// ghost cells: the mean of the ghost value and its interior neighbour equals the m_pubnd value at the ghost index
+				(*u)(-1, iy, iz) = 2.0 * (*m_pubnd)(-1, iy, iz) - (*u)(0, iy, iz);
+				(*u)(nx, iy, iz) = 2.0 * (*m_pubnd)(nx, iy, iz) - (*u)(nx - 1, iy, iz);
 			}
-		
-		for( int ix=0; ix<nx; ++ix )
-			for( int iz=0; iz<nz; ++iz )
+
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iz = 0; iz < nz; ++iz)
 			{
-				(*u)(ix,-1,iz) = 2.0*(*m_pubnd)(ix,-1,iz) - (*u)(ix,0,iz);
-				(*u)(ix,ny,iz) = 2.0*(*m_pubnd)(ix,ny,iz) - (*u)(ix,ny-1,iz);
+				(*u)(ix, -1, iz) = 2.0 * (*m_pubnd)(ix, -1, iz) - (*u)(ix, 0, iz);
+				(*u)(ix, ny, iz) = 2.0 * (*m_pubnd)(ix, ny, iz) - (*u)(ix, ny - 1, iz);
 			}
-		
-		for( int ix=0; ix<nx; ++ix )
-			for( int iy=0; iy<ny; ++iy )
+
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
 			{
-				(*u)(ix,iy,-1) = 2.0*(*m_pubnd)(ix,iy,-1) - (*u)(ix,iy,0);
-				(*u)(ix,iy,nz) = 2.0*(*m_pubnd)(ix,iy,nz) - (*u)(ix,iy,nz-1);
-			}		
-		
-		
-		
+				(*u)(ix, iy, -1) = 2.0 * (*m_pubnd)(ix, iy, -1) - (*u)(ix, iy, 0);
+				(*u)(ix, iy, nz) = 2.0 * (*m_pubnd)(ix, iy, nz) - (*u)(ix, iy, nz - 1);
+			}
 	}
 }
 
-
-
 //... enforce periodic boundary conditions
-template< class S, class I, class O, typename T >
-void solver<S,I,O,T>::make_periodic( MeshvarBnd<T> *u )
+template <class S, class I, class O>
+void solver<S, I, O>::make_periodic(MeshvarBnd<real_t> *u)
 {
-	
 
 	int
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
 	int nb = u->m_nbnd;
-	
-		
-	//if( u->offset(0) == 0 )
-		for( int iy=-nb; iy<ny+nb; ++iy )
-			for( int iz=-nb; iz<nz+nb; ++iz )
+
+	// if( u->offset(0) == 0 )
+	for (int iy = -nb; iy < ny + nb; ++iy)
+		for (int iz = -nb; iz < nz + nb; ++iz)
+		{
+			int iiy((iy + ny) % ny), iiz((iz + nz) % nz);
+
+			for (int i = -nb; i < 0; ++i)
 			{
-				int iiy( (iy+ny)%ny ), iiz( (iz+nz)%nz );
-				
-				for( int i=-nb; i<0; ++i )
-				{
-					(*u)(i,iy,iz) = (*u)(nx+i,iiy,iiz);
-					(*u)(nx-1-i,iy,iz) = (*u)(-1-i,iiy,iiz);	
-				}
-				
+				(*u)(i, iy, iz) = (*u)(nx + i, iiy, iiz);
+				(*u)(nx - 1 - i, iy, iz) = (*u)(-1 - i, iiy, iiz);
 			}
-	
-	//if( u->offset(1) == 0 )
-		for( int ix=-nb; ix<nx+nb; ++ix )
-			for( int iz=-nb; iz<nz+nb; ++iz )
+		}
+
+	// if( u->offset(1) == 0 )
+	for (int ix = -nb; ix < nx + nb; ++ix)
+		for (int iz = -nb; iz < nz + nb; ++iz)
+		{
+			int iix((ix + nx) % nx), iiz((iz + nz) % nz);
+
+			for (int i = -nb; i < 0; ++i)
 			{
-				int iix( (ix+nx)%nx ), iiz( (iz+nz)%nz );
-				
-				for( int i=-nb; i<0; ++i )
-				{
-					(*u)(ix,i,iz) = (*u)(iix,ny+i,iiz);
-					(*u)(ix,ny-1-i,iz) = (*u)(iix,-1-i,iiz);
-				}
+				(*u)(ix, i, iz) = (*u)(iix, ny + i, iiz);
+				(*u)(ix, ny - 1 - i, iz) = (*u)(iix, -1 - i, iiz);
 			}
-	
-	//if( u->offset(2) == 0 )
-		for( int ix=-nb; ix<nx+nb; ++ix )
-			for( int iy=-nb; iy<ny+nb; ++iy )
+		}
+
+	// if( u->offset(2) == 0 )
+	for (int ix = -nb; ix < nx + nb; ++ix)
+		for (int iy = -nb; iy < ny + nb; ++iy)
+		{
+			int iix((ix + nx) % nx), iiy((iy + ny) % ny);
+
+			for (int i = -nb; i < 0; ++i)
 			{
-				int iix( (ix+nx)%nx ), iiy( (iy+ny)%ny );
-				
-				for( int i=-nb; i<0; ++i )
-				{
-					(*u)(ix,iy,i) = (*u)(iix,iiy,nz+i);
-					(*u)(ix,iy,nz-1-i) = (*u)(iix,iiy,-1-i);
-				}
+				(*u)(ix, iy, i) = (*u)(iix, iiy, nz + i);
+				(*u)(ix, iy, nz - 1 - i) = (*u)(iix, iiy, -1 - i);
 			}
-	
+		}
 }
 
-
 END_MULTIGRID_NAMESPACE
- 
-#endif
+
+
diff --git a/src/plugins/output_enzo.cc b/src/plugins/output_enzo.cc
index 2e0ae5d..9a7e0dd 100644
--- a/src/plugins/output_enzo.cc
+++ b/src/plugins/output_enzo.cc
@@ -230,19 +230,11 @@ protected:
 			HDFCreateFile(filename);
 			write_sim_header(filename, the_sim_header);
 
-#ifdef SINGLE_PRECISION
 			//... create full array in file
-			HDFHyperslabWriter3Ds<float> *slab_writer = new HDFHyperslabWriter3Ds<float>(filename, enzoname, nsz);
+			HDFHyperslabWriter3Ds<real_t> *slab_writer = new HDFHyperslabWriter3Ds<real_t>(filename, enzoname, nsz);
 
 			//... create buffer
-			float *data_buf = new float[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]];
-#else
-			//... create full array in file
-			HDFHyperslabWriter3Ds<double> *slab_writer = new HDFHyperslabWriter3Ds<double>(filename, enzoname, nsz);
-
-			//... create buffer
-			double *data_buf = new double[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]];
-#endif
+			real_t *data_buf = new real_t[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]];
 
 			//... write slice by slice
 			size_t slices_written = 0;
diff --git a/src/plugins/output_gadget2.cc b/src/plugins/output_gadget2.cc
index 32da217..866548b 100644
--- a/src/plugins/output_gadget2.cc
+++ b/src/plugins/output_gadget2.cc
@@ -1390,7 +1390,5 @@ public:
 namespace
 {
 	output_plugin_creator_concrete<gadget2_output_plugin<float>> creator1("gadget2");
-#ifndef SINGLE_PRECISION
 	output_plugin_creator_concrete<gadget2_output_plugin<double>> creator2("gadget2_double");
-#endif
 }
diff --git a/src/plugins/output_gadget2_2comp.cc b/src/plugins/output_gadget2_2comp.cc
index 1a7ccc4..327f81e 100644
--- a/src/plugins/output_gadget2_2comp.cc
+++ b/src/plugins/output_gadget2_2comp.cc
@@ -1,11 +1,11 @@
 /*
- 
+
  output_gadget2.cc - This file is part of MUSIC -
- a code to generate multi-scale initial conditions 
- for cosmological simulations 
- 
+ a code to generate multi-scale initial conditions
+ for cosmological simulations
+
  Copyright (C) 2010  Oliver Hahn
- 
+
  */
 
 #include <fstream>
@@ -14,1676 +14,1622 @@
 #include "mg_interp.hh"
 #include "mesh.hh"
 
-template< typename T_store=float >
+template <typename T_store = float>
 class gadget2_2comp_output_plugin : public output_plugin
 {
 protected:
-	
 	std::ofstream ofs_;
 	bool bmultimass_;
-	
-	
+
 	typedef struct io_header
 	{
-		unsigned int npart[6];                        
-		double mass[6];                      
-		double time;                         
-		double redshift;                     
-		int flag_sfr;                        
-		int flag_feedback;                   
-		unsigned int npartTotal[6];          
-		int flag_cooling;                    
-		int num_files;                       
-		double BoxSize;                      
-		double Omega0;                       
-		double OmegaLambda;                  
-		double HubbleParam;                  
-		int flag_stellarage;                 
-		int flag_metals;                     
-		unsigned int npartTotalHighWord[6];  
-		int  flag_entropy_instead_u;         
-                int flag_doubleprecision;
-                int flag_ic_info; 
-		char fill[52];                       
-	}header;                       
-	
-	
+		unsigned int npart[6];
+		double mass[6];
+		double time;
+		double redshift;
+		int flag_sfr;
+		int flag_feedback;
+		unsigned int npartTotal[6];
+		int flag_cooling;
+		int num_files;
+		double BoxSize;
+		double Omega0;
+		double OmegaLambda;
+		double HubbleParam;
+		int flag_stellarage;
+		int flag_metals;
+		unsigned int npartTotalHighWord[6];
+		int flag_entropy_instead_u;
+		int flag_doubleprecision;
+		int flag_ic_info;
+		char fill[52];
+	} header;
+
 	header header_;
-	
+
 	std::string fname;
-	
+
 	bool do_glass_;
 	std::string fname_glass_baryon_, fname_glass_cdm_;
-	
-	enum iofields {
-		id_dm_mass, id_dm_vel, id_dm_pos, id_gas_vel, id_gas_rho, id_gas_temp, id_gas_pos
+
+	enum iofields
+	{
+		id_dm_mass,
+		id_dm_vel,
+		id_dm_pos,
+		id_gas_vel,
+		id_gas_rho,
+		id_gas_temp,
+		id_gas_pos
 	};
-	
+
 	size_t np_fine_gas_, np_fine_dm_, np_coarse_dm_;
-	
+
 	size_t block_buf_size_;
 	unsigned long long npartmax_;
 	unsigned nfiles_;
-	
-	//bool bbndparticles_;
+
+	// bool bbndparticles_;
 	bool bmorethan2bnd_;
 	bool kpcunits_;
 	double YHe_;
-	
-	void distribute_particles( unsigned nfiles, size_t nfine_dm, size_t nfine_gas, size_t ncoarse, 
-                              std::vector<unsigned>& nfdm_pf, std::vector<unsigned>& nfgas_pf, std::vector<unsigned>& nc_pf )
-    {
-        nfdm_pf.assign( nfiles, 0 );
-        nfgas_pf.assign( nfiles, 0 );
-        nc_pf.assign( nfiles, 0 );
-        
-        size_t ntotal = nfine_dm + nfine_gas + ncoarse;
-        size_t nnominal = (size_t)((double)ntotal/(double)nfiles);
-        
-        size_t nf_dm_assigned = 0, nf_gas_assigned = 0, nc_assigned = 0;
-        
-        for( unsigned i=0; i<nfiles; ++i )
-        {
-            if( nfine_gas > 0 )
-            {
-                nfdm_pf[i] = std::min( nnominal/2ul, nfine_dm-nf_dm_assigned );
-                nf_dm_assigned += nfdm_pf[i];
-                nfgas_pf[i] = std::min( nnominal/2ul, nfine_gas-nf_gas_assigned );
-                nf_gas_assigned += nfgas_pf[i];
-                
-            }else{
-                nfdm_pf[i] = std::min( nnominal, nfine_dm-nf_dm_assigned );
-                nf_dm_assigned += nfdm_pf[i];
-            }
-            
-            // once all fine particles are assigned, start with the coarse
-            if( nf_dm_assigned+nf_gas_assigned == nfine_dm+nfine_gas )
-            {
-                nc_pf[i] = std::min( nnominal-(size_t)(nfdm_pf[i]+nfgas_pf[i]), ncoarse-nc_assigned );
-                nc_assigned += nc_pf[i];
-            }
-            
-        }
-        
-        // make sure all particles are assigned
-        nfdm_pf[ nfiles-1 ]     += nfine_dm-nf_dm_assigned;
-        nfgas_pf[ nfiles-1 ]    += nfine_gas-nf_gas_assigned;
-        nc_pf[ nfiles-1 ]       += ncoarse-nc_assigned;
-        
-    }
-	
-	std::ifstream& open_and_check( std::string ffname, size_t npart )
+
+	void distribute_particles(unsigned nfiles, size_t nfine_dm, size_t nfine_gas, size_t ncoarse,
+														std::vector<unsigned> &nfdm_pf, std::vector<unsigned> &nfgas_pf, std::vector<unsigned> &nc_pf)
 	{
-		std::ifstream ifs( ffname.c_str(), std::ios::binary );
-		unsigned long long blk, expected;
-		ifs.read( (char*)&blk, sizeof(unsigned long long) );
-                expected = ((unsigned long long) npart*(unsigned long long)sizeof(T_store));
-		if( blk != expected )
-		{	
-			music::elog.Print("Internal consistency error in gadget2 output plug-in, open_and_check");
-			music::elog.Print("Expected %d particles (%lld bytes) in temp file %s but found %lld",npart, expected ,ffname.c_str(), blk);
-			//throw std::runtime_error("Internal consistency error in gadget2 output plug-in");
+		nfdm_pf.assign(nfiles, 0);
+		nfgas_pf.assign(nfiles, 0);
+		nc_pf.assign(nfiles, 0);
+
+		size_t ntotal = nfine_dm + nfine_gas + ncoarse;
+		size_t nnominal = (size_t)((double)ntotal / (double)nfiles);
+
+		size_t nf_dm_assigned = 0, nf_gas_assigned = 0, nc_assigned = 0;
+
+		for (unsigned i = 0; i < nfiles; ++i)
+		{
+			if (nfine_gas > 0)
+			{
+				nfdm_pf[i] = std::min(nnominal / 2ul, nfine_dm - nf_dm_assigned);
+				nf_dm_assigned += nfdm_pf[i];
+				nfgas_pf[i] = std::min(nnominal / 2ul, nfine_gas - nf_gas_assigned);
+				nf_gas_assigned += nfgas_pf[i];
+			}
+			else
+			{
+				nfdm_pf[i] = std::min(nnominal, nfine_dm - nf_dm_assigned);
+				nf_dm_assigned += nfdm_pf[i];
+			}
+
+			// once all fine particles are assigned, start with the coarse
+			if (nf_dm_assigned + nf_gas_assigned == nfine_dm + nfine_gas)
+			{
+				nc_pf[i] = std::min(nnominal - (size_t)(nfdm_pf[i] + nfgas_pf[i]), ncoarse - nc_assigned);
+				nc_assigned += nc_pf[i];
+			}
 		}
-		
+
+		// make sure all particles are assigned
+		nfdm_pf[nfiles - 1] += nfine_dm - nf_dm_assigned;
+		nfgas_pf[nfiles - 1] += nfine_gas - nf_gas_assigned;
+		nc_pf[nfiles - 1] += ncoarse - nc_assigned;
+	}
+
+	// Opens temp file `ffname` and compares its leading 8-byte size record with npart * sizeof(T_store); a mismatch is only logged.
+	// Returned by value (moved): the stream is a local object, so the former reference return dangled.
+	std::ifstream open_and_check(std::string ffname, size_t npart)
+	{
+		std::ifstream ifs(ffname.c_str(), std::ios::binary);
+		unsigned long long blk = 0; // defined value even if the read below fails
+		const unsigned long long expected = (unsigned long long)npart * (unsigned long long)sizeof(T_store);
+		ifs.read((char *)&blk, sizeof(unsigned long long));
+		if (blk != expected)
+		{
+			music::elog.Print("Internal consistency error in gadget2 output plug-in, open_and_check");
+			music::elog.Print("Expected %zu particles (%llu bytes) in temp file %s but found %llu", npart, expected, ffname.c_str(), blk);
+		}
 		return ifs;
 	}
-	
+
 	class pistream : public std::ifstream
 	{
 	public:
-		pistream (std::string fname, size_t npart, size_t offset=0 )
-		: std::ifstream( fname.c_str(), std::ios::binary )
+		pistream(std::string fname, size_t npart, size_t offset = 0)
+				: std::ifstream(fname.c_str(), std::ios::binary)
 		{
 			size_t blk;
-			
-			if( !this->good() )
-			{	
+
+			if (!this->good())
+			{
 				music::elog.Print("Could not open buffer file in gadget2 output plug-in");
 				throw std::runtime_error("Could not open buffer file in gadget2 output plug-in");
 			}
-			
-			this->read( (char*)&blk, sizeof(size_t) );
-			
-			if( blk != npart*sizeof(T_store) )
-			{	
+
+			this->read((char *)&blk, sizeof(size_t));
+
+			if (blk != npart * sizeof(T_store))
+			{
 				music::elog.Print("Internal consistency error in gadget2 output plug-in");
-				music::elog.Print("Expected %ld bytes in temp file but found %ld",npart*sizeof(T_store),blk);
+				music::elog.Print("Expected %ld bytes in temp file but found %ld", npart * sizeof(T_store), blk);
 				throw std::runtime_error("Internal consistency error in gadget2 output plug-in");
 			}
-            
-			this->seekg( offset+sizeof(size_t), std::ios::beg );
+
+			this->seekg(offset + sizeof(size_t), std::ios::beg);
 		}
-		
-		pistream ()
+
+		pistream()
 		{
-			
 		}
-		
-		void open(std::string fname, size_t npart, size_t offset=0 )
+
+		void open(std::string fname, size_t npart, size_t offset = 0)
 		{
-			std::ifstream::open( fname.c_str(), std::ios::binary );
+			std::ifstream::open(fname.c_str(), std::ios::binary);
 			size_t blk;
-			
-			if( !this->good() )
-			{	
-				music::elog.Print("Could not open buffer file \'%s\' in gadget2 output plug-in",fname.c_str());
+
+			if (!this->good())
+			{
+				music::elog.Print("Could not open buffer file \'%s\' in gadget2 output plug-in", fname.c_str());
 				throw std::runtime_error("Could not open buffer file in gadget2 output plug-in");
 			}
-			
-			this->read( (char*)&blk, sizeof(size_t) );
-			
-			if( blk != npart*sizeof(T_store) )
-			{	
+
+			this->read((char *)&blk, sizeof(size_t));
+
+			if (blk != npart * sizeof(T_store))
+			{
 				music::elog.Print("Internal consistency error in gadget2 output plug-in");
-				music::elog.Print("Expected %ld bytes in temp file but found %ld",npart*sizeof(T_store),blk);
+				music::elog.Print("Expected %ld bytes in temp file but found %ld", npart * sizeof(T_store), blk);
 				throw std::runtime_error("Internal consistency error in gadget2 output plug-in");
 			}
-            
-			this->seekg( offset+sizeof(size_t), std::ios::beg );
+
+			this->seekg(offset + sizeof(size_t), std::ios::beg);
 		}
 	};
-    
-    class postream : public std::fstream
+
+	class postream : public std::fstream
 	{
 	public:
-		postream (std::string fname, size_t npart, size_t offset=0 )
-		: std::fstream( fname.c_str(), std::ios::binary|std::ios::in|std::ios::out )
+		postream(std::string fname, size_t npart, size_t offset = 0)
+				: std::fstream(fname.c_str(), std::ios::binary | std::ios::in | std::ios::out)
 		{
 			size_t blk;
-			
-			if( !this->good() )
-			{	
+
+			if (!this->good())
+			{
 				music::elog.Print("Could not open buffer file in gadget2 output plug-in");
 				throw std::runtime_error("Could not open buffer file in gadget2 output plug-in");
 			}
-			
-            this->read( (char*)&blk, sizeof(size_t) );
-			
-			if( blk != npart*sizeof(T_store) )
-			{	
+
+			this->read((char *)&blk, sizeof(size_t));
+
+			if (blk != npart * sizeof(T_store))
+			{
 				music::elog.Print("Internal consistency error in gadget2 output plug-in");
-				music::elog.Print("Expected %ld bytes in temp file but found %ld",npart*sizeof(T_store),blk);
+				music::elog.Print("Expected %ld bytes in temp file but found %ld", npart * sizeof(T_store), blk);
 				throw std::runtime_error("Internal consistency error in gadget2 output plug-in");
 			}
-            
-            this->seekg( offset, std::ios::cur );
-            this->seekp( offset+sizeof(size_t), std::ios::beg );
+
+			this->seekg(offset, std::ios::cur);
+			this->seekp(offset + sizeof(size_t), std::ios::beg);
 		}
-		
-		postream ()
+
+		postream()
 		{
-			
 		}
-		
-		void open(std::string fname, size_t npart, size_t offset=0 )
+
+		void open(std::string fname, size_t npart, size_t offset = 0)
 		{
-            if( is_open() )
-                this->close();
-            
-			std::fstream::open( fname.c_str(), std::ios::binary|std::ios::in|std::ios::out );
+			if (is_open())
+				this->close();
+
+			std::fstream::open(fname.c_str(), std::ios::binary | std::ios::in | std::ios::out);
 			size_t blk;
-			
-			if( !this->good() )
-			{	
-				music::elog.Print("Could not open buffer file \'%s\' in gadget2 output plug-in",fname.c_str());
+
+			if (!this->good())
+			{
+				music::elog.Print("Could not open buffer file \'%s\' in gadget2 output plug-in", fname.c_str());
 				throw std::runtime_error("Could not open buffer file in gadget2 output plug-in");
 			}
-			
-            this->read( (char*)&blk, sizeof(size_t) );
-			
-			if( blk != npart*sizeof(T_store) )
-			{	
+
+			this->read((char *)&blk, sizeof(size_t));
+
+			if (blk != npart * sizeof(T_store))
+			{
 				music::elog.Print("Internal consistency error in gadget2 output plug-in");
-				music::elog.Print("Expected %ld bytes in temp file but found %ld",npart*sizeof(T_store),blk);
+				music::elog.Print("Expected %zu bytes in temp file but found %zu", npart * sizeof(T_store), blk); // %zu: both arguments are size_t
 				throw std::runtime_error("Internal consistency error in gadget2 output plug-in");
 			}
-            
-            this->seekg( offset, std::ios::cur );
-            this->seekp( offset+sizeof(size_t), std::ios::beg );
+
+			this->seekg(offset, std::ios::cur);
+			this->seekp(offset + sizeof(size_t), std::ios::beg);
 		}
 	};
-	
-	void assemble_gadget_file( void )
+
+	void assemble_gadget_file(void) // interleaves the per-component temp files into nfiles_ output file(s) (header, positions, velocities, IDs), then removes the temp files
 	{
-		
-		
+
 		//............................................................................
 		//... copy from the temporary files, interleave the data and save ............
-		
-		char fnx[256],fny[256],fnz[256],fnvx[256],fnvy[256],fnvz[256],fnm[256];
-		char fnbx[256], fnby[256], fnbz[256], fnbvx[256], fnbvy[256], fnbvz[256];
-		
-		sprintf( fnx,  "___ic_temp_%05d.bin", 100*id_dm_pos+0 );
-		sprintf( fny,  "___ic_temp_%05d.bin", 100*id_dm_pos+1 );
-		sprintf( fnz,  "___ic_temp_%05d.bin", 100*id_dm_pos+2 );
-		sprintf( fnvx, "___ic_temp_%05d.bin", 100*id_dm_vel+0 );
-		sprintf( fnvy, "___ic_temp_%05d.bin", 100*id_dm_vel+1 );
-		sprintf( fnvz, "___ic_temp_%05d.bin", 100*id_dm_vel+2 );
-		sprintf( fnm,  "___ic_temp_%05d.bin", 100*id_dm_mass  );
 
-		sprintf( fnbx,  "___ic_temp_%05d.bin", 100*id_gas_pos+0 );
-		sprintf( fnby,  "___ic_temp_%05d.bin", 100*id_gas_pos+1 );
-		sprintf( fnbz,  "___ic_temp_%05d.bin", 100*id_gas_pos+2 );
-		sprintf( fnbvx, "___ic_temp_%05d.bin", 100*id_gas_vel+0 );
-		sprintf( fnbvy, "___ic_temp_%05d.bin", 100*id_gas_vel+1 );
-		sprintf( fnbvz, "___ic_temp_%05d.bin", 100*id_gas_vel+2 );
+		char fnx[256], fny[256], fnz[256], fnvx[256], fnvy[256], fnvz[256], fnm[256];
+		char fnbx[256], fnby[256], fnbz[256], fnbvx[256], fnbvy[256], fnbvz[256];
+
+		sprintf(fnx, "___ic_temp_%05d.bin", 100 * id_dm_pos + 0);
+		sprintf(fny, "___ic_temp_%05d.bin", 100 * id_dm_pos + 1);
+		sprintf(fnz, "___ic_temp_%05d.bin", 100 * id_dm_pos + 2);
+		sprintf(fnvx, "___ic_temp_%05d.bin", 100 * id_dm_vel + 0);
+		sprintf(fnvy, "___ic_temp_%05d.bin", 100 * id_dm_vel + 1);
+		sprintf(fnvz, "___ic_temp_%05d.bin", 100 * id_dm_vel + 2);
+		sprintf(fnm, "___ic_temp_%05d.bin", 100 * id_dm_mass);
+
+		sprintf(fnbx, "___ic_temp_%05d.bin", 100 * id_gas_pos + 0);
+		sprintf(fnby, "___ic_temp_%05d.bin", 100 * id_gas_pos + 1);
+		sprintf(fnbz, "___ic_temp_%05d.bin", 100 * id_gas_pos + 2);
+		sprintf(fnbvx, "___ic_temp_%05d.bin", 100 * id_gas_vel + 0);
+		sprintf(fnbvy, "___ic_temp_%05d.bin", 100 * id_gas_vel + 1);
+		sprintf(fnbvz, "___ic_temp_%05d.bin", 100 * id_gas_vel + 2);
 
 		pistream iffs1, iffs2, iffs3;
-		
-		/*const size_t 
+
+		/*const size_t
 			nptot = header_.npart[1]+header_.npart[2]+header_.npart[5],
 			npgas = header_.npart[2],
 			npcdm = nptot-npgas;*/
-		
-		const size_t 
-			nptot = np_fine_gas_+np_fine_dm_+np_coarse_dm_,
-			//npgas = np_fine_gas_,
-			npcdm = np_fine_dm_+np_coarse_dm_;
-			
+
+		const size_t
+				nptot = np_fine_gas_ + np_fine_dm_ + np_coarse_dm_,
+				// npgas = np_fine_gas_,
+				npcdm = np_fine_dm_ + np_coarse_dm_;
+
 		size_t
-			wrote_coarse = 0,
-			wrote_gas  = 0,
-			wrote_dm   = 0;
-		
+				wrote_coarse = 0,
+				wrote_gas = 0,
+				wrote_dm = 0;
+
 		size_t
-			npleft = nptot, 
-			n2read = std::min(block_buf_size_,npleft);
-		
-		if( header_.npart[5] > 0 )
+				npleft = nptot,
+				n2read = std::min(block_buf_size_, npleft);
+
+		if (header_.npart[5] > 0)
 			music::elog.Print("Multi-resolution setup not supported for 2comp hack");
-		
+
 		std::cout << " - Writing " << nptot << " particles to Gadget file...\n"
-				  << "      type 1 : " << header_.npart[1] << "\n"
-				  << "      type 2 : " << header_.npart[2] << "\n"
-				  << "      type 5 : " << header_.npart[5] << "\n";
-		
+							<< "      type 1 : " << header_.npart[1] << "\n"
+							<< "      type 2 : " << header_.npart[2] << "\n"
+							<< "      type 5 : " << header_.npart[5] << "\n";
+
 		bool bbaryons = np_fine_gas_ > 0;
-				
+
 		std::vector<T_store> adata3;
-		adata3.reserve( 3*block_buf_size_ );
+		adata3.reserve(3 * block_buf_size_);
 		T_store *tmp1, *tmp2, *tmp3;
-		
+
 		tmp1 = new T_store[block_buf_size_];
 		tmp2 = new T_store[block_buf_size_];
 		tmp3 = new T_store[block_buf_size_];
-		
+
 		std::vector<unsigned> nfdm_per_file, nfgas_per_file, nc_per_file;
-        distribute_particles( nfiles_, np_fine_dm_, np_fine_gas_, np_coarse_dm_,
-                             nfdm_per_file, nfgas_per_file, nc_per_file );
-		
-		
-		if( nfiles_ > 1 )
+		distribute_particles(nfiles_, np_fine_dm_, np_fine_gas_, np_coarse_dm_,
+												 nfdm_per_file, nfgas_per_file, nc_per_file);
+
+		if (nfiles_ > 1)
 		{
 			std::cout << " - Gadget2 : distributing particles to " << nfiles_ << " files\n"
-			<< "                 " << std::setw(12) << "type 1" << "," << std::setw(12) << "type 2" << "," << std::setw(12) << "type 5" << std::endl;
-			for( unsigned i=0; i<nfiles_; ++i )
+								<< "                 " << std::setw(12) << "type 1"
+								<< "," << std::setw(12) << "type 2"
+								<< "," << std::setw(12) << "type 5" << std::endl;
+			for (unsigned i = 0; i < nfiles_; ++i)
 			{
-				std::cout << "      file " << std::setw(3) << i << " : " 
-				<< std::setw(12) << nfdm_per_file[i] << "," 
-				<< std::setw(12) << nfgas_per_file[i] << "," 
-				<< std::setw(12) << nc_per_file[i] << std::endl;
-			}			
+				std::cout << "      file " << std::setw(3) << i << " : "
+									<< std::setw(12) << nfdm_per_file[i] << ","
+									<< std::setw(12) << nfgas_per_file[i] << ","
+									<< std::setw(12) << nc_per_file[i] << std::endl;
+			}
 		}
-		
-		
+
 		size_t curr_block_buf_size = block_buf_size_;
-        
-        size_t idcount = 0;
-        bool bneed_long_ids = false;
-        if( nptot >= 1ul<<32 )
-        {
-            bneed_long_ids = true;
-            music::wlog.Print("Need long particle IDs, make sure to enable in Gadget!");
-        }
-		
-		
-		for( unsigned ifile=0; ifile<nfiles_; ++ifile )
-        {
-			
-			if( nfiles_ > 1 )
+
+		size_t idcount = 0;
+		bool bneed_long_ids = false;
+		if (nptot >= 1ull << 32) // 1ull: shifting a 32-bit unsigned long by 32 bits would be undefined behaviour
+		{
+			bneed_long_ids = true;
+			music::wlog.Print("Need long particle IDs, make sure to enable in Gadget!");
+		}
+
+		for (unsigned ifile = 0; ifile < nfiles_; ++ifile)
+		{
+
+			if (nfiles_ > 1)
 			{
 				char ffname[256];
-				sprintf(ffname,"%s.%d",fname_.c_str(), ifile);
-				ofs_.open(ffname, std::ios::binary|std::ios::trunc );
-			}else{
-				ofs_.open(fname_.c_str(), std::ios::binary|std::ios::trunc );
+				snprintf(ffname, sizeof(ffname), "%s.%u", fname_.c_str(), ifile); // bounded write: fname_'s length is not checked against ffname[256]; %u matches unsigned ifile
+				ofs_.open(ffname, std::ios::binary | std::ios::trunc);
 			}
-			
-            
+			else
+			{
+				ofs_.open(fname_.c_str(), std::ios::binary | std::ios::trunc);
+			}
+
 			size_t np_this_file = nfgas_per_file[ifile] + nfdm_per_file[ifile] + nc_per_file[ifile];
-			
+
 			int blksize = sizeof(header);
-			
+
 			//... write the header .......................................................
-			
-			header this_header( header_ );
-            this_header.npart[1] = nfdm_per_file[ifile];
-            this_header.npart[2] = nfgas_per_file[ifile];
-            this_header.npart[5] = nc_per_file[ifile];
-			
-			
-			ofs_.write( (char *)&blksize, sizeof(int) );
-			ofs_.write( (char *)&this_header, sizeof(header) );
-			ofs_.write( (char *)&blksize, sizeof(int) );
-			
-			
+
+			header this_header(header_);
+			this_header.npart[1] = nfdm_per_file[ifile];
+			this_header.npart[2] = nfgas_per_file[ifile];
+			this_header.npart[5] = nc_per_file[ifile];
+
+			ofs_.write((char *)&blksize, sizeof(int));
+			ofs_.write((char *)&this_header, sizeof(header));
+			ofs_.write((char *)&blksize, sizeof(int));
+
 			//... particle positions ..................................................
-			blksize = 3ul*np_this_file*sizeof(T_store);
-			ofs_.write( (char *)&blksize, sizeof(int) );
-			
-			npleft = nfdm_per_file[ifile];//+nc_per_file[ifile];
-			n2read = std::min(curr_block_buf_size,npleft);
-			
-			iffs1.open( fnx, npcdm, wrote_dm*sizeof(T_store) );
-			iffs2.open( fny, npcdm, wrote_dm*sizeof(T_store) );
-			iffs3.open( fnz, npcdm, wrote_dm*sizeof(T_store) );
-			
-			while( n2read > 0ul )
+			blksize = 3ul * np_this_file * sizeof(T_store);
+			ofs_.write((char *)&blksize, sizeof(int));
+
+			npleft = nfdm_per_file[ifile]; //+nc_per_file[ifile];
+			n2read = std::min(curr_block_buf_size, npleft);
+
+			iffs1.open(fnx, npcdm, wrote_dm * sizeof(T_store));
+			iffs2.open(fny, npcdm, wrote_dm * sizeof(T_store));
+			iffs3.open(fnz, npcdm, wrote_dm * sizeof(T_store));
+
+			while (n2read > 0ul)
 			{
-				iffs1.read( reinterpret_cast<char*>(&tmp1[0]), n2read*sizeof(T_store) );
-				iffs2.read( reinterpret_cast<char*>(&tmp2[0]), n2read*sizeof(T_store) );
-				iffs3.read( reinterpret_cast<char*>(&tmp3[0]), n2read*sizeof(T_store) );
-				
-				for( size_t i=0; i<n2read; ++i )
+				iffs1.read(reinterpret_cast<char *>(&tmp1[0]), n2read * sizeof(T_store));
+				iffs2.read(reinterpret_cast<char *>(&tmp2[0]), n2read * sizeof(T_store));
+				iffs3.read(reinterpret_cast<char *>(&tmp3[0]), n2read * sizeof(T_store));
+
+				for (size_t i = 0; i < n2read; ++i)
 				{
-					adata3.push_back( fmod(tmp1[i]+header_.BoxSize,header_.BoxSize) );
-					adata3.push_back( fmod(tmp2[i]+header_.BoxSize,header_.BoxSize) );
-					adata3.push_back( fmod(tmp3[i]+header_.BoxSize,header_.BoxSize) );
+					adata3.push_back(fmod(tmp1[i] + header_.BoxSize, header_.BoxSize));
+					adata3.push_back(fmod(tmp2[i] + header_.BoxSize, header_.BoxSize));
+					adata3.push_back(fmod(tmp3[i] + header_.BoxSize, header_.BoxSize));
 				}
-				ofs_.write( reinterpret_cast<char*>(&adata3[0]), 3*n2read*sizeof(T_store) );
-				
+				ofs_.write(reinterpret_cast<char *>(&adata3[0]), 3 * n2read * sizeof(T_store));
+
 				adata3.clear();
 				npleft -= n2read;
-				n2read = std::min( curr_block_buf_size,npleft );
+				n2read = std::min(curr_block_buf_size, npleft);
 			}
-			
+
 			iffs1.close();
 			iffs2.close();
 			iffs3.close();
-			
-			if( bbaryons && nfgas_per_file[ifile] > 0ul )
+
+			if (bbaryons && nfgas_per_file[ifile] > 0ul)
 			{
-				
-				iffs1.open( fnbx, npcdm, wrote_gas*sizeof(T_store) );
-				iffs2.open( fnby, npcdm, wrote_gas*sizeof(T_store) );
-				iffs3.open( fnbz, npcdm, wrote_gas*sizeof(T_store) );
-				
+
+				iffs1.open(fnbx, npcdm, wrote_gas * sizeof(T_store));
+				iffs2.open(fnby, npcdm, wrote_gas * sizeof(T_store));
+				iffs3.open(fnbz, npcdm, wrote_gas * sizeof(T_store));
+
 				npleft = nfgas_per_file[ifile];
-				n2read = std::min(curr_block_buf_size,npleft);
-				while( n2read > 0ul )
+				n2read = std::min(curr_block_buf_size, npleft);
+				while (n2read > 0ul)
 				{
-					iffs1.read( reinterpret_cast<char*>(&tmp1[0]), n2read*sizeof(T_store) );
-					iffs2.read( reinterpret_cast<char*>(&tmp2[0]), n2read*sizeof(T_store) );
-					iffs3.read( reinterpret_cast<char*>(&tmp3[0]), n2read*sizeof(T_store) );
-					
-					for( size_t i=0; i<n2read; ++i )
+					iffs1.read(reinterpret_cast<char *>(&tmp1[0]), n2read * sizeof(T_store));
+					iffs2.read(reinterpret_cast<char *>(&tmp2[0]), n2read * sizeof(T_store));
+					iffs3.read(reinterpret_cast<char *>(&tmp3[0]), n2read * sizeof(T_store));
+
+					for (size_t i = 0; i < n2read; ++i)
 					{
-						adata3.push_back( fmod(tmp1[i]+header_.BoxSize,header_.BoxSize) );
-						adata3.push_back( fmod(tmp2[i]+header_.BoxSize,header_.BoxSize) );
-						adata3.push_back( fmod(tmp3[i]+header_.BoxSize,header_.BoxSize) );
+						adata3.push_back(fmod(tmp1[i] + header_.BoxSize, header_.BoxSize));
+						adata3.push_back(fmod(tmp2[i] + header_.BoxSize, header_.BoxSize));
+						adata3.push_back(fmod(tmp3[i] + header_.BoxSize, header_.BoxSize));
 					}
-					ofs_.write( reinterpret_cast<char*>(&adata3[0]), 3*n2read*sizeof(T_store) );
-					
+					ofs_.write(reinterpret_cast<char *>(&adata3[0]), 3 * n2read * sizeof(T_store));
+
 					adata3.clear();
 					npleft -= n2read;
-					n2read = std::min( curr_block_buf_size,npleft );
+					n2read = std::min(curr_block_buf_size, npleft);
 				}
 				iffs1.close();
 				iffs2.close();
 				iffs3.close();
-				
-                
 			}
-			
-			ofs_.write( reinterpret_cast<char*>(&blksize), sizeof(int) );
-			
-			
-			
-			
+
+			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
+
 			//... particle velocities ..................................................
-			blksize = 3ul*np_this_file*sizeof(T_store);
-			ofs_.write( reinterpret_cast<char*>(&blksize), sizeof(int) );
-			
-			iffs1.open( fnvx, npcdm, wrote_dm*sizeof(T_store) );
-			iffs2.open( fnvy, npcdm, wrote_dm*sizeof(T_store) );
-			iffs3.open( fnvz, npcdm, wrote_dm*sizeof(T_store) );
-			
-			npleft = nfdm_per_file[ifile];//+nc_per_file[ifile];
-			n2read = std::min(curr_block_buf_size,npleft);
-			while( n2read > 0ul )
+			blksize = 3ul * np_this_file * sizeof(T_store);
+			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
+
+			iffs1.open(fnvx, npcdm, wrote_dm * sizeof(T_store));
+			iffs2.open(fnvy, npcdm, wrote_dm * sizeof(T_store));
+			iffs3.open(fnvz, npcdm, wrote_dm * sizeof(T_store));
+
+			npleft = nfdm_per_file[ifile]; //+nc_per_file[ifile];
+			n2read = std::min(curr_block_buf_size, npleft);
+			while (n2read > 0ul)
 			{
-				iffs1.read( reinterpret_cast<char*>(&tmp1[0]), n2read*sizeof(T_store) );
-				iffs2.read( reinterpret_cast<char*>(&tmp2[0]), n2read*sizeof(T_store) );
-				iffs3.read( reinterpret_cast<char*>(&tmp3[0]), n2read*sizeof(T_store) );
-				
-				for( size_t i=0; i<n2read; ++i )
+				iffs1.read(reinterpret_cast<char *>(&tmp1[0]), n2read * sizeof(T_store));
+				iffs2.read(reinterpret_cast<char *>(&tmp2[0]), n2read * sizeof(T_store));
+				iffs3.read(reinterpret_cast<char *>(&tmp3[0]), n2read * sizeof(T_store));
+
+				for (size_t i = 0; i < n2read; ++i)
 				{
-					adata3.push_back( tmp1[i] );
-					adata3.push_back( tmp2[i] );
-					adata3.push_back( tmp3[i] );
+					adata3.push_back(tmp1[i]);
+					adata3.push_back(tmp2[i]);
+					adata3.push_back(tmp3[i]);
 				}
-				
-				ofs_.write( reinterpret_cast<char*>(&adata3[0]), 3*n2read*sizeof(T_store) );
-				
+
+				ofs_.write(reinterpret_cast<char *>(&adata3[0]), 3 * n2read * sizeof(T_store));
+
 				adata3.clear();
 				npleft -= n2read;
-				n2read = std::min( curr_block_buf_size,npleft );
+				n2read = std::min(curr_block_buf_size, npleft);
 			}
-			
+
 			iffs1.close();
 			iffs2.close();
 			iffs3.close();
-			
-			if( bbaryons && nfgas_per_file[ifile] > 0ul )
+
+			if (bbaryons && nfgas_per_file[ifile] > 0ul)
 			{
-				iffs1.open( fnbvx, npcdm, wrote_gas*sizeof(T_store) );
-				iffs2.open( fnbvy, npcdm, wrote_gas*sizeof(T_store) );
-				iffs3.open( fnbvz, npcdm, wrote_gas*sizeof(T_store) );
-				
+				iffs1.open(fnbvx, npcdm, wrote_gas * sizeof(T_store));
+				iffs2.open(fnbvy, npcdm, wrote_gas * sizeof(T_store));
+				iffs3.open(fnbvz, npcdm, wrote_gas * sizeof(T_store));
+
 				npleft = nfgas_per_file[ifile];
-				n2read = std::min(curr_block_buf_size,npleft);
-				while( n2read > 0ul )
+				n2read = std::min(curr_block_buf_size, npleft);
+				while (n2read > 0ul)
 				{
-					iffs1.read( reinterpret_cast<char*>(&tmp1[0]), n2read*sizeof(T_store) );
-					iffs2.read( reinterpret_cast<char*>(&tmp2[0]), n2read*sizeof(T_store) );
-					iffs3.read( reinterpret_cast<char*>(&tmp3[0]), n2read*sizeof(T_store) );
-					
-					for( size_t i=0; i<n2read; ++i )
+					iffs1.read(reinterpret_cast<char *>(&tmp1[0]), n2read * sizeof(T_store));
+					iffs2.read(reinterpret_cast<char *>(&tmp2[0]), n2read * sizeof(T_store));
+					iffs3.read(reinterpret_cast<char *>(&tmp3[0]), n2read * sizeof(T_store));
+
+					for (size_t i = 0; i < n2read; ++i)
 					{
-						adata3.push_back( tmp1[i] );
-						adata3.push_back( tmp2[i] );
-						adata3.push_back( tmp3[i] );
+						adata3.push_back(tmp1[i]);
+						adata3.push_back(tmp2[i]);
+						adata3.push_back(tmp3[i]);
 					}
-					
-					ofs_.write( reinterpret_cast<char*>(&adata3[0]), 3*n2read*sizeof(T_store) );
-					
+
+					ofs_.write(reinterpret_cast<char *>(&adata3[0]), 3 * n2read * sizeof(T_store));
+
 					adata3.clear();
 					npleft -= n2read;
-					n2read = std::min( curr_block_buf_size,npleft );
+					n2read = std::min(curr_block_buf_size, npleft);
 				}
-				
+
 				iffs1.close();
 				iffs2.close();
 				iffs3.close();
-				
-				
 			}
-			
-			ofs_.write( reinterpret_cast<char*>(&blksize), sizeof(int) );
-			
+
+			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
+
 			//... particle IDs ..........................................................
 			std::vector<unsigned> short_ids;
-            std::vector<size_t> long_ids;
-            
-            if( bneed_long_ids )
-                long_ids.assign(curr_block_buf_size,0);
+			std::vector<size_t> long_ids;
+
+			if (bneed_long_ids)
+				long_ids.assign(curr_block_buf_size, 0);
 			else
-                short_ids.assign(curr_block_buf_size,0);
-			
-			npleft	= np_this_file;
-			n2read	= std::min(curr_block_buf_size,npleft);
-			blksize = sizeof(unsigned)*np_this_file;
-            
-            if( bneed_long_ids )
-                blksize = sizeof(size_t)*np_this_file;
-			
-			
+				short_ids.assign(curr_block_buf_size, 0);
+
+			npleft = np_this_file;
+			n2read = std::min(curr_block_buf_size, npleft);
+			blksize = sizeof(unsigned) * np_this_file;
+
+			if (bneed_long_ids)
+				blksize = sizeof(size_t) * np_this_file;
+
 			//... generate contiguous IDs and store in file ..
-			ofs_.write( reinterpret_cast<char*>(&blksize), sizeof(int) );
-			while( n2read > 0ul )
+			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
+			while (n2read > 0ul)
 			{
-                if( bneed_long_ids )
-                {
-					for( size_t i=0; i<n2read; ++i )
+				if (bneed_long_ids)
+				{
+					for (size_t i = 0; i < n2read; ++i)
 						long_ids[i] = idcount++;
-					ofs_.write( reinterpret_cast<char*>(&long_ids[0]), n2read*sizeof(size_t) );
-                }else{
-					for( size_t i=0; i<n2read; ++i )
+					ofs_.write(reinterpret_cast<char *>(&long_ids[0]), n2read * sizeof(size_t));
+				}
+				else
+				{
+					for (size_t i = 0; i < n2read; ++i)
 						short_ids[i] = idcount++;
-					ofs_.write( reinterpret_cast<char*>(&short_ids[0]), n2read*sizeof(unsigned) );
-                }
-                npleft -= n2read;
-				n2read = std::min( curr_block_buf_size,npleft );
+					ofs_.write(reinterpret_cast<char *>(&short_ids[0]), n2read * sizeof(unsigned));
+				}
+				npleft -= n2read;
+				n2read = std::min(curr_block_buf_size, npleft);
 			}
-			ofs_.write( reinterpret_cast<char*>(&blksize), sizeof(int) );
-			
-			std::vector<unsigned>().swap( short_ids );
-			std::vector<size_t>().swap( long_ids );
-			
-			
+			ofs_.write(reinterpret_cast<char *>(&blksize), sizeof(int));
+
+			std::vector<unsigned>().swap(short_ids);
+			std::vector<size_t>().swap(long_ids);
+
 			//... particle masses .......................................................
 			// multi-mass not supported here
-			
-			
+
 			ofs_.flush();
 			ofs_.close();
-			
-            wrote_gas       += nfgas_per_file[ifile];
-            wrote_dm        += nfdm_per_file[ifile] + nc_per_file[ifile];
-            wrote_coarse    += nc_per_file[ifile];
-			
-            
+
+			wrote_gas += nfgas_per_file[ifile];
+			wrote_dm += nfdm_per_file[ifile] + nc_per_file[ifile];
+			wrote_coarse += nc_per_file[ifile];
 		}
-        
-        delete[] tmp1;
+
+		delete[] tmp1;
 		delete[] tmp2;
-        delete[] tmp3;
-        
-        remove( fnbx );
-        remove( fnby );
-        remove( fnbz );
-        remove( fnx );
-        remove( fny );
-        remove( fnz );
-        remove( fnbvx );
-        remove( fnbvy );
-        remove( fnbvz );
-        remove( fnvx );
-        remove( fnvy );
-        remove( fnvz );
-        remove( fnm );
+		delete[] tmp3;
+
+		remove(fnbx);
+		remove(fnby);
+		remove(fnbz);
+		remove(fnx);
+		remove(fny);
+		remove(fnz);
+		remove(fnbvx);
+		remove(fnbvy);
+		remove(fnbvz);
+		remove(fnvx);
+		remove(fnvy);
+		remove(fnvz);
+		remove(fnm);
 	}
-	
-	void get_cic_displacement( size_t icoord, const float* ppos, size_t np, float l, const grid_hierarchy& gh, T_store* valp )
+
+	void get_cic_displacement(size_t icoord, const float *ppos, size_t np, float l, const grid_hierarchy &gh, T_store *valp) // valp[ip] = periodically wrapped coordinate icoord: (grid offset + scaled ppos + trilinearly weighted levelmax_ grid value) * BoxSize
 	{
 		size_t N = gh.size(gh.levelmax(), 0);
-		
-		float facconv   = 1.f / l * (float)N/(float)(1ul<<levelmax_);
-		float suboffset = (float)(gh.offset_abs(levelmax_, icoord))/((float)(1ul<<levelmax_));
-		
-		for( size_t ip=0; ip < np; ++ip )
+
+		float facconv = 1.f / l * (float)N / (float)(1ul << levelmax_);
+		float suboffset = (float)(gh.offset_abs(levelmax_, icoord)) / ((float)(1ul << levelmax_));
+
+		for (size_t ip = 0; ip < np; ++ip)
 		{
-			float u,v,w;
-			
-			u = ppos[3*ip+0] / l * (float)N;
-			v = ppos[3*ip+1] / l * (float)N;
-			w = ppos[3*ip+2] / l * (float)N;
-			
-			int i,j,k;
-			
-			i = (((int)u)+N)%N;
-			j = (((int)v)+N)%N;
-			k = (((int)w)+N)%N;
-			
+			float u, v, w;
+
+			u = ppos[3 * ip + 0] / l * (float)N;
+			v = ppos[3 * ip + 1] / l * (float)N;
+			w = ppos[3 * ip + 2] / l * (float)N;
+
+			int i, j, k;
+
+			i = (((int)u) + N) % N;
+			j = (((int)v) + N) % N;
+			k = (((int)w) + N) % N;
+
 			u -= (float)i;
 			v -= (float)j;
 			w -= (float)k;
-			
-			int i1,j1,k1;
-			i1 = (i+1+N)%N;
-			j1 = (j+1+N)%N;
-			k1 = (k+1+N)%N;
-			
-			double f1,f2,f3,f4,f5,f6,f7,f8;
-			
+
+			int i1, j1, k1;
+			i1 = (i + 1 + N) % N;
+			j1 = (j + 1 + N) % N;
+			k1 = (k + 1 + N) % N;
+
+			double f1, f2, f3, f4, f5, f6, f7, f8;
+
 			f1 = (1.f - u) * (1.f - v) * (1.f - w);
 			f2 = (1.f - u) * (1.f - v) * (w);
 			f3 = (1.f - u) * (v) * (1.f - w);
 			f4 = (1.f - u) * (v) * (w);
 			f5 = (u) * (1.f - v) * (1.f - w);
-			f6 = (u) * (1.f - v) * (w); 
+			f6 = (u) * (1.f - v) * (w);
 			f7 = (u) * (v) * (1.f - w);
 			f8 = (u) * (v) * (w);
-			
+
 			float disp = 0.0f;
-			
-			//disp += add_term;
+
+			// disp += add_term;
 			disp += suboffset;
-			disp += ppos[3*ip+icoord] * facconv;
-			
-			disp += f1*(*gh.get_grid(levelmax_))(i,j,k);
-			disp += f2*(*gh.get_grid(levelmax_))(i,j,k1);
-			disp += f3*(*gh.get_grid(levelmax_))(i,j1,k);
-			disp += f4*(*gh.get_grid(levelmax_))(i,j1,k1);
-			disp += f5*(*gh.get_grid(levelmax_))(i1,j,k);
-			disp += f6*(*gh.get_grid(levelmax_))(i1,j,k1);
-			disp += f7*(*gh.get_grid(levelmax_))(i1,j1,k);
-			disp += f8*(*gh.get_grid(levelmax_))(i1,j1,k1);
-			
-			
-			
-			disp = fmodf( (1.0f+disp)*header_.BoxSize, header_.BoxSize );
-			
+			disp += ppos[3 * ip + icoord] * facconv;
+
+			disp += f1 * (*gh.get_grid(levelmax_))(i, j, k);
+			disp += f2 * (*gh.get_grid(levelmax_))(i, j, k1);
+			disp += f3 * (*gh.get_grid(levelmax_))(i, j1, k);
+			disp += f4 * (*gh.get_grid(levelmax_))(i, j1, k1);
+			disp += f5 * (*gh.get_grid(levelmax_))(i1, j, k);
+			disp += f6 * (*gh.get_grid(levelmax_))(i1, j, k1);
+			disp += f7 * (*gh.get_grid(levelmax_))(i1, j1, k);
+			disp += f8 * (*gh.get_grid(levelmax_))(i1, j1, k1);
+
+			disp = fmodf((1.0f + disp) * header_.BoxSize, header_.BoxSize);
+
 			valp[ip] = disp;
-			
 		}
 	}
-	
-	void get_cic_velocity( const float* ppos, size_t np, float l, const grid_hierarchy& gh, T_store* valp )
+
+	void get_cic_velocity(const float *ppos, size_t np, float l, const grid_hierarchy &gh, T_store *valp) // valp[ip] = trilinearly weighted levelmax_ grid value at particle ip, times BoxSize / sqrt(header_.time) (and / 1000 if kpcunits_)
 	{
-		float isqrta = 1.0f/sqrt(header_.time);
-		float vfac = isqrta*header_.BoxSize;
-		
-		if( kpcunits_ )
+		float isqrta = 1.0f / sqrt(header_.time);
+		float vfac = isqrta * header_.BoxSize;
+
+		if (kpcunits_)
 			vfac /= 1000.0;
-		
+
 		size_t N = gh.size(gh.levelmax(), 0);
-		//float facconv   = 1.f / l * (float)N/(float)(1ul<<levelmax_);
-		
-		for( size_t ip=0; ip < np; ++ip )
+		// float facconv   = 1.f / l * (float)N/(float)(1ul<<levelmax_);
+
+		for (size_t ip = 0; ip < np; ++ip)
 		{
-			float u,v,w;
-			
-			u = ppos[3*ip+0] / l * (float)N;
-			v = ppos[3*ip+1] / l * (float)N;
-			w = ppos[3*ip+2] / l * (float)N;
-			
-			int i,j,k;
-			
-			i = (((int)u)+N)%N;
-			j = (((int)v)+N)%N;
-			k = (((int)w)+N)%N;
-			
+			float u, v, w;
+
+			u = ppos[3 * ip + 0] / l * (float)N;
+			v = ppos[3 * ip + 1] / l * (float)N;
+			w = ppos[3 * ip + 2] / l * (float)N;
+
+			int i, j, k;
+
+			i = (((int)u) + N) % N;
+			j = (((int)v) + N) % N;
+			k = (((int)w) + N) % N;
+
 			u -= (float)i;
 			v -= (float)j;
 			w -= (float)k;
-			
-			int i1,j1,k1;
-			i1 = (i+1+N)%N;
-			j1 = (j+1+N)%N;
-			k1 = (k+1+N)%N;
-			
-			double f1,f2,f3,f4,f5,f6,f7,f8;
-			
+
+			int i1, j1, k1;
+			i1 = (i + 1 + N) % N;
+			j1 = (j + 1 + N) % N;
+			k1 = (k + 1 + N) % N;
+
+			double f1, f2, f3, f4, f5, f6, f7, f8;
+
 			f1 = (1.f - u) * (1.f - v) * (1.f - w);
 			f2 = (1.f - u) * (1.f - v) * (w);
 			f3 = (1.f - u) * (v) * (1.f - w);
 			f4 = (1.f - u) * (v) * (w);
 			f5 = (u) * (1.f - v) * (1.f - w);
-			f6 = (u) * (1.f - v) * (w); 
+			f6 = (u) * (1.f - v) * (w);
 			f7 = (u) * (v) * (1.f - w);
 			f8 = (u) * (v) * (w);
-			
+
 			float vel = 0.0f;
-			
-			vel += f1*(*gh.get_grid(levelmax_))(i,j,k);
-			vel += f2*(*gh.get_grid(levelmax_))(i,j,k1);
-			vel += f3*(*gh.get_grid(levelmax_))(i,j1,k);
-			vel += f4*(*gh.get_grid(levelmax_))(i,j1,k1);
-			vel += f5*(*gh.get_grid(levelmax_))(i1,j,k);
-			vel += f6*(*gh.get_grid(levelmax_))(i1,j,k1);
-			vel += f7*(*gh.get_grid(levelmax_))(i1,j1,k);
-			vel += f8*(*gh.get_grid(levelmax_))(i1,j1,k1);
-			
-			vel *= vfac;			
-			
+
+			vel += f1 * (*gh.get_grid(levelmax_))(i, j, k);
+			vel += f2 * (*gh.get_grid(levelmax_))(i, j, k1);
+			vel += f3 * (*gh.get_grid(levelmax_))(i, j1, k);
+			vel += f4 * (*gh.get_grid(levelmax_))(i, j1, k1);
+			vel += f5 * (*gh.get_grid(levelmax_))(i1, j, k);
+			vel += f6 * (*gh.get_grid(levelmax_))(i1, j, k1);
+			vel += f7 * (*gh.get_grid(levelmax_))(i1, j1, k);
+			vel += f8 * (*gh.get_grid(levelmax_))(i1, j1, k1);
+
+			vel *= vfac;
+
 			valp[ip] = vel;
-			
 		}
 	}
-	
-	
+
 public:
-	
 	bool do_baryons_;
 	double omegab_, omegac_, omegam_;
 	double gamma_;
-	
-	gadget2_2comp_output_plugin( config_file& cf )
-	: output_plugin( cf )//, ofs_( fname_.c_str(), std::ios::binary|std::ios::trunc )	
+
+	gadget2_2comp_output_plugin(config_file &cf)
+			: output_plugin(cf) //, ofs_( fname_.c_str(), std::ios::binary|std::ios::trunc )
 	{
-		block_buf_size_ = cf_.get_value_safe<unsigned>("output","gadget_blksize",2*1048576);
-		
+		block_buf_size_ = cf_.get_value_safe<unsigned>("output", "gadget_blksize", 2 * 1048576);
+
 		//... ensure that everyone knows we want to do SPH
-		cf.insert_value("setup","do_SPH","yes");
-		
-		//bbndparticles_  = !cf_.get_value_safe<bool>("output","gadget_nobndpart",false);
-		npartmax_ = 1<<30;
-		
-		nfiles_ = cf.get_value_safe<unsigned>("output","gadget_num_files",1);
-		
-		
-		
+		cf.insert_value("setup", "do_SPH", "yes");
+
+		// bbndparticles_  = !cf_.get_value_safe<bool>("output","gadget_nobndpart",false);
+		npartmax_ = 1 << 30;
+
+		nfiles_ = cf.get_value_safe<unsigned>("output", "gadget_num_files", 1);
+
 		/****************************************/
-		if (nfiles_ > 1 ) 
+		if (nfiles_ > 1)
 		{
-			for( unsigned ifile=0; ifile<nfiles_; ++ifile )
+			for (unsigned ifile = 0; ifile < nfiles_; ++ifile)
 			{
 				char ffname[256];
-				sprintf(ffname,"%s.%d",fname_.c_str(), ifile);
-				ofs_.open(ffname, std::ios::binary|std::ios::trunc );
-				if(!ofs_.good())
-				{	
-					music::elog.Print("gadget-2 output plug-in could not open output file \'%s\' for writing!",ffname);
-					throw std::runtime_error(std::string("gadget-2 output plug-in could not open output file \'")+std::string(ffname)+"\' for writing!\n");
+				snprintf(ffname, sizeof(ffname), "%s.%u", fname_.c_str(), ifile); // bounded write: fname_'s length is not checked against ffname[256]; %u matches unsigned ifile
+				ofs_.open(ffname, std::ios::binary | std::ios::trunc);
+				if (!ofs_.good())
+				{
+					music::elog.Print("gadget-2 output plug-in could not open output file \'%s\' for writing!", ffname);
+					throw std::runtime_error(std::string("gadget-2 output plug-in could not open output file \'") + std::string(ffname) + "\' for writing!\n");
 				}
-				ofs_.close();	
+				ofs_.close();
 			}
-		}else{
-			ofs_.open(fname_.c_str(), std::ios::binary|std::ios::trunc );
-			if(!ofs_.good())
-			{	
-				music::elog.Print("gadget-2 output plug-in could not open output file \'%s\' for writing!",fname_.c_str());
-				throw std::runtime_error(std::string("gadget-2 output plug-in could not open output file \'")+fname_+"\' for writing!\n");
+		}
+		else
+		{
+			ofs_.open(fname_.c_str(), std::ios::binary | std::ios::trunc);
+			if (!ofs_.good())
+			{
+				music::elog.Print("gadget-2 output plug-in could not open output file \'%s\' for writing!", fname_.c_str());
+				throw std::runtime_error(std::string("gadget-2 output plug-in could not open output file \'") + fname_ + "\' for writing!\n");
 			}
 			ofs_.close();
 		}
-		
+
 		bmorethan2bnd_ = false;
-		if( levelmax_ > levelmin_ +1)
+		if (levelmax_ > levelmin_ + 1)
 			bmorethan2bnd_ = true;
 
 		bmultimass_ = true;
-		if( levelmax_ == levelmin_ )
+		if (levelmax_ == levelmin_)
 			bmultimass_ = false;
-			
-		
-		for( int i=0; i<6; ++i )
+
+		for (int i = 0; i < 6; ++i)
 		{
 			header_.npart[i] = 0;
 			header_.npartTotal[i] = 0;
 			header_.npartTotalHighWord[i] = 0;
 			header_.mass[i] = 0.0;
 		}
-		
-		YHe_ = cf.get_value_safe<double>("cosmology","YHe",0.248);
-		gamma_ = cf.get_value_safe<double>("cosmology","gamma",5.0/3.0);
-		
-		do_baryons_ = cf.get_value_safe<bool>("setup","baryons",false);
-		omegab_ = cf.get_value_safe<double>("cosmology","Omega_b",0.045);
-		
+
+		YHe_ = cf.get_value_safe<double>("cosmology", "YHe", 0.248);
+		gamma_ = cf.get_value_safe<double>("cosmology", "gamma", 5.0 / 3.0);
+
+		do_baryons_ = cf.get_value_safe<bool>("setup", "baryons", false);
+		omegab_ = cf.get_value_safe<double>("cosmology", "Omega_b", 0.045);
+
 		//... write displacements in kpc/h rather than Mpc/h?
-		kpcunits_ = cf.get_value_safe<bool>("output","gadget_usekpc",false);
-		
-		do_glass_ = cf.get_value_safe<bool>("output","glass", false);
-		if( do_glass_ )
+		kpcunits_ = cf.get_value_safe<bool>("output", "gadget_usekpc", false);
+
+		do_glass_ = cf.get_value_safe<bool>("output", "glass", false);
+		if (do_glass_)
 		{
 			music::ilog.Print("Will use provided glass rather than Cartesian mesh for particle placement.");
-			
-			fname_glass_cdm_ = cf.get_value<std::string>("output","glass_file_cdm");
-			
-			if( do_baryons_ )
-				fname_glass_baryon_ = fname_glass_cdm_;//cf.get_value<std::string>("output","glass_file_baryon");
+
+			fname_glass_cdm_ = cf.get_value<std::string>("output", "glass_file_cdm");
+
+			if (do_baryons_)
+				fname_glass_baryon_ = fname_glass_cdm_; // cf.get_value<std::string>("output","glass_file_baryon");
 		}
-		
-		
+
 		//... set time ......................................................
-		header_.redshift = cf.get_value<double>("setup","zstart");
-		header_.time = 1.0/(1.0+header_.redshift);
-		
+		header_.redshift = cf.get_value<double>("setup", "zstart");
+		header_.time = 1.0 / (1.0 + header_.redshift);
+
 		//... SF flags
 		header_.flag_sfr = 0;
 		header_.flag_feedback = 0;
 		header_.flag_cooling = 0;
-		
-		//... 
+
+		//...
 		header_.num_files = nfiles_;
-		header_.BoxSize = cf.get_value<double>("setup","boxlength");
-		header_.Omega0 = cf.get_value<double>("cosmology","Omega_m");
-                omegam_ = header_.Omega0;
-                omegac_ = omegam_ - omegab_;
-        
-		header_.OmegaLambda = cf.get_value<double>("cosmology","Omega_L");
-		header_.HubbleParam = cf.get_value<double>("cosmology","H0");
-		
+		header_.BoxSize = cf.get_value<double>("setup", "boxlength");
+		header_.Omega0 = cf.get_value<double>("cosmology", "Omega_m");
+		omegam_ = header_.Omega0;
+		omegac_ = omegam_ - omegab_;
+
+		header_.OmegaLambda = cf.get_value<double>("cosmology", "Omega_L");
+		header_.HubbleParam = cf.get_value<double>("cosmology", "H0");
+
 		header_.flag_stellarage = 0;
 		header_.flag_metals = 0;
-		
-	        	
+
 		header_.flag_entropy_instead_u = 0;
-#ifdef SINGLE_PRECISION
-                header_.flag_doubleprecision = 0;
-#else
-                header_.flag_doubleprecision = 1; 
-#endif
-                std::cout << "header_.flag_doubleprecision " << header_.flag_doubleprecision << "\n";
-                header_.flag_ic_info = 0;
-		
-		if( kpcunits_ )
+
+		header_.flag_doubleprecision = typeid(real_t) == typeid(double) ? 1 : 0;
+		std::cout << "header_.flag_doubleprecision " << header_.flag_doubleprecision << "\n";
+		header_.flag_ic_info = 0;
+
+		if (kpcunits_)
 			header_.BoxSize *= 1000.0;
 	}
-	
-	
-	void write_dm_mass( const grid_hierarchy& gh )
+	//... set the DM masses in the header; with more than two levels, per-particle masses of the coarse leaf cells go to a temporary file
+	void write_dm_mass(const grid_hierarchy &gh)
 	{
 		double rhoc = 27.7519737; // in h^2 1e10 M_sol / Mpc^3
-		
-		if( kpcunits_ )
+
+		if (kpcunits_)
 			rhoc *= 10.0; // in h^2 M_sol / kpc^3
-		
-		if(! do_glass_ )
+
+		if (!do_glass_)
 		{
-			if( !do_baryons_ )
-				header_.mass[1] = header_.Omega0 * rhoc * pow(header_.BoxSize,3.)/pow(2,3*levelmax_);
+			if (!do_baryons_)
+				header_.mass[1] = header_.Omega0 * rhoc * pow(header_.BoxSize, 3.) / pow(2, 3 * levelmax_);
 			else
-				header_.mass[1] = (header_.Omega0-omegab_) * rhoc * pow(header_.BoxSize,3.)/pow(2,3*levelmax_);			
+				header_.mass[1] = (header_.Omega0 - omegab_) * rhoc * pow(header_.BoxSize, 3.) / pow(2, 3 * levelmax_);
 		}
-				
-		if( bmorethan2bnd_ )
+
+		if (bmorethan2bnd_)
 		{
-			unsigned long long npcoarse = gh.count_leaf_cells(gh.levelmin(), gh.levelmax()-1);
+			unsigned long long npcoarse = gh.count_leaf_cells(gh.levelmin(), gh.levelmax() - 1);
 			unsigned long long nwritten = 0;
-			
+
 			std::vector<T_store> temp_dat;
 			temp_dat.reserve(block_buf_size_);
-			
+
 			char temp_fname[256];
-			sprintf( temp_fname, "___ic_temp_%05d.bin", 100*id_dm_mass );
-			std::ofstream ofs_temp( temp_fname, std::ios::binary|std::ios::trunc );
-			
-			unsigned long long blksize = sizeof(T_store)*npcoarse;
-			
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			for( int ilevel=gh.levelmax()-1; ilevel>=(int)gh.levelmin(); --ilevel )
+			sprintf(temp_fname, "___ic_temp_%05d.bin", 100 * id_dm_mass);
+			std::ofstream ofs_temp(temp_fname, std::ios::binary | std::ios::trunc);
+
+			unsigned long long blksize = sizeof(T_store) * npcoarse;
+
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+			for (int ilevel = gh.levelmax() - 1; ilevel >= (int)gh.levelmin(); --ilevel)
 			{
 				double pmass = 0.0;
-				
-				if( !do_baryons_ )
-					pmass = header_.Omega0 * rhoc * pow(header_.BoxSize,3.)/pow(2,3*ilevel);		
+
+				if (!do_baryons_)
+					pmass = header_.Omega0 * rhoc * pow(header_.BoxSize, 3.) / pow(2, 3 * ilevel);
 				else
-					pmass = (header_.Omega0-omegab_) * rhoc * pow(header_.BoxSize,3.)/pow(2,3*ilevel);
-					
-				for( unsigned i=0; i<gh.get_grid(ilevel)->size(0); ++i )
-					for( unsigned j=0; j<gh.get_grid(ilevel)->size(1); ++j )
-						for( unsigned k=0; k<gh.get_grid(ilevel)->size(2); ++k )
-							if( ! gh.is_refined(ilevel,i,j,k) )
+					pmass = (header_.Omega0 - omegab_) * rhoc * pow(header_.BoxSize, 3.) / pow(2, 3 * ilevel);
+
+				for (unsigned i = 0; i < gh.get_grid(ilevel)->size(0); ++i)
+					for (unsigned j = 0; j < gh.get_grid(ilevel)->size(1); ++j)
+						for (unsigned k = 0; k < gh.get_grid(ilevel)->size(2); ++k)
+							if (!gh.is_refined(ilevel, i, j, k))
 							{
-								if( temp_dat.size() <  block_buf_size_ )
-									temp_dat.push_back( pmass );	
+								if (temp_dat.size() < block_buf_size_)
+									temp_dat.push_back(pmass);
 								else
 								{
-									ofs_temp.write( (char*)&temp_dat[0], sizeof(T_store)*block_buf_size_ );	
+									ofs_temp.write((char *)&temp_dat[0], sizeof(T_store) * block_buf_size_);
 									nwritten += block_buf_size_;
 									temp_dat.clear();
-									temp_dat.push_back( pmass );	
+									temp_dat.push_back(pmass);
 								}
 							}
 			}
-			
-			if( temp_dat.size() > 0 )
-			{	
-				ofs_temp.write( (char*)&temp_dat[0], sizeof(T_store)*temp_dat.size() );		
-				nwritten+=temp_dat.size();
+
+			if (temp_dat.size() > 0)
+			{
+				ofs_temp.write((char *)&temp_dat[0], sizeof(T_store) * temp_dat.size());
+				nwritten += temp_dat.size();
 			}
-			
-			if( nwritten != npcoarse )
+
+			if (nwritten != npcoarse)
 				throw std::runtime_error("Internal consistency error while writing temporary file for masses");
-			
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			if( ofs_temp.bad() )
+
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+			if (ofs_temp.bad())
 				throw std::runtime_error("I/O error while writing temporary file for masses");
-			
 		}
-		else if( gh.levelmax() != gh.levelmin() )
+		else if (gh.levelmax() != gh.levelmin())
 		{
-			header_.mass[5] = header_.Omega0 * rhoc * pow(header_.BoxSize,3.)/pow(2,3*levelmin_);
+			header_.mass[5] = header_.Omega0 * rhoc * pow(header_.BoxSize, 3.) / pow(2, 3 * levelmin_);
 		}
 	}
-	
-	
-	void write_dm_position( int coord, const grid_hierarchy& gh )
+	//... write position component 'coord' of all DM particles to a temporary file and record the particle counts in the header
+	void write_dm_position(int coord, const grid_hierarchy &gh)
 	{
 		//... count number of leaf cells ...//
 		unsigned long long npcoarse = 0, npfine = 0;
-		
-		npfine   = gh.count_leaf_cells(gh.levelmax(), gh.levelmax());
-		if( bmultimass_ )
-			npcoarse = gh.count_leaf_cells(gh.levelmin(), gh.levelmax()-1);
-		
-		
-		np_fine_dm_   = npfine;
-        np_fine_gas_  = do_baryons_? npfine : 0ul;
-        np_coarse_dm_ = npcoarse;
-		
+
+		npfine = gh.count_leaf_cells(gh.levelmax(), gh.levelmax());
+		if (bmultimass_)
+			npcoarse = gh.count_leaf_cells(gh.levelmin(), gh.levelmax() - 1);
+
+		np_fine_dm_ = npfine;
+		np_fine_gas_ = do_baryons_ ? npfine : 0ul;
+		np_coarse_dm_ = npcoarse;
+
 		//... determine if we need to shift the coordinates back
 		double *shift = NULL;
-		
-		if( cf_.get_value_safe<bool>("output","shift_back",false ) )
+
+		if (cf_.get_value_safe<bool>("output", "shift_back", false))
 		{
-			if( coord == 0 )
+			if (coord == 0)
 				std::cout << " - gadget2 output plug-in will shift particle positions back...\n";
-			
-			double h = 1.0/(1<<levelmin_);
+
+			double h = 1.0 / (1 << levelmin_);
 			shift = new double[3];
-			shift[0] = -(double)cf_.get_value<int>( "setup", "shift_x" )*h;
-			shift[1] = -(double)cf_.get_value<int>( "setup", "shift_y" )*h;
-			shift[2] = -(double)cf_.get_value<int>( "setup", "shift_z" )*h;
+			shift[0] = -(double)cf_.get_value<int>("setup", "shift_x") * h;
+			shift[1] = -(double)cf_.get_value<int>("setup", "shift_y") * h;
+			shift[2] = -(double)cf_.get_value<int>("setup", "shift_z") * h;
 		}
-		
-		size_t npart = npfine+npcoarse;
+
+		size_t npart = npfine + npcoarse;
 		size_t nwritten = 0;
 		size_t blksize;
-		
+
 		//...
 		header_.npart[1] = npfine;
 		header_.npart[5] = npcoarse;
 		header_.npartTotal[1] = (unsigned)npfine;
 		header_.npartTotal[5] = (unsigned)npcoarse;
-		header_.npartTotalHighWord[1] = (unsigned)(npfine>>32);
-		header_.npartTotalHighWord[5] = (unsigned)(npfine>>32);
-		
-		//header_.num_files = (int)ceil((double)npart/(double)npartmax_);
-		
+		header_.npartTotalHighWord[1] = (unsigned)(npfine >> 32);
+		header_.npartTotalHighWord[5] = (unsigned)(npcoarse >> 32); // upper 32 bits of the coarse count (type 5), matching npartTotal[5] above
+
+		// header_.num_files = (int)ceil((double)npart/(double)npartmax_);
+
 		//... collect displacements and convert to absolute coordinates with correct
 		//... units
 		std::vector<T_store> temp_data;
-		temp_data.reserve( block_buf_size_ );
-		
+		temp_data.reserve(block_buf_size_);
+
 		double xfac = header_.BoxSize;
-		
+
 		char temp_fname[256];
-		sprintf( temp_fname, "___ic_temp_%05d.bin", 100*id_dm_pos+coord );
-		std::ofstream ofs_temp( temp_fname, std::ios::binary|std::ios::trunc );
-        
-        //... if baryons are present, then stagger the two fields
-        if( do_baryons_ && !do_glass_ )
-        {
-			
-			
-            double h = 1. / (1<<gh.levelmax());
-            
-            if( shift == NULL )
-            {
-                shift = new double[3];
-                shift[0] = 0.0;
-                shift[1] = 0.0;
-                shift[2] = 0.0;
-                
-            }
-            shift[0] -= 0.5 * h * omegab_/omegam_;
-            shift[1] -= 0.5 * h * omegab_/omegam_;
-            shift[2] -= 0.5 * h * omegab_/omegam_;
-        }
-		
-		if(! do_glass_ )
+		sprintf(temp_fname, "___ic_temp_%05d.bin", 100 * id_dm_pos + coord);
+		std::ofstream ofs_temp(temp_fname, std::ios::binary | std::ios::trunc);
+
+		//... if baryons are present, then stagger the two fields
+		if (do_baryons_ && !do_glass_)
 		{
-			
-			blksize = sizeof(T_store)*npart;
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			for( int ilevel=gh.levelmax(); ilevel>=(int)gh.levelmin(); --ilevel )
-				for( unsigned i=0; i<gh.get_grid(ilevel)->size(0); ++i )
-					for( unsigned j=0; j<gh.get_grid(ilevel)->size(1); ++j )
-						for( unsigned k=0; k<gh.get_grid(ilevel)->size(2); ++k )
-							if( ! gh.is_refined(ilevel,i,j,k) )
+
+			double h = 1. / (1 << gh.levelmax());
+
+			if (shift == NULL)
+			{
+				shift = new double[3];
+				shift[0] = 0.0;
+				shift[1] = 0.0;
+				shift[2] = 0.0;
+			}
+			shift[0] -= 0.5 * h * omegab_ / omegam_;
+			shift[1] -= 0.5 * h * omegab_ / omegam_;
+			shift[2] -= 0.5 * h * omegab_ / omegam_;
+		}
+
+		if (!do_glass_)
+		{
+
+			blksize = sizeof(T_store) * npart;
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+			for (int ilevel = gh.levelmax(); ilevel >= (int)gh.levelmin(); --ilevel)
+				for (unsigned i = 0; i < gh.get_grid(ilevel)->size(0); ++i)
+					for (unsigned j = 0; j < gh.get_grid(ilevel)->size(1); ++j)
+						for (unsigned k = 0; k < gh.get_grid(ilevel)->size(2); ++k)
+							if (!gh.is_refined(ilevel, i, j, k))
 							{
 								double xx[3];
 								gh.cell_pos(ilevel, i, j, k, xx);
-								if( shift != NULL )
+								if (shift != NULL)
 									xx[coord] += shift[coord];
-								
-								xx[coord] = fmod( (xx[coord]+(*gh.get_grid(ilevel))(i,j,k))*xfac + header_.BoxSize, header_.BoxSize );
-								
-								if( temp_data.size() < block_buf_size_ )
-									temp_data.push_back( xx[coord] );
+
+								xx[coord] = fmod((xx[coord] + (*gh.get_grid(ilevel))(i, j, k)) * xfac + header_.BoxSize, header_.BoxSize);
+
+								if (temp_data.size() < block_buf_size_)
+									temp_data.push_back(xx[coord]);
 								else
 								{
-									ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*block_buf_size_ );
+									ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * block_buf_size_);
 									nwritten += block_buf_size_;
 									temp_data.clear();
-									temp_data.push_back( xx[coord] );
+									temp_data.push_back(xx[coord]);
 								}
 							}
 		}
 		else
 		{
-			
-			std::ifstream ofg( fname_glass_cdm_.c_str(), std::ios::binary );
-			
-			if( !ofg.good() )
-				music::elog.Print("could not open glass input file \'%s\'",fname_glass_cdm_.c_str());
-			
+
+			std::ifstream ofg(fname_glass_cdm_.c_str(), std::ios::binary);
+
+			if (!ofg.good())
+				music::elog.Print("could not open glass input file \'%s\'", fname_glass_cdm_.c_str());
+
 			io_header glasshead;
 			unsigned blksz;
-			
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			assert( blksz == sizeof(io_header) );
-			
-			ofg.read( reinterpret_cast<char*>(&glasshead), sizeof( io_header ) );
-			
-			//size_t nreq = gh.size(gh.levelmax(), 0)*gh.size(gh.levelmax(), 1)*gh.size(gh.levelmax(), 2);
+
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			assert(blksz == sizeof(io_header));
+
+			ofg.read(reinterpret_cast<char *>(&glasshead), sizeof(io_header));
+
+			// size_t nreq = gh.size(gh.levelmax(), 0)*gh.size(gh.levelmax(), 1)*gh.size(gh.levelmax(), 2);
 			/*if( nreq != (size_t)glasshead.npart[1] )
 			{
 				music::elog.Print("glass file contains %d particles, but should contain %ld",glasshead.npart[1],nreq);
 				throw std::runtime_error("glass file does not contain the right amount of particles");
 			}*/
-			
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			//assert( blksz == glasshead.npart[1]*sizeof(float)*3 );
-			
-			
+
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			// assert( blksz == glasshead.npart[1]*sizeof(float)*3 );
+
 			float lglass = glasshead.BoxSize;
-			
-			
-			blksize = sizeof(T_store)*glasshead.npart[1];
-			//ofs_temp.write( (char *)&blksize, sizeof(int) );
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-						
+
+			blksize = sizeof(T_store) * glasshead.npart[1];
+			// ofs_temp.write( (char *)&blksize, sizeof(int) );
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
 			header_.npart[1] = glasshead.npart[1];
 			header_.npartTotal[1] = glasshead.npartTotal[1];
 			header_.npartTotalHighWord[1] = 0;
 
 			double rhoc = 27.7519737;
-			if( kpcunits_ )
+			if (kpcunits_)
 				rhoc *= 10.0; // in h^2 M_sol / kpc^3
-			
-            if( do_baryons_ )
-                header_.mass[1] = omegac_ * rhoc * pow(header_.BoxSize,3.)/(glasshead.npart[1]);
-			else
-                header_.mass[1] = omegam_ * rhoc * pow(header_.BoxSize,3.)/(glasshead.npart[1]);
-            
-			// read glass, do interpolation and write
-			size_t npartdone=0;
-			size_t npinter = glasshead.npart[1];
-			
-			blksize = sizeof(T_store)*npinter;
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			float *pos_tmp = new float[3*block_buf_size_];
-			temp_data.assign( block_buf_size_, 0.0 );
-			
-			while( npartdone < npinter )
-			{
-				size_t npart2read = std::min(npinter-npartdone,block_buf_size_);
-				
-				ofg.read( reinterpret_cast<char*>(&pos_tmp[0]), npart2read*sizeof(float)*3 );
-				get_cic_displacement( coord, pos_tmp, npart2read, lglass,  gh, &temp_data[0] );
-				ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*npart2read );
 
-                        //        std::cout << "npart2read " << npart2read << "\n";
-                        //        std::cout << "pos temp " << temp_data[0] << " " << temp_data[1] << " " << temp_data[2] << "\n";
-                        //        std::cout << "pos temp " << temp_data[npart2read-1] << " " << temp_data[npart2read-2] << " " << temp_data[npart2read-3] <<"\n";
-				
+			if (do_baryons_)
+				header_.mass[1] = omegac_ * rhoc * pow(header_.BoxSize, 3.) / (glasshead.npart[1]);
+			else
+				header_.mass[1] = omegam_ * rhoc * pow(header_.BoxSize, 3.) / (glasshead.npart[1]);
+
+			// read glass, do interpolation and write
+			size_t npartdone = 0;
+			size_t npinter = glasshead.npart[1];
+
+			blksize = sizeof(T_store) * npinter;
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+			float *pos_tmp = new float[3 * block_buf_size_];
+			temp_data.assign(block_buf_size_, 0.0);
+
+			while (npartdone < npinter)
+			{
+				size_t npart2read = std::min(npinter - npartdone, block_buf_size_);
+
+				ofg.read(reinterpret_cast<char *>(&pos_tmp[0]), npart2read * sizeof(float) * 3);
+				get_cic_displacement(coord, pos_tmp, npart2read, lglass, gh, &temp_data[0]);
+				ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * npart2read);
+
+				//        std::cout << "npart2read " << npart2read << "\n";
+				//        std::cout << "pos temp " << temp_data[0] << " " << temp_data[1] << " " << temp_data[2] << "\n";
+				//        std::cout << "pos temp " << temp_data[npart2read-1] << " " << temp_data[npart2read-2] << " " << temp_data[npart2read-3] <<"\n";
+
 				npartdone += npart2read;
 				nwritten += npart2read;
-
 			}
 
 			delete[] pos_tmp;
 			temp_data.clear();
-								
+
 			// do all lower levels with standard cartesian grid
-			for( int ilevel=gh.levelmax()-1; ilevel>=(int)gh.levelmin(); --ilevel )
-				for( unsigned i=0; i<gh.get_grid(ilevel)->size(0); ++i )
-					for( unsigned j=0; j<gh.get_grid(ilevel)->size(1); ++j )
-						for( unsigned k=0; k<gh.get_grid(ilevel)->size(2); ++k )
-							if( ! gh.is_refined(ilevel,i,j,k) )
+			for (int ilevel = gh.levelmax() - 1; ilevel >= (int)gh.levelmin(); --ilevel)
+				for (unsigned i = 0; i < gh.get_grid(ilevel)->size(0); ++i)
+					for (unsigned j = 0; j < gh.get_grid(ilevel)->size(1); ++j)
+						for (unsigned k = 0; k < gh.get_grid(ilevel)->size(2); ++k)
+							if (!gh.is_refined(ilevel, i, j, k))
 							{
 								double xx[3];
 								gh.cell_pos(ilevel, i, j, k, xx);
-								if( shift != NULL )
+								if (shift != NULL)
 									xx[coord] += shift[coord];
-								
-								xx[coord] = fmod( (xx[coord]+(*gh.get_grid(ilevel))(i,j,k))*xfac + header_.BoxSize, header_.BoxSize );
-								
-								if( temp_data.size() < block_buf_size_ )
-									temp_data.push_back( xx[coord] );
+
+								xx[coord] = fmod((xx[coord] + (*gh.get_grid(ilevel))(i, j, k)) * xfac + header_.BoxSize, header_.BoxSize);
+
+								if (temp_data.size() < block_buf_size_)
+									temp_data.push_back(xx[coord]);
 								else
 								{
-									ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*block_buf_size_ );
+									ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * block_buf_size_);
 									nwritten += block_buf_size_;
 									temp_data.clear();
-									temp_data.push_back( xx[coord] );
+									temp_data.push_back(xx[coord]);
 								}
 							}
-			
 		}
-		
-		
-		
-		if( temp_data.size() > 0 )
-		{	
-			ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*temp_data.size() );
+
+		if (temp_data.size() > 0)
+		{
+			ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * temp_data.size());
 			nwritten += temp_data.size();
 		}
-		
-		if( nwritten != npart )
+
+		if (nwritten != npart)
 			throw std::runtime_error("Internal consistency error while writing temporary file for positions");
 
 		//... dump to temporary file
-		ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-		
-		if( ofs_temp.bad() )
+		ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+		if (ofs_temp.bad())
 			throw std::runtime_error("I/O error while writing temporary file for positions");
-		
+
 		ofs_temp.close();
-				
-		if( shift != NULL )
+
+		if (shift != NULL)
 			delete[] shift;
-		
 	}
-	
-	void write_dm_velocity( int coord, const grid_hierarchy& gh )
+	//... write velocity component 'coord' of all DM particles to a temporary file; grid values are scaled by vfac = BoxSize / sqrt(header_.time)
+	void write_dm_velocity(int coord, const grid_hierarchy &gh)
 	{
 		//... count number of leaf cells ...//
 		size_t npcoarse = 0, npfine = 0;
-		
-		npfine   = gh.count_leaf_cells(gh.levelmax(), gh.levelmax());
-		if( bmultimass_ )
-			npcoarse = gh.count_leaf_cells(gh.levelmin(), gh.levelmax()-1);
-		
+
+		npfine = gh.count_leaf_cells(gh.levelmax(), gh.levelmax());
+		if (bmultimass_)
+			npcoarse = gh.count_leaf_cells(gh.levelmin(), gh.levelmax() - 1);
+
 		header_.npart[1] = npfine;
 		header_.npart[5] = npcoarse;
 		header_.npartTotal[1] = npfine;
 		header_.npartTotal[5] = npcoarse;
 		header_.npartTotalHighWord[1] = 0;
 		header_.npartTotalHighWord[5] = 0;
-		
+
 		//... collect displacements and convert to absolute coordinates with correct
 		//... units
 		std::vector<T_store> temp_data;
-		temp_data.reserve( block_buf_size_ );
-		
-		float isqrta = 1.0f/sqrt(header_.time);
-		float vfac = isqrta*header_.BoxSize;
-		
-		if( kpcunits_ )
+		temp_data.reserve(block_buf_size_);
+
+		float isqrta = 1.0f / sqrt(header_.time);
+		float vfac = isqrta * header_.BoxSize;
+
+		if (kpcunits_)
 			vfac /= 1000.0;
-		
-		unsigned npart = npfine+npcoarse;
+
+		unsigned npart = npfine + npcoarse;
 		unsigned nwritten = 0;
 		unsigned long long blksize;
-		
+
 		char temp_fname[256];
-		sprintf( temp_fname, "___ic_temp_%05d.bin", 100*id_dm_vel+coord );
-		std::ofstream ofs_temp( temp_fname, std::ios::binary|std::ios::trunc );
-		
-		
-		if( !do_glass_ )
+		sprintf(temp_fname, "___ic_temp_%05d.bin", 100 * id_dm_vel + coord);
+		std::ofstream ofs_temp(temp_fname, std::ios::binary | std::ios::trunc);
+
+		if (!do_glass_)
 		{
-			
-			blksize = sizeof(T_store)*npart;
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			for( int ilevel=levelmax_; ilevel>=(int)levelmin_; --ilevel )
-				for( unsigned i=0; i<gh.get_grid(ilevel)->size(0); ++i )
-					for( unsigned j=0; j<gh.get_grid(ilevel)->size(1); ++j )
-						for( unsigned k=0; k<gh.get_grid(ilevel)->size(2); ++k )
-							if( ! gh.is_refined(ilevel,i,j,k) )
-							{	
-								if( temp_data.size() < block_buf_size_ )
-									temp_data.push_back( (*gh.get_grid(ilevel))(i,j,k) * vfac );
-								else 
+
+			blksize = sizeof(T_store) * npart;
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+			for (int ilevel = levelmax_; ilevel >= (int)levelmin_; --ilevel)
+				for (unsigned i = 0; i < gh.get_grid(ilevel)->size(0); ++i)
+					for (unsigned j = 0; j < gh.get_grid(ilevel)->size(1); ++j)
+						for (unsigned k = 0; k < gh.get_grid(ilevel)->size(2); ++k)
+							if (!gh.is_refined(ilevel, i, j, k))
+							{
+								if (temp_data.size() < block_buf_size_)
+									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
+								else
 								{
-									ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*block_buf_size_ );
+									ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * block_buf_size_);
 									nwritten += block_buf_size_;
 									temp_data.clear();
-									temp_data.push_back( (*gh.get_grid(ilevel))(i,j,k) * vfac );
+									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
 								}
-
 							}
 		}
 		else
 		{
-			
-			std::ifstream ofg( fname_glass_cdm_.c_str(), std::ios::binary );
-			
-			if( !ofg.good() )
-				music::elog.Print("could not open glass input file \'%s\'",fname_glass_cdm_.c_str());
-			
+
+			std::ifstream ofg(fname_glass_cdm_.c_str(), std::ios::binary);
+
+			if (!ofg.good())
+				music::elog.Print("could not open glass input file \'%s\'", fname_glass_cdm_.c_str());
+
 			io_header glasshead;
 			unsigned blksz;
-			
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			assert( blksz == sizeof(io_header) );
-			
-			ofg.read( reinterpret_cast<char*>(&glasshead), sizeof( io_header ) );
-			
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			//assert( blksz == glasshead.npart[1]*sizeof(float)*3 );
-			
+
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			assert(blksz == sizeof(io_header));
+
+			ofg.read(reinterpret_cast<char *>(&glasshead), sizeof(io_header));
+
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			// assert( blksz == glasshead.npart[1]*sizeof(float)*3 );
+
 			header_.npart[1] = glasshead.npart[1];
 			header_.npartTotal[1] = glasshead.npartTotal[1];
 			header_.npartTotalHighWord[1] = glasshead.npartTotalHighWord[1];
 			float lglass = glasshead.BoxSize;
-			
+
 			// read glass, do interpolation and write
-			size_t npartdone=0;
+			size_t npartdone = 0;
 			size_t npinter = glasshead.npart[1];
-			
-			blksize = sizeof(T_store)*npinter;
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			float *pos_tmp = new float[3*block_buf_size_];
-			temp_data.assign( block_buf_size_, 0.0 );
-			
-			while( npartdone < npinter )
+
+			blksize = sizeof(T_store) * npinter;
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+			float *pos_tmp = new float[3 * block_buf_size_];
+			temp_data.assign(block_buf_size_, 0.0);
+
+			while (npartdone < npinter)
 			{
-				size_t npart2read = std::min(npinter-npartdone,block_buf_size_);
-				
-				ofg.read( reinterpret_cast<char*>(&pos_tmp[0]), npart2read*sizeof(float)*3 );
-				get_cic_velocity( pos_tmp, npart2read, lglass,  gh, &temp_data[0] );
-				ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*npart2read );
-				
+				size_t npart2read = std::min(npinter - npartdone, block_buf_size_);
+
+				ofg.read(reinterpret_cast<char *>(&pos_tmp[0]), npart2read * sizeof(float) * 3);
+				get_cic_velocity(pos_tmp, npart2read, lglass, gh, &temp_data[0]);
+				ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * npart2read);
+
 				npartdone += npart2read;
 				nwritten += npart2read;
-				
 			}
-			
+
 			delete[] pos_tmp;
 			temp_data.clear();
-			
-			for( int ilevel=levelmax_-1; ilevel>=(int)levelmin_; --ilevel )
-				for( unsigned i=0; i<gh.get_grid(ilevel)->size(0); ++i )
-					for( unsigned j=0; j<gh.get_grid(ilevel)->size(1); ++j )
-						for( unsigned k=0; k<gh.get_grid(ilevel)->size(2); ++k )
-							if( ! gh.is_refined(ilevel,i,j,k) )
-							{	
-								if( temp_data.size() < block_buf_size_ )
-									temp_data.push_back( (*gh.get_grid(ilevel))(i,j,k) * vfac );
-								else 
+
+			for (int ilevel = levelmax_ - 1; ilevel >= (int)levelmin_; --ilevel)
+				for (unsigned i = 0; i < gh.get_grid(ilevel)->size(0); ++i)
+					for (unsigned j = 0; j < gh.get_grid(ilevel)->size(1); ++j)
+						for (unsigned k = 0; k < gh.get_grid(ilevel)->size(2); ++k)
+							if (!gh.is_refined(ilevel, i, j, k))
+							{
+								if (temp_data.size() < block_buf_size_)
+									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
+								else
 								{
-									ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*block_buf_size_ );
+									ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * block_buf_size_);
 									nwritten += block_buf_size_;
 									temp_data.clear();
-									temp_data.push_back( (*gh.get_grid(ilevel))(i,j,k) * vfac );
+									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
 								}
-								
 							}
-			
-			
 		}
-		
-		if( temp_data.size() > 0 )
-		{	
-			ofs_temp.write( (char*)&temp_data[0], temp_data.size()*sizeof(T_store) );
+
+		if (temp_data.size() > 0)
+		{
+			ofs_temp.write((char *)&temp_data[0], temp_data.size() * sizeof(T_store));
 			nwritten += temp_data.size();
 		}
-		
-		if( nwritten != npart )
+
+		if (nwritten != npart)
 			throw std::runtime_error("Internal consistency error while writing temporary file for velocities");
-		
-		ofs_temp.write( (char *)&blksize, sizeof(int) );
-		
-		if( ofs_temp.bad() )
+
+		ofs_temp.write((char *)&blksize, sizeof(unsigned long long)); // closing record marker: same width as the opening one and as in the mass/position writers
+
+		if (ofs_temp.bad())
 			throw std::runtime_error("I/O error while writing temporary file for velocities");
-		
+
 		ofs_temp.close();
 	}
-	
-	void write_dm_density( const grid_hierarchy& gh )
+
+	void write_dm_density(const grid_hierarchy &gh)
 	{
 		//... we don't care about DM density for Gadget
 	}
-	
-	void write_dm_potential( const grid_hierarchy& gh )
-	{ }
-	
-	void write_gas_potential( const grid_hierarchy& gh )
-	{ }
-	
-	
-	
+
+	void write_dm_potential(const grid_hierarchy &gh)
+	{
+	}
+
+	void write_gas_potential(const grid_hierarchy &gh)
+	{
+	}
+
 	//... write data for gas -- don't do this
-	void write_gas_velocity( int coord, const grid_hierarchy& gh )
-	{	
+	void write_gas_velocity(int coord, const grid_hierarchy &gh)
+	{
 		//... count number of leaf cells ...//
 		size_t npcoarse = 0, npfine = 0;
-		
-		npfine   = gh.count_leaf_cells(gh.levelmax(), gh.levelmax());
-		
+
+		npfine = gh.count_leaf_cells(gh.levelmax(), gh.levelmax());
+
 		header_.npart[2] = npfine;
 		header_.npartTotal[2] = (unsigned)npfine;
-		header_.npartTotalHighWord[2] = (unsigned)(npfine>>32);
-		
+		header_.npartTotalHighWord[2] = (unsigned)(npfine >> 32);
+
 		//... collect displacements and convert to absolute coordinates with correct
 		//... units
 		std::vector<T_store> temp_data;
-		temp_data.reserve( block_buf_size_ );
-		
-		float isqrta = 1.0f/sqrt(header_.time);
-		float vfac = isqrta*header_.BoxSize;
-		
-		if( kpcunits_ )
+		temp_data.reserve(block_buf_size_);
+
+		float isqrta = 1.0f / sqrt(header_.time);
+		float vfac = isqrta * header_.BoxSize;
+
+		if (kpcunits_)
 			vfac /= 1000.0;
-		
-		unsigned npart = npfine+npcoarse;
+
+		unsigned npart = npfine + npcoarse;
 		unsigned nwritten = 0;
-		
+
 		char temp_fname[256];
-		sprintf( temp_fname, "___ic_temp_%05d.bin", 100*id_gas_vel+coord );
-		std::ofstream ofs_temp( temp_fname, std::ios::binary|std::ios::trunc );
+		sprintf(temp_fname, "___ic_temp_%05d.bin", 100 * id_gas_vel + coord);
+		std::ofstream ofs_temp(temp_fname, std::ios::binary | std::ios::trunc);
 
 		unsigned long long blksize;
-		
-		
-		if(!do_glass_)
+
+		if (!do_glass_)
 		{
-			blksize = sizeof(T_store)*npart;
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			
+			blksize = sizeof(T_store) * npart;
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
 			const unsigned ilevel = gh.levelmax();
-			const unsigned 
-			nx = gh.get_grid(ilevel)->size(0),
-			ny = gh.get_grid(ilevel)->size(1),
-			nz = gh.get_grid(ilevel)->size(2);
-			
-			for( unsigned i=0; i<nx; ++i )
-				for( unsigned j=0; j<ny; ++j )
-					for( unsigned k=0; k<nz; ++k )
-					{	
-						double v = (*gh.get_grid(ilevel))(i,j,k);
-						
-						if( temp_data.size() < block_buf_size_ )
-							temp_data.push_back( v * vfac );
-						else 
+			const unsigned
+					nx = gh.get_grid(ilevel)->size(0),
+					ny = gh.get_grid(ilevel)->size(1),
+					nz = gh.get_grid(ilevel)->size(2);
+
+			for (unsigned i = 0; i < nx; ++i)
+				for (unsigned j = 0; j < ny; ++j)
+					for (unsigned k = 0; k < nz; ++k)
+					{
+						double v = (*gh.get_grid(ilevel))(i, j, k);
+
+						if (temp_data.size() < block_buf_size_)
+							temp_data.push_back(v * vfac);
+						else
 						{
-							ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*block_buf_size_ );
+							ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * block_buf_size_);
 							nwritten += block_buf_size_;
 							temp_data.clear();
-							temp_data.push_back( v * vfac );
+							temp_data.push_back(v * vfac);
 						}
-						
 					}
-		}else{
-			
-			std::ifstream ofg( fname_glass_baryon_.c_str(), std::ios::binary );
-			
-			if( !ofg.good() )
-				music::elog.Print("could not open glass input file \'%s\'",fname_glass_cdm_.c_str());
-			
+		}
+		else
+		{
+
+			std::ifstream ofg(fname_glass_baryon_.c_str(), std::ios::binary);
+
+			if (!ofg.good())
+				music::elog.Print("could not open glass input file \'%s\'", fname_glass_cdm_.c_str());
+
 			io_header glasshead;
 			unsigned blksz;
-			
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			assert( blksz == sizeof(io_header) );
-			
-			ofg.read( reinterpret_cast<char*>(&glasshead), sizeof( io_header ) );
-			
-			//size_t nreq = gh.size(gh.levelmax(), 0)*gh.size(gh.levelmax(), 1)*gh.size(gh.levelmax(), 2);
+
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			assert(blksz == sizeof(io_header));
+
+			ofg.read(reinterpret_cast<char *>(&glasshead), sizeof(io_header));
+
+			// size_t nreq = gh.size(gh.levelmax(), 0)*gh.size(gh.levelmax(), 1)*gh.size(gh.levelmax(), 2);
 			/*if( nreq != (size_t)glasshead.npart[1] )
 			{
 				music::elog.Print("glass file contains %d particles, but should contain %ld",glasshead.npart[1],nreq);
 				throw std::runtime_error("glass file does not contain the right amount of particles");
 			}*/
-			
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			//assert( blksz == (glasshead.npart[1]+glasshead.npart[2])*sizeof(float)*3 );
-			//ofg.seekg( sizeof(float)*3*glasshead.npart[1], std::ios_base::cur );
-			
+
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			// assert( blksz == (glasshead.npart[1]+glasshead.npart[2])*sizeof(float)*3 );
+			// ofg.seekg( sizeof(float)*3*glasshead.npart[1], std::ios_base::cur );
+
 			// do the highest level with the glass
 			float lglass = glasshead.BoxSize;
-			
+
 			header_.npart[2] = glasshead.npart[2];
 			header_.npartTotal[2] = glasshead.npartTotal[2];
 			header_.npartTotalHighWord[2] = glasshead.npartTotalHighWord[2];
-			
+
 			// read glass, do interpolation and write
-			size_t npartdone=0;
+			size_t npartdone = 0;
 			size_t npinter = (size_t)glasshead.npart[2];
-			
-			
-			blksize = sizeof(T_store)*npinter;
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			float *pos_tmp = new float[3*block_buf_size_];
-			temp_data.assign( block_buf_size_, 0.0 );
-			
-			while( npartdone < npinter )
+
+			blksize = sizeof(T_store) * npinter;
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+			float *pos_tmp = new float[3 * block_buf_size_];
+			temp_data.assign(block_buf_size_, 0.0);
+
+			while (npartdone < npinter)
 			{
-				size_t npart2read = std::min(npinter-npartdone,block_buf_size_);
-				
-				ofg.read( reinterpret_cast<char*>(&pos_tmp[0]), npart2read*sizeof(float)*3 );
-				get_cic_velocity( pos_tmp, npart2read, lglass,  gh, &temp_data[0] );
-				ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*npart2read );
-				
+				size_t npart2read = std::min(npinter - npartdone, block_buf_size_);
+
+				ofg.read(reinterpret_cast<char *>(&pos_tmp[0]), npart2read * sizeof(float) * 3);
+				get_cic_velocity(pos_tmp, npart2read, lglass, gh, &temp_data[0]);
+				ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * npart2read);
+
 				npartdone += npart2read;
 				nwritten += npart2read;
 			}
-			
+
 			delete[] pos_tmp;
 			temp_data.clear();
-			
-			for( int ilevel=levelmax_-1; ilevel>=(int)levelmin_; --ilevel )
-				for( unsigned i=0; i<gh.get_grid(ilevel)->size(0); ++i )
-					for( unsigned j=0; j<gh.get_grid(ilevel)->size(1); ++j )
-						for( unsigned k=0; k<gh.get_grid(ilevel)->size(2); ++k )
-							if( ! gh.is_refined(ilevel,i,j,k) )
-							{	
-								if( temp_data.size() < block_buf_size_ )
-									temp_data.push_back( (*gh.get_grid(ilevel))(i,j,k) * vfac );
-								else 
+
+			for (int ilevel = levelmax_ - 1; ilevel >= (int)levelmin_; --ilevel)
+				for (unsigned i = 0; i < gh.get_grid(ilevel)->size(0); ++i)
+					for (unsigned j = 0; j < gh.get_grid(ilevel)->size(1); ++j)
+						for (unsigned k = 0; k < gh.get_grid(ilevel)->size(2); ++k)
+							if (!gh.is_refined(ilevel, i, j, k))
+							{
+								if (temp_data.size() < block_buf_size_)
+									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
+								else
 								{
-									ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*block_buf_size_ );
+									ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * block_buf_size_);
 									nwritten += block_buf_size_;
 									temp_data.clear();
-									temp_data.push_back( (*gh.get_grid(ilevel))(i,j,k) * vfac );
+									temp_data.push_back((*gh.get_grid(ilevel))(i, j, k) * vfac);
 								}
-								
 							}
 		}
-			
-		if( temp_data.size() > 0 )
-		{	
-			ofs_temp.write( (char*)&temp_data[0], temp_data.size()*sizeof(T_store) );
+
+		if (temp_data.size() > 0)
+		{
+			ofs_temp.write((char *)&temp_data[0], temp_data.size() * sizeof(T_store));
 			nwritten += temp_data.size();
 		}
-		
-		if( nwritten != npart )
+
+		if (nwritten != npart)
 			throw std::runtime_error("Internal consistency error while writing temporary file for gas velocities");
-		
-		ofs_temp.write( (char *)&blksize, sizeof(int) );
-		
-		if( ofs_temp.bad() )
+
+		ofs_temp.write((char *)&blksize, sizeof(int));
+
+		if (ofs_temp.bad())
 			throw std::runtime_error("I/O error while writing temporary file for gas velocities");
-		
+
 		ofs_temp.close();
 	}
-	
-	
+
 	//... write only for fine level
-	void write_gas_position( int coord, const grid_hierarchy& gh )
-	{	
+	void write_gas_position(int coord, const grid_hierarchy &gh)
+	{
 		//... count number of leaf cells ...//
 		unsigned long long npfine = 0;
-		
-		npfine   = gh.count_leaf_cells(gh.levelmax(), gh.levelmax());
-		
+
+		npfine = gh.count_leaf_cells(gh.levelmax(), gh.levelmax());
+
 		//... determine if we need to shift the coordinates back
 		double *shift = NULL;
-		
-		if( cf_.get_value_safe<bool>("output","shift_back",false ) )
+
+		if (cf_.get_value_safe<bool>("output", "shift_back", false))
 		{
-			if( coord == 0 )
+			if (coord == 0)
 				std::cout << " - gadget2 output plug-in will shift particle positions back...\n";
-			
-			double h = 1.0/(1<<levelmin_);
+
+			double h = 1.0 / (1 << levelmin_);
 			shift = new double[3];
-			shift[0] = -(double)cf_.get_value<int>( "setup", "shift_x" )*h;
-			shift[1] = -(double)cf_.get_value<int>( "setup", "shift_y" )*h;
-			shift[2] = -(double)cf_.get_value<int>( "setup", "shift_z" )*h;
+			shift[0] = -(double)cf_.get_value<int>("setup", "shift_x") * h;
+			shift[1] = -(double)cf_.get_value<int>("setup", "shift_y") * h;
+			shift[2] = -(double)cf_.get_value<int>("setup", "shift_z") * h;
 		}
-		
+
 		unsigned long long npart = npfine;
 		unsigned long long nwritten = 0;
-		
+
 		//...
 		header_.npart[2] = npfine;
 		header_.npartTotal[2] = (unsigned)npfine;
-		header_.npartTotalHighWord[2] = (unsigned)(npfine>>32);
-		
-		//header_.num_files = (int)ceil((double)npart/(double)npartmax_);
-		
+		header_.npartTotalHighWord[2] = (unsigned)(npfine >> 32);
+
+		// header_.num_files = (int)ceil((double)npart/(double)npartmax_);
+
 		//... collect displacements and convert to absolute coordinates with correct
 		//... units
 		std::vector<T_store> temp_data;
-		temp_data.reserve( block_buf_size_ );
-		
-		
+		temp_data.reserve(block_buf_size_);
+
 		char temp_fname[256];
-		sprintf( temp_fname, "___ic_temp_%05d.bin", 100*id_gas_pos+coord );
-		std::ofstream ofs_temp( temp_fname, std::ios::binary|std::ios::trunc );
-		
+		sprintf(temp_fname, "___ic_temp_%05d.bin", 100 * id_gas_pos + coord);
+		std::ofstream ofs_temp(temp_fname, std::ios::binary | std::ios::trunc);
+
 		unsigned long long blksize;
-		
+
 		double xfac = header_.BoxSize;
-        
-        //... shift particle positions (this has to be done as the same shift
-        //... is used when computing the convolution kernel for SPH baryons)
-        if( do_baryons_ )
-        {
-            double h = 1. / (1<<gh.levelmax());
-            
-            if( shift == NULL )
-            {
-                shift = new double[3];
-                shift[0] = 0.0;
-                shift[1] = 0.0;
-                shift[2] = 0.0;
-                
-            }
-            shift[0] += 0.5 * h * omegac_/omegam_;
-            shift[1] += 0.5 * h * omegac_/omegam_;
-            shift[2] += 0.5 * h * omegac_/omegam_;
-        }
-		
-		//... only do finest grid
-		if( !do_glass_ )
+
+		//... shift particle positions (this has to be done as the same shift
+		//... is used when computing the convolution kernel for SPH baryons)
+		if (do_baryons_)
 		{
-			
-			blksize = sizeof(T_store)*npart;
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
+			double h = 1. / (1 << gh.levelmax());
+
+			if (shift == NULL)
+			{
+				shift = new double[3];
+				shift[0] = 0.0;
+				shift[1] = 0.0;
+				shift[2] = 0.0;
+			}
+			shift[0] += 0.5 * h * omegac_ / omegam_;
+			shift[1] += 0.5 * h * omegac_ / omegam_;
+			shift[2] += 0.5 * h * omegac_ / omegam_;
+		}
+
+		//... only do finest grid
+		if (!do_glass_)
+		{
+
+			blksize = sizeof(T_store) * npart;
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
 			const unsigned ilevel = gh.levelmax();
-			const unsigned 
-				nx = gh.get_grid(ilevel)->size(0),
-				ny = gh.get_grid(ilevel)->size(1),
-				nz = gh.get_grid(ilevel)->size(2);
-			
-			for( unsigned i=0; i<nx; ++i )
-				for( unsigned j=0; j<ny; ++j )
-					for( unsigned k=0; k<nz; ++k )
-					{	
+			const unsigned
+					nx = gh.get_grid(ilevel)->size(0),
+					ny = gh.get_grid(ilevel)->size(1),
+					nz = gh.get_grid(ilevel)->size(2);
+
+			for (unsigned i = 0; i < nx; ++i)
+				for (unsigned j = 0; j < ny; ++j)
+					for (unsigned k = 0; k < nz; ++k)
+					{
 						double xx[3];
 						gh.cell_pos(ilevel, i, j, k, xx);
-						if( shift != NULL )
+						if (shift != NULL)
 							xx[coord] += shift[coord];
-						
-						double v = (*gh.get_grid(ilevel))(i,j,k);
-						
-						xx[coord] = fmod( (xx[coord]+v)*xfac + header_.BoxSize, header_.BoxSize );
-						
-						if( temp_data.size() < block_buf_size_ )
-							temp_data.push_back( xx[coord] );
+
+						double v = (*gh.get_grid(ilevel))(i, j, k);
+
+						xx[coord] = fmod((xx[coord] + v) * xfac + header_.BoxSize, header_.BoxSize);
+
+						if (temp_data.size() < block_buf_size_)
+							temp_data.push_back(xx[coord]);
 						else
 						{
-							ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*block_buf_size_ );
+							ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * block_buf_size_);
 							nwritten += block_buf_size_;
 							temp_data.clear();
-							temp_data.push_back( xx[coord] );
+							temp_data.push_back(xx[coord]);
 						}
-						
 					}
-		}else{
-			
-			std::ifstream ofg( fname_glass_baryon_.c_str(), std::ios::binary );
-			
-			if( !ofg.good() )
-				music::elog.Print("could not open glass input file \'%s\'",fname_glass_cdm_.c_str());
-			
+		}
+		else
+		{
+
+			std::ifstream ofg(fname_glass_baryon_.c_str(), std::ios::binary);
+
+			if (!ofg.good()) // report the baryon glass file, which is the one opened above
+				music::elog.Print("could not open glass input file \'%s\'", fname_glass_baryon_.c_str());
+
 			io_header glasshead;
 			unsigned blksz;
-			
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			assert( blksz == sizeof(io_header) );
-			
-			ofg.read( reinterpret_cast<char*>(&glasshead), sizeof( io_header ) );
-			
-			//size_t nreq = gh.size(gh.levelmax(), 0)*gh.size(gh.levelmax(), 1)*gh.size(gh.levelmax(), 2);
+
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			assert(blksz == sizeof(io_header));
+
+			ofg.read(reinterpret_cast<char *>(&glasshead), sizeof(io_header));
+
+			// size_t nreq = gh.size(gh.levelmax(), 0)*gh.size(gh.levelmax(), 1)*gh.size(gh.levelmax(), 2);
 			/*if( nreq != (size_t)glasshead.npart[1] )
 			{
 				music::elog.Print("glass file contains %d particles, but should contain %ld",glasshead.npart[1],nreq);
 				throw std::runtime_error("glass file does not contain the right amount of particles");
 			}*/
-			
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			ofg.read( reinterpret_cast<char*>(&blksz), sizeof(unsigned) );
-			//assert( blksz == (glasshead.npart[1]+glasshead.npart[2])*sizeof(float)*3 );
-			//ofg.seekg( sizeof(float)*3*glasshead.npart[1], std::ios_base::cur );
-			
+
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			ofg.read(reinterpret_cast<char *>(&blksz), sizeof(unsigned));
+			// assert( blksz == (glasshead.npart[1]+glasshead.npart[2])*sizeof(float)*3 );
+			// ofg.seekg( sizeof(float)*3*glasshead.npart[1], std::ios_base::cur );
+
 			float lglass = glasshead.BoxSize;
-			
-			
-			blksize = sizeof(T_store)*glasshead.npart[1];
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
+
+			blksize = sizeof(T_store) * glasshead.npart[1];
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
 			header_.npart[2] = glasshead.npart[2];
 			header_.npartTotal[2] = glasshead.npartTotal[2];
 			header_.npartTotalHighWord[2] = glasshead.npartTotalHighWord[2];
-			
+
 			double rhoc = 27.7519737;
-			if( kpcunits_ )
+			if (kpcunits_)
 				rhoc *= 10.0; // in h^2 M_sol / kpc^3
-			
-			header_.mass[2] = omegab_ * rhoc * pow(header_.BoxSize,3.)/(glasshead.npart[2]);
-			
+
+			header_.mass[2] = omegab_ * rhoc * pow(header_.BoxSize, 3.) / (glasshead.npart[2]);
+
 			// read glass, do interpolation and write
-			size_t npartdone=0;
+			size_t npartdone = 0;
 			size_t npinter = glasshead.npart[2];
-			
-			blksize = sizeof(T_store)*npinter;
-			ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-			
-			float *pos_tmp = new float[3*block_buf_size_];
-			temp_data.assign( block_buf_size_, 0.0 );
-			
-			while( npartdone < npinter )
+
+			blksize = sizeof(T_store) * npinter;
+			ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+			float *pos_tmp = new float[3 * block_buf_size_];
+			temp_data.assign(block_buf_size_, 0.0);
+
+			while (npartdone < npinter)
 			{
-				size_t npart2read = std::min(npinter-npartdone,block_buf_size_);
-				
-				ofg.read( reinterpret_cast<char*>(&pos_tmp[0]), npart2read*sizeof(float)*3 );
-				get_cic_displacement( coord, pos_tmp, npart2read, lglass,  gh, &temp_data[0] );
-				ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*npart2read );
-				
+				size_t npart2read = std::min(npinter - npartdone, block_buf_size_);
+
+				ofg.read(reinterpret_cast<char *>(&pos_tmp[0]), npart2read * sizeof(float) * 3);
+				get_cic_displacement(coord, pos_tmp, npart2read, lglass, gh, &temp_data[0]);
+				ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * npart2read);
+
 				npartdone += npart2read;
 				nwritten += npart2read;
-				
 			}
-			
+
 			delete[] pos_tmp;
-			temp_data.clear();		
+			temp_data.clear();
 		}
-		
-		if( temp_data.size() > 0 )
-		{	
-			ofs_temp.write( (char*)&temp_data[0], sizeof(T_store)*temp_data.size() );
+
+		if (temp_data.size() > 0)
+		{
+			ofs_temp.write((char *)&temp_data[0], sizeof(T_store) * temp_data.size());
 			nwritten += temp_data.size();
 		}
-		
-		if( nwritten != npart )
+
+		if (nwritten != npart)
 			throw std::runtime_error("Internal consistency error while writing temporary file for gas positions");
-		
+
 		//... dump to temporary file
-		ofs_temp.write( (char *)&blksize, sizeof(unsigned long long) );
-		
-		if( ofs_temp.bad() )
+		ofs_temp.write((char *)&blksize, sizeof(unsigned long long));
+
+		if (ofs_temp.bad())
 			throw std::runtime_error("I/O error while writing temporary file for gas positions");
-		
+
 		ofs_temp.close();
-		
-		if( shift != NULL )
+
+		if (shift != NULL)
 			delete[] shift;
 	}
-	
-	void write_gas_density( const grid_hierarchy& gh )
-	{	
+
+	void write_gas_density(const grid_hierarchy &gh)
+	{
 		double rhoc = 27.7519737; // h^2 1e10 M_sol / Mpc^3
-		
-		if( kpcunits_ )
+
+		if (kpcunits_)
 			rhoc *= 10.0; // in h^2 M_sol / kpc^3
-		
-		if( do_baryons_ && !do_glass_ )
-			header_.mass[2] = omegab_ * rhoc * pow(header_.BoxSize,3.)/pow(2,3*levelmax_);
+
+		if (do_baryons_ && !do_glass_)
+			header_.mass[2] = omegab_ * rhoc * pow(header_.BoxSize, 3.) / pow(2, 3 * levelmax_);
 	}
-	
-	void finalize( void )
-	{	
+
+	void finalize(void)
+	{
 		this->assemble_gadget_file();
 	}
 };
 
-
-
-namespace{
-	output_plugin_creator_concrete< gadget2_2comp_output_plugin<float> > creator1("gadget2_2c");
-#ifndef SINGLE_PRECISION
-	output_plugin_creator_concrete< gadget2_2comp_output_plugin<double> > creator2("gadget2_2c_double");
-#endif
+namespace
+{
+	output_plugin_creator_concrete<gadget2_2comp_output_plugin<float>> creator1("gadget2_2c");
+	output_plugin_creator_concrete<gadget2_2comp_output_plugin<double>> creator2("gadget2_2c_double");
 }
-
diff --git a/src/plugins/output_gadget_tetmesh.cc b/src/plugins/output_gadget_tetmesh.cc
index 24e9b39..bd8d957 100644
--- a/src/plugins/output_gadget_tetmesh.cc
+++ b/src/plugins/output_gadget_tetmesh.cc
@@ -1573,8 +1573,6 @@ public:
 
 namespace{
 	output_plugin_creator_concrete< gadget_tetmesh_output_plugin<float> > creator1("gadget_tetmesh");
-#ifndef SINGLE_PRECISION
 	output_plugin_creator_concrete< gadget_tetmesh_output_plugin<double> > creator2("gadget_tetmesh_double");
-#endif
 }
 
diff --git a/src/plugins/output_tipsy.cc b/src/plugins/output_tipsy.cc
index beee4fc..2b81efd 100644
--- a/src/plugins/output_tipsy.cc
+++ b/src/plugins/output_tipsy.cc
@@ -1107,9 +1107,7 @@ int tipsy_output_plugin<double>::xdr_dump( XDR *xdrs, double*p )
 
 namespace{
     output_plugin_creator_concrete< tipsy_output_plugin<float> > creator1("tipsy");
-    //#ifndef SINGLE_PRECISION
     output_plugin_creator_concrete< tipsy_output_plugin<double> > creator2("tipsy_double");
-    //#endif
 }
 
 
diff --git a/src/plugins/output_tipsy_resample.cc b/src/plugins/output_tipsy_resample.cc
index 41e68ad..41ef8dc 100644
--- a/src/plugins/output_tipsy_resample.cc
+++ b/src/plugins/output_tipsy_resample.cc
@@ -1396,9 +1396,7 @@ int tipsy_output_plugin_res < double >::xdr_dump (XDR * xdrs, double *p)
 namespace
 {
   output_plugin_creator_concrete< tipsy_output_plugin_res<float> >creator1 ("tipsy_resample");
-#ifndef SINGLE_PRECISION
   output_plugin_creator_concrete< tipsy_output_plugin_res<double> >creator2 ("tipsy_double_resample");
-#endif
 }
 
 #endif // defined(HAVE_TIRPC) 
\ No newline at end of file
diff --git a/src/plugins/random_music_wnoise_generator.cc b/src/plugins/random_music_wnoise_generator.cc
index f3e7439..61d4dd2 100644
--- a/src/plugins/random_music_wnoise_generator.cc
+++ b/src/plugins/random_music_wnoise_generator.cc
@@ -40,8 +40,8 @@ void rapid_proto_ngenic_rng(size_t res, long baseseed, music_wnoise_generator<T>
 			seedtable[(res - 1 - j) * res + (res - 1 - i)] = 0x7fffffff * gsl_rng_uniform(random_generator);
 	}
 
-	fftw_real *rnoise = new fftw_real[res * res * (res + 2)];
-	fftw_complex *knoise = reinterpret_cast<fftw_complex *>(rnoise);
+	real_t *rnoise = new real_t[res * res * (res + 2)];
+	complex_t *knoise = reinterpret_cast<complex_t *>(rnoise);
 
 	double fnorm = 1. / sqrt(res * res * res);
 
@@ -126,26 +126,9 @@ void rapid_proto_ngenic_rng(size_t res, long baseseed, music_wnoise_generator<T>
 	delete[] seedtable;
 
 	//... perform FT to real space
-
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_plan plan = fftwf_plan_dft_c2r_3d(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
-	fftwf_execute(plan);
-	fftwf_destroy_plan(plan);
-#else
-	fftw_plan plan = fftw_plan_dft_c2r_3d(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
-	fftw_execute(plan);
-	fftw_destroy_plan(plan);
-#endif
-#else
-	rfftwnd_plan plan = rfftw3d_create_plan(res, res, res, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), plan, knoise, NULL);
-#else
-	rfftwnd_one_complex_to_real(plan, knoise, NULL);
-#endif
-	rfftwnd_destroy_plan(plan);
-#endif
+	fftw_plan_t plan = FFTW_API(plan_dft_c2r_3d)(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
+	FFTW_API(execute)(plan);
+	FFTW_API(destroy_plan)(plan);
 
 	// copy to array that holds the random numbers
 
@@ -443,37 +426,25 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
   ncubes_ = 1;
   baseseed_ = -2;
 
-  if (sizeof(fftw_real) != sizeof(T))
+  if (sizeof(real_t) != sizeof(T))
   {
-    music::elog.Print("type mismatch with fftw_real in k-space averaging");
-    throw std::runtime_error("type mismatch with fftw_real in k-space averaging");
+    music::elog.Print("type mismatch with real_t in k-space averaging");
+    throw std::runtime_error("type mismatch with real_t in k-space averaging");
   }
 
-  fftw_real
-      *rfine = new fftw_real[(size_t)rc.res_ * (size_t)rc.res_ * 2 * ((size_t)rc.res_ / 2 + 1)],
-      *rcoarse = new fftw_real[(size_t)res_ * (size_t)res_ * 2 * ((size_t)res_ / 2 + 1)];
+  real_t
+      *rfine = new real_t[(size_t)rc.res_ * (size_t)rc.res_ * 2 * ((size_t)rc.res_ / 2 + 1)],
+      *rcoarse = new real_t[(size_t)res_ * (size_t)res_ * 2 * ((size_t)res_ / 2 + 1)];
 
-  fftw_complex
-      *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse),
-      *cfine = reinterpret_cast<fftw_complex *>(rfine);
+  complex_t
+      *ccoarse = reinterpret_cast<complex_t *>(rcoarse),
+      *cfine = reinterpret_cast<complex_t *>(rfine);
 
   int nx(rc.res_), ny(rc.res_), nz(rc.res_), nxc(res_), nyc(res_), nzc(res_);
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_plan
-      pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
-      ipc = fftwf_plan_dft_c2r_3d(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
-#else
-  fftw_plan
-      pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
-      ipc = fftw_plan_dft_c2r_3d(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
-#endif
 
-#else
-  rfftwnd_plan
-      pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
-      ipc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-#endif
+  fftw_plan_t
+      pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
+      ipc = FFTW_API(plan_dft_c2r_3d)(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
 
 #pragma omp parallel for
   for (int i = 0; i < nx; i++)
@@ -484,19 +455,7 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
         rfine[q] = rc(i, j, k);
       }
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_execute(pf);
-#else
-  fftw_execute(pf);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-  rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
-#else
-  rfftwnd_one_real_to_complex(pf, rfine, NULL);
-#endif
-#endif
+  FFTW_API(execute)(pf);
 
   double fftnorm = 1.0 / ((double)nxc * (double)nyc * (double)nzc);
 
@@ -532,19 +491,9 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
       }
 
   delete[] rfine;
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_execute(ipc);
-#else
-  fftw_execute(ipc);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-  rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipc, ccoarse, NULL);
-#else
-  rfftwnd_one_complex_to_real(ipc, ccoarse, NULL);
-#endif
-#endif
+
+  FFTW_API(execute)(ipc);
+
   rnums_.push_back(new Meshvar<T>(res_, 0, 0, 0));
   cubemap_[0] = 0; // map all to single array
 
@@ -563,18 +512,8 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
 
   delete[] rcoarse;
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_destroy_plan(pf);
-  fftwf_destroy_plan(ipc);
-#else
-  fftw_destroy_plan(pf);
-  fftw_destroy_plan(ipc);
-#endif
-#else
-  rfftwnd_destroy_plan(pf);
-  rfftwnd_destroy_plan(ipc);
-#endif
+  FFTW_API(destroy_plan)(pf);
+  FFTW_API(destroy_plan)(ipc);
   
   double rmean, rvar;
   rmean = sum / count;
@@ -617,24 +556,12 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
   size_t nx = lx[0], ny = lx[1], nz = lx[2],
           nxc = lx[0] / 2, nyc = lx[1] / 2, nzc = lx[2] / 2;
 
-  fftw_real *rfine = new fftw_real[nx * ny * (nz + 2l)];
-  fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine);
+  real_t *rfine = new real_t[nx * ny * (nz + 2l)];
+  complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_plan
-      pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
-      ipf = fftwf_plan_dft_c2r_3d(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
-#else
-  fftw_plan
-      pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
-      ipf = fftw_plan_dft_c2r_3d(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
-#endif
-#else
-  rfftwnd_plan
-      pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
-      ipf = rfftw3d_create_plan(nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-#endif
+  fftw_plan_t
+      pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
+      ipf = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
 
 #pragma omp parallel for
   for (int i = 0; i < (int)nx; i++)
@@ -646,18 +573,10 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
       }
   // this->free_all_mem();	// temporarily free memory, allocate again later
 
-  fftw_real *rcoarse = new fftw_real[nxc * nyc * (nzc + 2)];
-  fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse);
+  real_t *rcoarse = new real_t[nxc * nyc * (nzc + 2)];
+  complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_plan pc = fftwf_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE);
-#else
-  fftw_plan pc = fftw_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE);
-#endif
-#else
-  rfftwnd_plan pc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
-#endif
+  fftw_plan_t pc = FFTW_API(plan_dft_r2c_3d)(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE); // precision-agnostic plan type, same as pf/ipf
 
 #pragma omp parallel for
   for (int i = 0; i < (int)nxc; i++)
@@ -667,23 +586,9 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
         size_t q = ((size_t)i * (size_t)nyc + (size_t)j) * (size_t)(nzc + 2) + (size_t)k;
         rcoarse[q] = rc(x0[0] / 2 + i, x0[1] / 2 + j, x0[2] / 2 + k);
       }
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_execute(pc);
-  fftwf_execute(pf);
-#else
-  fftw_execute(pc);
-  fftw_execute(pf);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-  rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pc, rcoarse, NULL);
-  rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
-#else
-  rfftwnd_one_real_to_complex(pc, rcoarse, NULL);
-  rfftwnd_one_real_to_complex(pf, rfine, NULL);
-#endif
-#endif
+
+  FFTW_API(execute)(pc);
+  FFTW_API(execute)(pf);
 
   double fftnorm = 1.0 / ((double)nx * (double)ny * (double)nz);
   double sqrt8 = sqrt(8.0);
@@ -747,19 +652,8 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
           IM(cfine[q]) *= fftnorm;
         }
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-    fftwf_execute(ipf);
-#else
-    fftw_execute(ipf);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-    rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipf, cfine, NULL);
-#else
-    rfftwnd_one_complex_to_real(ipf, cfine, NULL);
-#endif
-#endif
+
+    FFTW_API(execute)(ipf);
 
 #pragma omp parallel for
     for (int i = 0; i < (int)nx; i++)
@@ -772,21 +666,9 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
 
     delete[] rfine;
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-    fftwf_destroy_plan(pf);
-    fftwf_destroy_plan(pc);
-    fftwf_destroy_plan(ipf);
-#else
-    fftw_destroy_plan(pf);
-    fftw_destroy_plan(pc);
-    fftw_destroy_plan(ipf);
-#endif
-#else
-    fftwnd_destroy_plan(pf);
-    fftwnd_destroy_plan(pc);
-    fftwnd_destroy_plan(ipf);
-#endif
+    FFTW_API(destroy_plan)(pf);
+    FFTW_API(destroy_plan)(pc);
+    FFTW_API(destroy_plan)(ipf);
   
 }
 
diff --git a/src/plugins/random_panphasia.cc b/src/plugins/random_panphasia.cc
index 1eb7e51..b0aa2d2 100644
--- a/src/plugins/random_panphasia.cc
+++ b/src/plugins/random_panphasia.cc
@@ -238,63 +238,25 @@ public:
 void RNG_panphasia::forward_transform_field(real_t *field, int nx, int ny, int nz)
 {
 
-  fftw_real *rfield = reinterpret_cast<fftw_real *>(field);
-  fftw_complex *cfield = reinterpret_cast<fftw_complex *>(field);
+  real_t *rfield = reinterpret_cast<real_t *>(field);
+  complex_t *cfield = reinterpret_cast<complex_t *>(field);
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_plan pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
-#else
-  fftw_plan pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
-#endif
-#else
-  rfftwnd_plan pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
-#endif
+  fftw_plan_t pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_execute(pf);
-#else
-  fftw_execute(pf);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-  rfftwnd_threads_one_real_to_complex(num_threads_, pf, rfield, NULL);
-#else
-  rfftwnd_one_real_to_complex(pf, rfield, NULL);
-#endif
-#endif
+  FFTW_API(execute)(pf);
+
+  FFTW_API(destroy_plan)(pf);
 }
 
 void RNG_panphasia::backward_transform_field(real_t *field, int nx, int ny, int nz)
 {
 
-  fftw_real *rfield = reinterpret_cast<fftw_real *>(field);
-  fftw_complex *cfield = reinterpret_cast<fftw_complex *>(field);
+  real_t *rfield = reinterpret_cast<real_t *>(field);
+  complex_t *cfield = reinterpret_cast<complex_t *>(field);
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_plan ipf = fftwf_plan_dft_c2r_3d(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE);
-#else
-  fftw_plan ipf = fftw_plan_dft_c2r_3d(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE);
-#endif
-#else
-  rfftwnd_plan ipf = rfftw3d_create_plan(nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-#endif
-
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-  fftwf_execute(ipf);
-#else
-  fftw_execute(ipf);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-  rfftwnd_threads_one_complex_to_real(num_threads_, ipf, cfield, NULL);
-#else
-  rfftwnd_one_complex_to_real(ipf, cfield, NULL);
-#endif
-#endif
+  fftw_plan_t ipf = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE);
+  FFTW_API(execute)(ipf);
+  FFTW_API(destroy_plan)(ipf); // release the one-shot plan; macro wraps only the function name
 }
 
 #include <sys/time.h>
@@ -309,8 +271,8 @@ inline double get_wtime(void)
 
 void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
 {
-  fftw_real *pr0, *pr1, *pr2, *pr3, *pr4;
-  fftw_complex *pc0, *pc1, *pc2, *pc3, *pc4;
+  real_t *pr0, *pr1, *pr2, *pr3, *pr4;
+  complex_t *pc0, *pc1, *pc2, *pc3, *pc4;
 
   // determine resolution and offset so that we can do proper resampling
   int ileft[3], ileft_corner[3], nx[3], nxremap[3];
@@ -379,17 +341,17 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
 
   size_t ngp = nxremap[0] * nxremap[1] * (nxremap[2] + 2);
 
-  pr0 = new fftw_real[ngp];
-  pr1 = new fftw_real[ngp];
-  pr2 = new fftw_real[ngp];
-  pr3 = new fftw_real[ngp];
-  pr4 = new fftw_real[ngp];
+  pr0 = new real_t[ngp];
+  pr1 = new real_t[ngp];
+  pr2 = new real_t[ngp];
+  pr3 = new real_t[ngp];
+  pr4 = new real_t[ngp];
 
-  pc0 = reinterpret_cast<fftw_complex *>(pr0);
-  pc1 = reinterpret_cast<fftw_complex *>(pr1);
-  pc2 = reinterpret_cast<fftw_complex *>(pr2);
-  pc3 = reinterpret_cast<fftw_complex *>(pr3);
-  pc4 = reinterpret_cast<fftw_complex *>(pr4);
+  pc0 = reinterpret_cast<complex_t *>(pr0);
+  pc1 = reinterpret_cast<complex_t *>(pr1);
+  pc2 = reinterpret_cast<complex_t *>(pr2);
+  pc3 = reinterpret_cast<complex_t *>(pr3);
+  pc4 = reinterpret_cast<complex_t *>(pr4);
 
   music::ilog.Print("calculating PANPHASIA random numbers for level %d...", level);
   clear_panphasia_thread_states();
@@ -782,7 +744,7 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
   {
 
     music::ulog.Print("Remapping fields from dimension %d -> %d", nxremap[0], nx_m[0]);
-    memset(pr1, 0, ngp * sizeof(fftw_real));
+    memset(pr1, 0, ngp * sizeof(real_t));
 
     #pragma omp parallel for
     for (int i = 0; i < nxremap[0]; i++)
@@ -812,7 +774,7 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
           }
         }
 
-    memcpy(pr0, pr1, ngp * sizeof(fftw_real));
+    memcpy(pr0, pr1, ngp * sizeof(real_t));
   }
 
   // if (level == 9)
diff --git a/src/poisson.cc b/src/poisson.cc
index adab786..038b1b7 100644
--- a/src/poisson.cc
+++ b/src/poisson.cc
@@ -10,8 +10,8 @@
 
 /****** ABSTRACT FACTORY PATTERN IMPLEMENTATION *******/
 
-#include "poisson.hh"
-#include "Numerics.hh"
+#include <poisson.hh>
+#include <Numerics.hh>
 
 std::map<std::string, poisson_plugin_creator *> &
 get_poisson_plugin_map()
@@ -40,23 +40,18 @@ void print_poisson_plugins()
 
 /****** CALL IMPLEMENTATIONS OF POISSON SOLVER CLASSES ******/
 
-#include "mg_solver.hh"
-#include "fd_schemes.hh"
+#include <mg_solver.hh>
+#include <fd_schemes.hh>
 
-#ifdef SINGLE_PRECISION
-typedef multigrid::solver<stencil_7P<float>, interp_O3_fluxcorr, mg_straight, float> poisson_solver_O2;
-typedef multigrid::solver<stencil_13P<float>, interp_O5_fluxcorr, mg_straight, float> poisson_solver_O4;
-typedef multigrid::solver<stencil_19P<float>, interp_O7_fluxcorr, mg_straight, float> poisson_solver_O6;
-#else
-typedef multigrid::solver<stencil_7P<double>, interp_O3_fluxcorr, mg_straight, double> poisson_solver_O2;
-typedef multigrid::solver<stencil_13P<double>, interp_O5_fluxcorr, mg_straight, double> poisson_solver_O4;
-typedef multigrid::solver<stencil_19P<double>, interp_O7_fluxcorr, mg_straight, double> poisson_solver_O6;
-#endif
+
+typedef multigrid::solver<stencil_7P, interp_O3_fluxcorr, mg_straight> poisson_solver_O2;
+typedef multigrid::solver<stencil_13P, interp_O5_fluxcorr, mg_straight> poisson_solver_O4;
+typedef multigrid::solver<stencil_19P, interp_O7_fluxcorr, mg_straight> poisson_solver_O6;
 
 /**************************************************************************************/
 /**************************************************************************************/
 
-double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
+real_t multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 {
 	music::ulog.Print("Initializing multi-grid Poisson solver...");
 
@@ -68,11 +63,11 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 		std::cout << " - Invoking multi-grid Poisson solver..." << std::endl;
 	}
 
-	double acc = 1e-5, err;
+	real_t acc = 1e-5, err;
 	std::string ps_smoother_name;
 	unsigned ps_presmooth, ps_postsmooth, order;
 
-	acc = cf_.get_value_safe<double>("poisson", "accuracy", acc);
+	acc = cf_.get_value_safe<real_t>("poisson", "accuracy", acc);
 	ps_presmooth = cf_.get_value_safe<unsigned>("poisson", "pre_smooth", 3);
 	ps_postsmooth = cf_.get_value_safe<unsigned>("poisson", "post_smooth", 3);
 	ps_smoother_name = cf_.get_value_safe<std::string>("poisson", "smoother", "gs");
@@ -102,12 +97,12 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 							<< "            reverting to \'gs\' (Gauss-Seidel)" << std::endl;
 	}
 
-	double tstart, tend;
+	double tstart, tend; // wall-clock timestamps: omp_get_wtime() returns double; a float real_t would lose timer resolution
 
 #ifndef SINGLETHREAD_FFTW
 	tstart = omp_get_wtime();
 #else
-	tstart = (double)clock() / CLOCKS_PER_SEC;
+	tstart = (double)clock() / CLOCKS_PER_SEC; // CPU seconds, computed in double precision
 #endif
 
 	//----- run Poisson solver -----//
@@ -142,7 +137,7 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	if (verbosity > 1)
 		std::cout << " - Poisson solver took " << tend - tstart << "s with " << omp_get_max_threads() << " threads." << std::endl;
 #else
-	tend = (double)clock() / CLOCKS_PER_SEC;
+	tend = (double)clock() / CLOCKS_PER_SEC; // CPU seconds, computed in double precision
 	if (verbosity > 1)
 		std::cout << " - Poisson solver took " << tend - tstart << "s." << std::endl;
 
@@ -151,7 +146,7 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	return err;
 }
 
-double multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
+real_t multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
 {
 	Du = u;
 
@@ -176,7 +171,7 @@ double multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hiera
 	return 0.0;
 }
 
-double multigrid_poisson_plugin::gradient_add(int dir, grid_hierarchy &u, grid_hierarchy &Du)
+real_t multigrid_poisson_plugin::gradient_add(int dir, grid_hierarchy &u, grid_hierarchy &Du)
 {
 	// Du = u;
 
@@ -207,7 +202,7 @@ void multigrid_poisson_plugin::implementation::gradient_O2(int dir, grid_hierarc
 
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 
 		if (dir == 0)
@@ -241,7 +236,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O2(int dir, grid_hie
 
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 
 		if (dir == 0)
@@ -275,7 +270,7 @@ void multigrid_poisson_plugin::implementation::gradient_O4(int dir, grid_hierarc
 
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 
 		h /= 12.0;
@@ -311,7 +306,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O4(int dir, grid_hie
 
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 
 		h /= 12.0;
@@ -347,7 +342,7 @@ void multigrid_poisson_plugin::implementation::gradient_O6(int dir, grid_hierarc
 
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 
 		h /= 60.;
@@ -385,7 +380,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O6(int dir, grid_hie
 
 	for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
 	{
-		double h = pow(2.0, ilevel);
+		real_t h = pow(2.0, ilevel);
 		meshvar_bnd *pvar = Du.get_grid(ilevel);
 
 		h /= 60.;
@@ -421,7 +416,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O6(int dir, grid_hie
 /**************************************************************************************/
 #include "general.hh"
 
-double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
+real_t fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 {
 	music::ulog.Print("Entering k-space Poisson solver...");
 
@@ -446,8 +441,8 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	nzp = 2 * (nz / 2 + 1);
 
 	//... copy data ..................................................
-	fftw_real *data = new fftw_real[(size_t)nx * (size_t)ny * (size_t)nzp];
-	fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data);
+	real_t *data = new real_t[(size_t)nx * (size_t)ny * (size_t)nzp];
+	complex_t *cdata = reinterpret_cast<complex_t *>(data);
 
 #pragma omp parallel for
 	for (int i = 0; i < nx; ++i)
@@ -461,37 +456,14 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	//... perform FFT and Poisson solve................................
 	music::ulog.Print("Performing forward transform.");
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_plan
-			plan = fftwf_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
-			iplan = fftwf_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
+	fftw_plan_t
+			plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
+			iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
 
-	fftwf_execute(plan);
-#else
-	fftw_plan
-			plan = fftw_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
-			iplan = fftw_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
+	FFTW_API(execute)(plan);
 
-	fftw_execute(plan);
-#endif
-
-#else
-	rfftwnd_plan
-			plan = rfftw3d_create_plan(nx, ny, nz,
-																 FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
-			iplan = rfftw3d_create_plan(nx, ny, nz,
-																	FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
-#else
-	rfftwnd_one_real_to_complex(plan, data, NULL);
-#endif
-
-#endif
-	double kfac = 2.0 * M_PI;
-	double fac = -1.0 / (double)((size_t)nx * (size_t)ny * (size_t)nz);
+	real_t kfac = 2.0 * M_PI;
+	real_t fac = -1.0 / (real_t)((size_t)nx * (size_t)ny * (size_t)nz);
 
 #pragma omp parallel for
 	for (int i = 0; i < nx; ++i)
@@ -504,11 +476,11 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 				int jj = j;
 				if (jj > ny / 2)
 					jj -= ny;
-				double ki = (double)ii;
-				double kj = (double)jj;
-				double kk = (double)k;
+				real_t ki = (real_t)ii;
+				real_t kj = (real_t)jj;
+				real_t kk = (real_t)k;
 
-				double kk2 = kfac * kfac * (ki * ki + kj * kj + kk * kk);
+				real_t kk2 = kfac * kfac * (ki * ki + kj * kj + kk * kk);
 
 				size_t idx = (size_t)(i * ny + j) * (size_t)(nzp / 2) + (size_t)k;
 
@@ -521,26 +493,9 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 
 	music::ulog.Print("Performing backward transform.");
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_execute(iplan);
-	fftwf_destroy_plan(plan);
-	fftwf_destroy_plan(iplan);
-#else
-	fftw_execute(iplan);
-	fftw_destroy_plan(plan);
-	fftw_destroy_plan(iplan);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
-#else
-	rfftwnd_one_complex_to_real(iplan, cdata, NULL);
-#endif
-
-	rfftwnd_destroy_plan(plan);
-	rfftwnd_destroy_plan(iplan);
-#endif
+	FFTW_API(execute)(iplan);
+	FFTW_API(destroy_plan)(plan);
+	FFTW_API(destroy_plan)(iplan);
 
 //... copy data ..........................................
 #pragma omp parallel for
@@ -596,7 +551,7 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
 	return 0.0;
 }
 
-double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
+real_t fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
 {
 
 	music::ulog.Print("Computing a gradient in k-space...\n");
@@ -612,8 +567,8 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
 	nzp = 2 * (nz / 2 + 1);
 
 	//... copy data ..................................................
-	fftw_real *data = new fftw_real[(size_t)nx * (size_t)ny * (size_t)nzp];
-	fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data);
+	real_t *data = new real_t[(size_t)nx * (size_t)ny * (size_t)nzp];
+	complex_t *cdata = reinterpret_cast<complex_t *>(data);
 
 #pragma omp parallel for
 	for (int i = 0; i < nx; ++i)
@@ -625,38 +580,14 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
 			}
 
 			//... perform FFT and Poisson solve................................
+	fftw_plan_t
+			plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
+			iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_plan
-			plan = fftwf_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
-			iplan = fftwf_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
+	FFTW_API(execute)(plan);
 
-	fftwf_execute(plan);
-#else
-	fftw_plan
-			plan = fftw_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
-			iplan = fftw_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
-
-	fftw_execute(plan);
-#endif
-#else
-	rfftwnd_plan
-			plan = rfftw3d_create_plan(nx, ny, nz,
-																 FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
-			iplan = rfftw3d_create_plan(nx, ny, nz,
-																	FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
-#else
-	rfftwnd_one_real_to_complex(plan, data, NULL);
-#endif
-
-#endif
-
-	double fac = -1.0 / (double)((size_t)nx * (size_t)ny * (size_t)nz);
-	double kfac = 2.0 * M_PI;
+	real_t fac = -1.0 / (real_t)((size_t)nx * (size_t)ny * (size_t)nz);
+	real_t kfac = 2.0 * M_PI;
 
 	bool do_glass = cf_.get_value_safe<bool>("output", "glass", false);
 	bool deconvolve_cic = do_glass | cf_.get_value_safe<bool>("output", "glass_cicdeconvolve", false);
@@ -671,98 +602,54 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
 			{
 				size_t idx = (size_t)(i * ny + j) * (size_t)(nzp / 2) + (size_t)k;
 				int ii = i;
-				if (ii > nx / 2)
-					ii -= nx;
+				if (ii > nx / 2) ii -= nx;
 				int jj = j;
-				if (jj > ny / 2)
-					jj -= ny;
-				const double ki = (double)ii;
-				const double kj = (double)jj;
-				const double kk = (double)k;
+				if (jj > ny / 2) jj -= ny;
 
-				const double kkdir[3] = {kfac * ki, kfac * kj, kfac * kk};
-				const double kdir = kkdir[dir];
+				const real_t ki{(real_t)ii};
+				const real_t kj{(real_t)jj};
+				const real_t kk{(real_t)k};
+				const real_t kkdir[3] = {kfac * ki, kfac * kj, kfac * kk};
+				const real_t kdir = kkdir[dir];
 
-				double re = RE(cdata[idx]);
-				double im = IM(cdata[idx]);
+				real_t re = RE(cdata[idx]);
+				real_t im = IM(cdata[idx]);
 
 				RE(cdata[idx]) = fac * im * kdir;
 				IM(cdata[idx]) = -fac * re * kdir;
 
-#ifdef FFTW3
 				if (deconvolve_cic)
 				{
-					double dfx, dfy, dfz;
-					dfx = M_PI * ki / (double)nx;
-					dfx = (i != 0) ? sin(dfx) / dfx : 1.0;
-					dfy = M_PI * kj / (double)ny;
-					dfy = (j != 0) ? sin(dfy) / dfy : 1.0;
-					dfz = M_PI * kk / (double)nz;
-					dfz = (k != 0) ? sin(dfz) / dfz : 1.0;
+					real_t dfx, dfy, dfz;
+					dfx = M_PI * ki / (real_t)nx;
+					dfx = (i != 0) ? std::sin(dfx) / dfx : 1.0;
+					dfy = M_PI * kj / (real_t)ny;
+					dfy = (j != 0) ? std::sin(dfy) / dfy : 1.0;
+					dfz = M_PI * kk / (real_t)nz;
+					dfz = (k != 0) ? std::sin(dfz) / dfz : 1.0;
 
 					dfx = 1.0 / (dfx * dfy * dfz);
 					dfx = dfx * dfx;
-					cdata[idx][0] *= dfx;
-					cdata[idx][1] *= dfx;
+					RE(cdata[idx]) *= dfx;
+					IM(cdata[idx]) *= dfx;
 				}
-#else
-				if (deconvolve_cic)
-				{
-					double dfx, dfy, dfz;
-					dfx = M_PI * ki / (double)nx;
-					dfx = (i != 0) ? sin(dfx) / dfx : 1.0;
-					dfy = M_PI * kj / (double)ny;
-					dfy = (j != 0) ? sin(dfy) / dfy : 1.0;
-					dfz = M_PI * kk / (double)nz;
-					dfz = (k != 0) ? sin(dfz) / dfz : 1.0;
-
-					dfx = 1.0 / (dfx * dfy * dfz);
-					dfx = dfx * dfx;
-
-					cdata[idx].re *= dfx;
-					cdata[idx].im *= dfx;
-				}
-#endif
 
 				if( (dir == 0 && i==nx/2) || (dir == 1 && j==ny/2) || (dir == 2 && k==nz/2) )
 				{
-#ifdef FFTW3
-					cdata[idx][0] = 0.0;
-					cdata[idx][1] = 0.0;
-#else
-					cdata[idx].re = 0.0;
-					cdata[idx].im = 0.0;
-#endif
+					RE(cdata[idx]) = 0.0;
+					IM(cdata[idx]) = 0.0;
 				}
 			}
 
 	RE(cdata[0]) = 0.0;
 	IM(cdata[0]) = 0.0;
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_execute(iplan);
-	fftwf_destroy_plan(plan);
-	fftwf_destroy_plan(iplan);
-#else
-	fftw_execute(iplan);
-	fftw_destroy_plan(plan);
-	fftw_destroy_plan(iplan);
-#endif
-
-#else
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
-#else
-	rfftwnd_one_complex_to_real(iplan, cdata, NULL);
-#endif
-
-	rfftwnd_destroy_plan(plan);
-	rfftwnd_destroy_plan(iplan);
-#endif
+	FFTW_API(execute)(iplan);
+	FFTW_API(destroy_plan)(plan);
+	FFTW_API(destroy_plan)(iplan);
 
 	//... copy data ..........................................
-	double dmax = 0.0;
+	real_t dmax = 0.0;
 	for (int i = 0; i < nx; ++i)
 		for (int j = 0; j < ny; ++j)
 			for (int k = 0; k < nz; ++k)
@@ -784,35 +671,35 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
 /**************************************************************************************/
 
 template <int order>
-double poisson_hybrid_kernel(int idir, int i, int j, int k, int n)
+real_t poisson_hybrid_kernel(int idir, int i, int j, int k, int n)
 {
 	return 1.0;
 }
 
 template <>
-inline double poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n)
+inline real_t poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n)
 {
 	if (i == 0 && j == 0 && k == 0)
 		return 0.0;
 
-	double
-			ki(M_PI * (double)i / (double)n),
-			kj(M_PI * (double)j / (double)n),
-			kk(M_PI * (double)k / (double)n),
+	real_t
+			ki(M_PI * (real_t)i / (real_t)n),
+			kj(M_PI * (real_t)j / (real_t)n),
+			kk(M_PI * (real_t)k / (real_t)n),
 			kr(sqrt(ki * ki + kj * kj + kk * kk));
 
-	double grad = 1.0, laplace = 1.0;
+	real_t grad = 1.0, laplace = 1.0;
 
 	if (idir == 0)
-		grad = sin(ki);
+		grad = std::sin(ki);
 	else if (idir == 1)
-		grad = sin(kj);
+		grad = std::sin(kj);
 	else
-		grad = sin(kk);
+		grad = std::sin(kk);
 
-	laplace = 2.0 * ((-cos(ki) + 1.0) + (-cos(kj) + 1.0) + (-cos(kk) + 1.0));
+	laplace = 2.0 * ((-std::cos(ki) + 1.0) + (-std::cos(kj) + 1.0) + (-std::cos(kk) + 1.0));
 
-	double kgrad = 1.0;
+	real_t kgrad = 1.0;
 	if (idir == 0)
 		kgrad = ki;
 	else if (idir == 1)
@@ -824,30 +711,30 @@ inline double poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n)
 }
 
 template <>
-inline double poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n)
+inline real_t poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n)
 {
 
 	if (i == 0 && j == 0 && k == 0)
 		return 0.0;
 
-	double
-			ki(M_PI * (double)i / (double)n),
-			kj(M_PI * (double)j / (double)n),
-			kk(M_PI * (double)k / (double)n),
+	real_t
+			ki(M_PI * (real_t)i / (real_t)n),
+			kj(M_PI * (real_t)j / (real_t)n),
+			kk(M_PI * (real_t)k / (real_t)n),
 			kr(sqrt(ki * ki + kj * kj + kk * kk));
 
-	double grad = 1.0, laplace = 1.0;
+	real_t grad = 1.0, laplace = 1.0;
 
 	if (idir == 0)
-		grad = 0.166666666667 * (-sin(2. * ki) + 8. * sin(ki));
+		grad = 0.166666666667 * (-std::sin(2. * ki) + 8. * std::sin(ki));
 	else if (idir == 1)
-		grad = 0.166666666667 * (-sin(2. * kj) + 8. * sin(kj));
+		grad = 0.166666666667 * (-std::sin(2. * kj) + 8. * std::sin(kj));
 	else if (idir == 2)
-		grad = 0.166666666667 * (-sin(2. * kk) + 8. * sin(kk));
+		grad = 0.166666666667 * (-std::sin(2. * kk) + 8. * std::sin(kk));
 
-	laplace = 0.1666666667 * ((cos(2 * ki) - 16. * cos(ki) + 15.) + (cos(2 * kj) - 16. * cos(kj) + 15.) + (cos(2 * kk) - 16. * cos(kk) + 15.));
+	laplace = 0.1666666667 * ((std::cos(2 * ki) - 16. * std::cos(ki) + 15.) + (std::cos(2 * kj) - 16. * std::cos(kj) + 15.) + (std::cos(2 * kk) - 16. * std::cos(kk) + 15.));
 
-	double kgrad = 1.0;
+	real_t kgrad = 1.0;
 	if (idir == 0)
 		kgrad = ki;
 	else if (idir == 1)
@@ -859,29 +746,29 @@ inline double poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n)
 }
 
 template <>
-inline double poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n)
+inline real_t poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n)
 {
-	double
-			ki(M_PI * (double)i / (double)n),
-			kj(M_PI * (double)j / (double)n),
-			kk(M_PI * (double)k / (double)n),
+	real_t
+			ki(M_PI * (real_t)i / (real_t)n),
+			kj(M_PI * (real_t)j / (real_t)n),
+			kk(M_PI * (real_t)k / (real_t)n),
 			kr(sqrt(ki * ki + kj * kj + kk * kk));
 
 	if (i == 0 && j == 0 && k == 0)
 		return 0.0;
 
-	double grad = 1.0, laplace = 1.0;
+	real_t grad = 1.0, laplace = 1.0;
 
 	if (idir == 0)
-		grad = 0.0333333333333 * (sin(3. * ki) - 9. * sin(2. * ki) + 45. * sin(ki));
+		grad = 0.0333333333333 * (std::sin(3. * ki) - 9. * std::sin(2. * ki) + 45. * std::sin(ki));
 	else if (idir == 1)
-		grad = 0.0333333333333 * (sin(3. * kj) - 9. * sin(2. * kj) + 45. * sin(kj));
+		grad = 0.0333333333333 * (std::sin(3. * kj) - 9. * std::sin(2. * kj) + 45. * std::sin(kj));
 	else if (idir == 2)
-		grad = 0.0333333333333 * (sin(3. * kk) - 9. * sin(2. * kk) + 45. * sin(kk));
+		grad = 0.0333333333333 * (std::sin(3. * kk) - 9. * std::sin(2. * kk) + 45. * std::sin(kk));
 
-	laplace = 0.01111111111111 * ((-2. * cos(3.0 * ki) + 27. * cos(2. * ki) - 270. * cos(ki) + 245.) + (-2. * cos(3.0 * kj) + 27. * cos(2. * kj) - 270. * cos(kj) + 245.) + (-2. * cos(3.0 * kk) + 27. * cos(2. * kk) - 270. * cos(kk) + 245.));
+	laplace = 0.01111111111111 * ((-2. * std::cos(3.0 * ki) + 27. * std::cos(2. * ki) - 270. * std::cos(ki) + 245.) + (-2. * std::cos(3.0 * kj) + 27. * std::cos(2. * kj) - 270. * std::cos(kj) + 245.) + (-2. * std::cos(3.0 * kk) + 27. * std::cos(2. * kk) - 270. * std::cos(kk) + 245.));
 
-	double kgrad = 1.0;
+	real_t kgrad = 1.0;
 	if (idir == 0)
 		kgrad = ki;
 	else if (idir == 1)
@@ -896,42 +783,19 @@ inline double poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n)
 }
 
 template <int order>
-void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, bool periodic, bool deconvolve_cic)
+void do_poisson_hybrid(real_t *data, int idir, int nxp, int nyp, int nzp, bool periodic, bool deconvolve_cic)
 {
-	double fftnorm = 1.0 / ((double)nxp * (double)nyp * (double)nzp);
+	real_t fftnorm = 1.0 / ((real_t)nxp * (real_t)nyp * (real_t)nzp);
 
-	fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data);
+	complex_t *cdata = reinterpret_cast<complex_t *>(data);
 
 	if (deconvolve_cic)
 		music::ilog.Print("CIC deconvolution step is enabled.");
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_plan iplan, plan;
-	plan = fftwf_plan_dft_r2c_3d(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE);
-	iplan = fftwf_plan_dft_c2r_3d(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
-	fftwf_execute(plan);
-#else
-	fftw_plan iplan, plan;
-	plan = fftw_plan_dft_r2c_3d(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE);
-	iplan = fftw_plan_dft_c2r_3d(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
-	fftw_execute(plan);
-#endif
-#else
-	rfftwnd_plan iplan, plan;
-
-	plan = rfftw3d_create_plan(nxp, nyp, nzp,
-														 FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
-
-	iplan = rfftw3d_create_plan(nxp, nyp, nzp,
-															FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
-
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
-#else
-	rfftwnd_one_real_to_complex(plan, data, NULL);
-#endif
-#endif
+	fftw_plan_t iplan, plan;
+	plan = FFTW_API(plan_dft_r2c_3d)(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE);
+	iplan = FFTW_API(plan_dft_c2r_3d)(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
+	FFTW_API(execute)(plan);
 
 #pragma omp parallel for
 	for (int i = 0; i < nxp; ++i)
@@ -948,22 +812,22 @@ void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, boo
 					kj -= nyp;
 
 				//... apply hybrid correction
-				double dk = poisson_hybrid_kernel<order>(idir, ki, kj, k, nxp / 2);
+				real_t dk = poisson_hybrid_kernel<order>(idir, ki, kj, k, nxp / 2);
 
-				fftw_real re = RE(cdata[ii]), im = IM(cdata[ii]);
+				real_t re = RE(cdata[ii]), im = IM(cdata[ii]);
 
 				RE(cdata[ii]) = -im * dk * fftnorm;
 				IM(cdata[ii]) = re * dk * fftnorm;
 
 				if (deconvolve_cic)
 				{
-					double dfx, dfy, dfz;
-					dfx = M_PI * ki / (double)nxp;
-					dfx = (i != 0) ? sin(dfx) / dfx : 1.0;
-					dfy = M_PI * kj / (double)nyp;
-					dfy = (j != 0) ? sin(dfy) / dfy : 1.0;
-					dfz = M_PI * kk / (double)nzp;
-					dfz = (k != 0) ? sin(dfz) / dfz : 1.0;
+					real_t dfx, dfy, dfz;
+					dfx = M_PI * ki / (real_t)nxp;
+					dfx = (i != 0) ? std::sin(dfx) / dfx : 1.0;
+					dfy = M_PI * kj / (real_t)nyp;
+					dfy = (j != 0) ? std::sin(dfy) / dfy : 1.0;
+					dfz = M_PI * kk / (real_t)nzp;
+					dfz = (k != 0) ? std::sin(dfz) / dfz : 1.0;
 
 					dfx = 1.0 / (dfx * dfy * dfz);
 					dfx = dfx * dfx;
@@ -981,26 +845,9 @@ void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, boo
 	RE(cdata[0]) = 0.0;
 	IM(cdata[0]) = 0.0;
 
-#ifdef FFTW3
-#ifdef SINGLE_PRECISION
-	fftwf_execute(iplan);
-	fftwf_destroy_plan(plan);
-	fftwf_destroy_plan(iplan);
-#else
-	fftw_execute(iplan);
-	fftw_destroy_plan(plan);
-	fftw_destroy_plan(iplan);
-#endif
-#else
-#ifndef SINGLETHREAD_FFTW
-	rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
-#else
-	rfftwnd_one_complex_to_real(iplan, cdata, NULL);
-#endif
-
-	rfftwnd_destroy_plan(plan);
-	rfftwnd_destroy_plan(iplan);
-#endif
+	FFTW_API(execute)(iplan);
+	FFTW_API(destroy_plan)(plan);
+	FFTW_API(destroy_plan)(iplan);
 }
 
 template <typename T>
@@ -1008,7 +855,7 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
 
 {
 	int nx = f.size(0), ny = f.size(1), nz = f.size(2), nxp, nyp, nzp;
-	fftw_real *data;
+	real_t *data;
 	int xo = 0, yo = 0, zo = 0;
 	int nmax = std::max(nx, std::max(ny, nz));
 
@@ -1018,12 +865,12 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
 
 	if (!periodic)
 	{
-		nxp = nmax + 2 * boundary; // 2*nmax;
-		nyp = nmax + 2 * boundary; // 2*nmax;
-		nzp = nmax + 2 * boundary; // 2*nmax;
-		xo = boundary;						 // nmax/2;
-		yo = boundary;						 // nmax/2;
-		zo = boundary;						 // nmax/2;
+		nxp = nmax + 2 * boundary;
+		nyp = nmax + 2 * boundary;
+		nzp = nmax + 2 * boundary;
+		xo = boundary;
+		yo = boundary;
+		zo = boundary;
 	}
 	else
 	{
@@ -1032,17 +879,11 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
 		nzp = nmax;
 	}
 
-	data = new fftw_real[(size_t)nxp * (size_t)nyp * (size_t)(nzp + 2)];
+	data = new real_t[(size_t)nxp * (size_t)nyp * (size_t)(nzp + 2)];
 
 	if (idir == 0)
 		std::cout << "   - Performing hybrid Poisson step... (" << nxp << ", " << nyp << ", " << nzp << ")\n";
 
-		// size_t N = (size_t)nxp*(size_t)nyp*2*((size_t)nzp/2+1);
-
-		// #pragma omp parallel for
-		// for( size_t i=0; i<N; ++i )
-		//	data[i]=0.0;
-
 #pragma omp parallel for
 	for (int i = 0; i < nxp; ++i)
 		for (int j = 0; j < nyp; ++j)
@@ -1097,7 +938,7 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
 /**************************************************************************************/
 /**************************************************************************************/
 
-template void poisson_hybrid<MeshvarBnd<double>>(MeshvarBnd<double> &f, int idir, int order, bool periodic, bool deconvolve_cic);
+template void poisson_hybrid<MeshvarBnd<double>>(MeshvarBnd<double> &f, int idir, int order, bool periodic, bool deconvolve_cic); // spelled 'double': MeshvarBnd<real_t> would duplicate the float instantiation below if real_t is float
 template void poisson_hybrid<MeshvarBnd<float>>(MeshvarBnd<float> &f, int idir, int order, bool periodic, bool deconvolve_cic);
 
 namespace
diff --git a/src/solver.hh b/src/solver.hh
index 148eb55..aec5f37 100644
--- a/src/solver.hh
+++ b/src/solver.hh
@@ -7,23 +7,30 @@
  *
  */
 
-#ifndef __SOLVER_HH
-#define __SOLVER_HH
+#pragma once
 
 #include <cmath>
 #include <iostream>
-#include "mesh.hh"
+#include <mesh.hh>
 
-#define BEGIN_MULTIGRID_NAMESPACE namespace multigrid {
+#define BEGIN_MULTIGRID_NAMESPACE \
+	namespace multigrid             \
+	{
 #define END_MULTIGRID_NAMESPACE }
 
 BEGIN_MULTIGRID_NAMESPACE
-	
-namespace opt {
-	enum smtype { sm_jacobi, sm_gauss_seidel, sm_sor };
+
+namespace opt
+{
+	enum smtype
+	{
+		sm_jacobi,
+		sm_gauss_seidel,
+		sm_sor
+	};
 }
 
-template< class S, class O, typename T=double >
+template <class S, class O>
 class solver
 {
 public:
@@ -31,1080 +38,1023 @@ public:
 	typedef O mgop;
 
 protected:
-	scheme				m_scheme;
-	mgop				m_gridop;
-	unsigned			m_npresmooth, m_npostsmooth;
-	opt::smtype			m_smoother;
-	unsigned			m_ilevelmin;
-	
-	const static bool	m_bperiodic = true;
+	scheme m_scheme;
+	mgop m_gridop;
+	unsigned m_npresmooth, m_npostsmooth;
+	opt::smtype m_smoother;
+	unsigned m_ilevelmin;
 
-	GridHierarchy<T>	*m_pu, *m_pf, *m_pfsave;	
+	const static bool m_bperiodic = true;
+
+	GridHierarchy<real_t> *m_pu, *m_pf, *m_pfsave;
 	GridHierarchy<bool> *m_pmask;
-	const MeshvarBnd<T> *m_pubnd;
-	
-	double compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<T>& unew );
-	
-	double compute_error( const GridHierarchy<T>& uh, const GridHierarchy<T>& uhnew, bool verbose );
+	const MeshvarBnd<real_t> *m_pubnd;
+
+	double compute_error(const MeshvarBnd<real_t> &u, const MeshvarBnd<real_t> &unew);
+
+	double compute_error(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &uhnew, bool verbose);
 
 protected:
-	
-	void Jacobi( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
-	
-	void GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
-	
-	void SOR( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f );
-	
-	void twoGrid( unsigned ilevel );
-	
-	void interp_coarse_fine( unsigned ilevel, MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine, bool bcf=true );
-	
-	void setBC( unsigned ilevel );
-	
-	void make_periodic( MeshvarBnd<T> *u );
-	
-	void interp_cubic( MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine, int itop, int jtop, int ktop, int i, int j, int k );
-	void interp_coarse_fine_cubic( unsigned ilevel, MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine, bool bcf );
-	
+	void Jacobi(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
+
+	void GaussSeidel(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
+
+	void SOR(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
+
+	void twoGrid(unsigned ilevel);
+
+	void interp_coarse_fine(unsigned ilevel, MeshvarBnd<real_t> &coarse, MeshvarBnd<real_t> &fine, bool bcf = true);
+
+	void setBC(unsigned ilevel);
+
+	void make_periodic(MeshvarBnd<real_t> *u);
+
+	void interp_cubic(MeshvarBnd<real_t> &coarse, MeshvarBnd<real_t> &fine, int i, int j, int k, int itop, int jtop, int ktop); // parameter names listed in the same order as the out-of-class definition
+	void interp_coarse_fine_cubic(unsigned ilevel, MeshvarBnd<real_t> &coarse, MeshvarBnd<real_t> &fine, bool bcf);
+
 public:
-	solver( GridHierarchy<T>& f, //const MeshvarBnd<T>& uBC_top, 
-				   opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth );
-	
+	solver(GridHierarchy<real_t> &f, // const MeshvarBnd<real_t>& uBC_top,
+				 opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth);
+
 	~solver()
-	{ delete m_pmask; }
-	
-	double solve( GridHierarchy<T>& u, double accuracy, double h=-1.0, bool verbose=false );
-	
-	double solve( GridHierarchy<T>& u, double accuracy, bool verbose=false )
 	{
-		return this->solve ( u, accuracy, -1.0, verbose );
+		delete m_pmask;
+	}
+
+	double solve(GridHierarchy<real_t> &u, double accuracy, double h = -1.0, bool verbose = false);
+
+	double solve(GridHierarchy<real_t> &u, double accuracy, bool verbose = false)
+	{
+		return this->solve(u, accuracy, -1.0, verbose);
 	}
-	
-	
-	
 };
 
-
-template< class S, class O, typename T >
-solver<S,O,T>::solver( GridHierarchy<T>& f, //const MeshvarBnd<T>& ubnd, 
-					opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth )
-:	m_scheme(), m_gridop(), m_npresmooth( npresmooth ), m_npostsmooth( npostsmooth ), 
-m_smoother( smoother ), m_ilevelmin( f.levelmin() ), m_pf( &f )//, m_pubnd( &ubnd )
-{ 
+template <class S, class O>
+solver<S, O>::solver(GridHierarchy<real_t> &f, // const MeshvarBnd<real_t>& ubnd,
+										 opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth)
+		: m_scheme(), m_gridop(), m_npresmooth(npresmooth), m_npostsmooth(npostsmooth),
+			m_smoother(smoother), m_ilevelmin(f.levelmin()), m_pf(&f) //, m_pubnd( &ubnd )
+{
 	//... initialize the refinement mask
-	m_pmask = new GridHierarchy<bool>( f.m_nbnd );
+	m_pmask = new GridHierarchy<bool>(f.m_nbnd);
 	m_pmask->create_base_hierarchy(f.levelmin());
-			
-	for( unsigned ilevel=f.levelmin()+1; ilevel<=f.levelmax(); ++ilevel )
+
+	for (unsigned ilevel = f.levelmin() + 1; ilevel <= f.levelmax(); ++ilevel)
 	{
-		meshvar_bnd* pf = f.get_grid(ilevel);
-		m_pmask->add_patch( pf->offset(0), pf->offset(1), pf->offset(2), pf->size(0), pf->size(1), pf->size(2) );
+		meshvar_bnd *pf = f.get_grid(ilevel);
+		m_pmask->add_patch(pf->offset(0), pf->offset(1), pf->offset(2), pf->size(0), pf->size(1), pf->size(2));
 	}
-	
+
 	m_pmask->zero();
-	
-	for( unsigned ilevel=0; ilevel<f.levelmin(); ++ilevel )
+
+	for (unsigned ilevel = 0; ilevel < f.levelmin(); ++ilevel)
 	{
-		MeshvarBnd<T> *pf = f.get_grid(ilevel);
-		for( int ix=0; ix < (int)pf->size(0); ++ix )
-			for( int iy=0; iy < (int)pf->size(1); ++iy )
-				for( int iz=0; iz < (int)pf->size(2); ++iz )
-					(*m_pmask->get_grid(ilevel))(ix,iy,iz) = true;
+		MeshvarBnd<real_t> *pf = f.get_grid(ilevel); // grid of f at this level; only its size() is used here
+		for (int ix = 0; ix < (int)pf->size(0); ++ix)
+			for (int iy = 0; iy < (int)pf->size(1); ++iy)
+				for (int iz = 0; iz < (int)pf->size(2); ++iz)
+					(*m_pmask->get_grid(ilevel))(ix, iy, iz) = true; // every cell on a level below f.levelmin() is flagged
 	}
-	
-	for( unsigned ilevel=m_ilevelmin; ilevel<f.levelmax(); ++ilevel )
+
+	for (unsigned ilevel = m_ilevelmin; ilevel < f.levelmax(); ++ilevel)
 	{
-		MeshvarBnd<T>* pf = f.get_grid(ilevel+1);//, *pfc = f.get_grid(ilevel);
-		
-		for( int ix=pf->offset(0); ix < (int)(pf->offset(0)+pf->size(0)/2); ++ix )
-			for( int iy=pf->offset(1); iy < (int)(pf->offset(1)+pf->size(1)/2); ++iy )
-				for( int iz=pf->offset(2); iz < (int)(pf->offset(2)+pf->size(2)/2); ++iz )
-					(*m_pmask->get_grid(ilevel))(ix,iy,iz) = true;
+		MeshvarBnd<real_t> *pf = f.get_grid(ilevel + 1); // grid one level finer than the mask level written below
+
+		for (int ix = pf->offset(0); ix < (int)(pf->offset(0) + pf->size(0) / 2); ++ix)
+			for (int iy = pf->offset(1); iy < (int)(pf->offset(1) + pf->size(1) / 2); ++iy)
+				for (int iz = pf->offset(2); iz < (int)(pf->offset(2) + pf->size(2) / 2); ++iz)
+					(*m_pmask->get_grid(ilevel))(ix, iy, iz) = true; // flags level-ilevel cells in [offset, offset + size/2) of the finer grid, per dimension
 	}
-		
 }
 
-
-template< class S, class O, typename T >
-void solver<S,O,T>::Jacobi( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
+template <class S, class O>
+void solver<S, O>::Jacobi(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 {
 	int
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-	
-	double 
-		c0 = -1.0/m_scheme.ccoeff(),
-		h2 = h*h; 
-	
-	MeshvarBnd<T> uold(*u);
-	
-	double alpha = 0.95, ialpha = 1.0-alpha;
-	
-	#pragma omp parallel for
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( uold, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
-	
-	
-	
-	
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
+
+	real_t
+			c0 = -1.0 / m_scheme.ccoeff(), // negative reciprocal of the scheme's ccoeff()
+			h2 = h * h; // h squared; scales the source term f in the update below
+
+	MeshvarBnd<real_t> uold(*u); // constructed from *u before the sweep; the update below reads only uold and f, never u
+
+	real_t alpha = 0.95, ialpha = 1.0 - alpha; // blend weights: alpha on the newly computed value, ialpha on the previous one
+
+#pragma omp parallel for
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(uold, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 }
 
-template< class S, class O, typename T >
-void solver<S,O,T>::SOR( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
+template <class S, class O>
+void solver<S, O>::SOR(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 {
 	int
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
 
-	double 
-		c0 = -1.0/m_scheme.ccoeff(),
-		h2 = h*h; 
-		
-	MeshvarBnd<T> uold(*u);
-	
-	double 
-		alpha = 1.2, 
-	//alpha = 2 / (1 + 4 * atan(1.0) / double(u->size(0)))-1.0,
-		ialpha = 1.0-alpha;
-	
-	//std::cerr << "omega_opt = " << alpha << std::endl;
-	
-	#pragma omp parallel for
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				if( (ix+iy+iz)%2==0 )
-					(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( uold, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
-	
-	
-	#pragma omp parallel for
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				if( (ix+iy+iz)%2!=0 )
-					(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( *u, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
-	
-	
-	
+	real_t
+			c0 = -1.0 / m_scheme.ccoeff(), // negative reciprocal of the scheme's ccoeff()
+			h2 = h * h; // h squared; scales the source term f in both passes
+
+	MeshvarBnd<real_t> uold(*u); // constructed from *u before either pass runs
+
+	real_t
+			alpha = 1.2, // weight on the newly computed value (greater than 1)
+			ialpha = 1.0 - alpha; // weight on the previous value; negative for alpha = 1.2
+
+#pragma omp parallel for
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				if ((ix + iy + iz) % 2 == 0) // pass 1: cells whose index sum is even; m_scheme.rhs is evaluated on uold
+					(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(uold, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
+
+#pragma omp parallel for
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				if ((ix + iy + iz) % 2 != 0) // pass 2: cells whose index sum is odd; m_scheme.rhs is evaluated on *u, which pass 1 already modified
+					(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(*u, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 }
 
-template< class S, class O, typename T >
-void solver<S,O,T>::GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f )
+template <class S, class O>
+void solver<S, O>::GaussSeidel(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
 {
-	int 
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-	
+	int
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
+
 	T
-		c0 = -1.0/m_scheme.ccoeff(),
-		h2 = h*h; 
-	
-	for( int color=0; color < 2; ++color )
-		#pragma omp parallel for
-		for( int ix=0; ix<nx; ++ix )
-			for( int iy=0; iy<ny; ++iy )
-				for( int iz=0; iz<nz; ++iz )
-					if( (ix+iy+iz)%2 == color )
-						(*u)(ix,iy,iz) = (m_scheme.rhs( *u, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
-	
+			c0 = -1.0 / m_scheme.ccoeff(),
+			h2 = h * h;
+
+	for (int color = 0; color < 2; ++color)
+#pragma omp parallel for
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
+				for (int iz = 0; iz < nz; ++iz)
+					if ((ix + iy + iz) % 2 == color)
+						(*u)(ix, iy, iz) = (m_scheme.rhs(*u, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
 }
 
-
-template< class S, class O, typename T >
-void solver<S,O,T>::twoGrid( unsigned ilevel )
+template <class S, class O>
+void solver<S, O>::twoGrid(unsigned ilevel)
 {
-	MeshvarBnd<T> *uf, *uc, *ff, *fc;
-	
-	T 
-		h = 1.0/(pow(2.0,ilevel)),
-		c0 = -1.0/m_scheme.ccoeff(),
-		h2 = h*h; 
-	
+	MeshvarBnd<real_t> *uf, *uc, *ff, *fc;
+
+	real_t
+			h = 1.0 / (pow(2.0, ilevel)),
+			c0 = -1.0 / m_scheme.ccoeff(),
+			h2 = h * h;
+
 	uf = m_pu->get_grid(ilevel);
-	ff = m_pf->get_grid(ilevel);	
-	
-	uc = m_pu->get_grid(ilevel-1);
-	fc = m_pf->get_grid(ilevel-1);	
-	
-	int 
-		nx = uf->size(0), 
-		ny = uf->size(1), 
-		nz = uf->size(2);
-	
-	if( m_bperiodic && ilevel <= m_ilevelmin)
-		make_periodic( uf );
-	else if(!m_bperiodic)
-		setBC( ilevel );
-	
+	ff = m_pf->get_grid(ilevel);
+
+	uc = m_pu->get_grid(ilevel - 1);
+	fc = m_pf->get_grid(ilevel - 1);
+
+	int
+			nx = uf->size(0),
+			ny = uf->size(1),
+			nz = uf->size(2);
+
+	if (m_bperiodic && ilevel <= m_ilevelmin)
+		make_periodic(uf);
+	else if (!m_bperiodic)
+		setBC(ilevel);
+
 	//... do smoothing sweeps with specified solver
-	for( unsigned i=0; i<m_npresmooth; ++i ){
-		
-		if( ilevel > m_ilevelmin )
-			interp_coarse_fine(ilevel, *uc, *uf );
-		
-		if( m_smoother == opt::sm_gauss_seidel )
-			GaussSeidel( h, uf, ff );
-			
-		else if( m_smoother == opt::sm_jacobi )
-			Jacobi( h, uf, ff);		
-			
-		else if( m_smoother == opt::sm_sor )
-			SOR( h, uf, ff );
-		
-		if( m_bperiodic && ilevel <= m_ilevelmin )
-			make_periodic( uf );
+	for (unsigned i = 0; i < m_npresmooth; ++i)
+	{
+
+		if (ilevel > m_ilevelmin)
+			interp_coarse_fine(ilevel, *uc, *uf);
+
+		if (m_smoother == opt::sm_gauss_seidel)
+			GaussSeidel(h, uf, ff);
+
+		else if (m_smoother == opt::sm_jacobi)
+			Jacobi(h, uf, ff);
+
+		else if (m_smoother == opt::sm_sor)
+			SOR(h, uf, ff);
+
+		if (m_bperiodic && ilevel <= m_ilevelmin)
+			make_periodic(uf);
 	}
-			
-	
-	m_gridop.restrict( *uf, *uc );
-	
+
+	m_gridop.restrict(*uf, *uc);
+
 	//... essential!!
-	if( m_bperiodic && ilevel <= m_ilevelmin )
-		make_periodic( uc );
-	else if( m_bperiodic )
-		interp_coarse_fine(ilevel,*uc,*uf);
-	
-	meshvar_bnd Lu(*uf,false);
+	if (m_bperiodic && ilevel <= m_ilevelmin)
+		make_periodic(uc);
+	else if (m_bperiodic)
+		interp_coarse_fine(ilevel, *uc, *uf);
+
+	meshvar_bnd Lu(*uf, false);
 	Lu.zero();
-	#pragma omp parallel for
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				Lu(ix,iy,iz) = m_scheme.apply( (*uf), ix, iy, iz )/h2;
-	
-	meshvar_bnd tLu(*uc,false);
-	
+#pragma omp parallel for
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				Lu(ix, iy, iz) = m_scheme.apply((*uf), ix, iy, iz) / h2;
+
+	meshvar_bnd tLu(*uc, false);
+
 	//... restrict Lu
-	m_gridop.restrict( Lu, tLu );
+	m_gridop.restrict(Lu, tLu);
 	Lu.deallocate();
-	
+
 	//... restrict source term
-	m_gridop.restrict( *ff, *fc );
-	
-	//... compute RHS tau-correction
-	#pragma omp parallel for schedule(dynamic)
-	for( int ix=0; ix<(int)uc->size(0); ++ix )
-		for( int iy=0; iy<(int)uc->size(1); ++iy )
-			for( int iz=0; iz<(int)uc->size(2); ++iz )
-				if( (*m_pmask->get_grid(ilevel-1))(ix,iy,iz) == true )
-					(*fc)(ix,iy,iz) += ((tLu( ix, iy, iz ) - (m_scheme.apply( *uc, ix, iy, iz )/(4.0*h2))));
-				
-					
+	m_gridop.restrict(*ff, *fc);
+
+//... compute RHS tau-correction: add (restricted fine-grid Lu, tLu) minus (operator applied to the coarse grid uc) to the coarse source fc
+#pragma omp parallel for schedule(dynamic)
+	for (int ix = 0; ix < (int)uc->size(0); ++ix)
+		for (int iy = 0; iy < (int)uc->size(1); ++iy)
+			for (int iz = 0; iz < (int)uc->size(2); ++iz)
+				if ((*m_pmask->get_grid(ilevel - 1))(ix, iy, iz) == true) // only cells flagged in the mask of level ilevel-1 are corrected
+					(*fc)(ix, iy, iz) += ((tLu(ix, iy, iz) - (m_scheme.apply(*uc, ix, iy, iz) / (4.0 * h2)))); // 4.0 * h2 is the value h2 would have for level ilevel-1, since h = 1/2^ilevel
+
 	tLu.deallocate();
-	
-	meshvar_bnd ucsave(*uc,true);
-						
+
+	meshvar_bnd ucsave(*uc, true);
+
 	//... have we reached the end of the recursion or do we need to go up one level?
-	if( ilevel == 1 )
-		if( m_bperiodic )
-			(*uc)(0,0,0) = 0.0;
-		else 
-			(*uc)(0,0,0) = (m_scheme.rhs( (*uc), 0, 0, 0 ) + 4.0 * h2 * (*fc)(0,0,0))*c0;
+	if (ilevel == 1)
+		if (m_bperiodic)
+			(*uc)(0, 0, 0) = 0.0;
+		else
+			(*uc)(0, 0, 0) = (m_scheme.rhs((*uc), 0, 0, 0) + 4.0 * h2 * (*fc)(0, 0, 0)) * c0;
 	else
-		twoGrid( ilevel-1 );
-	
-	meshvar_bnd cc(*uc,false);
-	
-	//... compute correction on coarse grid
-	#pragma omp parallel for
-	for( int ix=0; ix<(int)cc.size(0); ++ix )
-		for( int iy=0; iy<(int)cc.size(1); ++iy )
-			for( int iz=0; iz<(int)cc.size(2); ++iz )
-				cc(ix,iy,iz) = (*uc)(ix,iy,iz) - ucsave(ix,iy,iz);
-		
+		twoGrid(ilevel - 1);
+
+	meshvar_bnd cc(*uc, false);
+
+//... compute correction on coarse grid
+#pragma omp parallel for
+	for (int ix = 0; ix < (int)cc.size(0); ++ix)
+		for (int iy = 0; iy < (int)cc.size(1); ++iy)
+			for (int iz = 0; iz < (int)cc.size(2); ++iz)
+				cc(ix, iy, iz) = (*uc)(ix, iy, iz) - ucsave(ix, iy, iz);
+
 	ucsave.deallocate();
 
+	//... prolongate correction to fine grid
+	meshvar_bnd cf(*uf, false);
+	m_gridop.prolong(cc, cf);
 
-	//... prolongate correction to fine grid	
-	meshvar_bnd cf(*uf,false);
-	m_gridop.prolong( cc, cf );
-	
 	cc.deallocate();
-	
-	
-	#pragma omp parallel for 
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				(*uf)(ix,iy,iz) += cf(ix,iy,iz);
-	
+
+#pragma omp parallel for
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				(*uf)(ix, iy, iz) += cf(ix, iy, iz);
 
 	cf.deallocate();
-				
+
 	//... interpolate and apply coarse-fine boundary conditions on fine level
-	if( m_bperiodic && ilevel <= m_ilevelmin )
-		make_periodic( uf );
-	else if(!m_bperiodic)
-		setBC( ilevel );
-	
-	//if( ilevel > m_ilevelmin )
+	if (m_bperiodic && ilevel <= m_ilevelmin)
+		make_periodic(uf);
+	else if (!m_bperiodic)
+		setBC(ilevel);
+
+	// if( ilevel > m_ilevelmin )
 	//	interp_coarse_fine(ilevel, *uc, *uf );
 
 	//... do smoothing sweeps with specified solver
-	for( unsigned i=0; i<m_npostsmooth; ++i ){
-		
-		if( ilevel > m_ilevelmin )
-			interp_coarse_fine(ilevel, *uc, *uf );
+	for (unsigned i = 0; i < m_npostsmooth; ++i)
+	{
 
-		if( m_smoother == opt::sm_gauss_seidel )
-			GaussSeidel( h, uf, ff );
-		
-		else if( m_smoother == opt::sm_jacobi )
-			Jacobi( h, uf, ff);		
-		
-		else if( m_smoother == opt::sm_sor )
-			SOR( h, uf, ff );
-		
-		if( m_bperiodic && ilevel <= m_ilevelmin )
-			make_periodic( uf );
+		if (ilevel > m_ilevelmin)
+			interp_coarse_fine(ilevel, *uc, *uf);
 
+		if (m_smoother == opt::sm_gauss_seidel)
+			GaussSeidel(h, uf, ff);
+
+		else if (m_smoother == opt::sm_jacobi)
+			Jacobi(h, uf, ff);
+
+		else if (m_smoother == opt::sm_sor)
+			SOR(h, uf, ff);
+
+		if (m_bperiodic && ilevel <= m_ilevelmin)
+			make_periodic(uf);
 	}
 }
 
-template< class S, class O, typename T >
-double solver<S,O,T>::compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<T>& unew )
+template <class S, class O>
+double solver<S, O>::compute_error(const MeshvarBnd<real_t> &u, const MeshvarBnd<real_t> &unew)
 {
-	int 
-		nx = u.size(0), 
-		ny = u.size(1), 
-		nz = u.size(2);
-	
+	int
+			nx = u.size(0),
+			ny = u.size(1),
+			nz = u.size(2);
+
 	double err = 0.0;
 	unsigned count = 0;
-	
-#pragma omp parallel for reduction(+:err,count)
-	for( int ix=0; ix<nx; ++ix )
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-				if( fabs(unew(ix,iy,iz)) > 0.0 )//&& u(ix,iy,iz) != unew(ix,iy,iz) )
+
+#pragma omp parallel for reduction(+ \
+																	 : err, count)
+	for (int ix = 0; ix < nx; ++ix)
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+				if (fabs(unew(ix, iy, iz)) > 0.0) //&& u(ix,iy,iz) != unew(ix,iy,iz) )
 				{
-					err += fabs(1.0 - u(ix,iy,iz)/unew(ix,iy,iz));
+					err += fabs(1.0 - u(ix, iy, iz) / unew(ix, iy, iz));
 					++count;
 				}
-	
-	if( count != 0 )
+
+	if (count != 0)
 		err /= count;
-	
+
 	return err;
 }
 
-template< class S, class O, typename T >
-double solver<S,O,T>::compute_error( const GridHierarchy<T>& uh, const GridHierarchy<T>& uhnew, bool verbose )
+template <class S, class O>
+double solver<S, O>::compute_error(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &uhnew, bool verbose)
 {
 	double maxerr = 0.0;
-	
-	for( unsigned ilevel=uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel )
+
+	for (unsigned ilevel = uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel)
 	{
 		double err = 0.0;
-		err = compute_error( *uh.get_grid(ilevel), *uhnew.get_grid(ilevel) );
-		
-		if( verbose )
+		err = compute_error(*uh.get_grid(ilevel), *uhnew.get_grid(ilevel));
+
+		if (verbose)
 			std::cout << "    Level " << std::setw(6) << ilevel << ",   Error = " << err << std::endl;
-		maxerr = std::max(maxerr,err);
-		
+		maxerr = std::max(maxerr, err);
 	}
 	return maxerr;
 }
 
-template< class S, class O, typename T >
-double solver<S,O,T>::solve( GridHierarchy<T>& uh, double acc, double h, bool verbose )
+template <class S, class O> // parameter list must match the two-parameter class template solver<S, O>
+double solver<S, O>::solve(GridHierarchy<real_t> &uh, double acc, double h, bool verbose)
 {
 
 	double err;
-	
-	GridHierarchy<T> uhnew(uh);//, fsave(*m_pf);
+
+	GridHierarchy<real_t> uhnew(uh); //, fsave(*m_pf);
 	m_pu = &uh;
-	
-    unsigned niter = 0;
-	
+
+	unsigned niter = 0;
+
 	//... iterate ...//
 	while (true)
 	{
-		
-		
-		twoGrid( uh.levelmax() );
-		err = compute_error( *m_pu, uhnew, verbose );
+
+		twoGrid(uh.levelmax());
+		err = compute_error(*m_pu, uhnew, verbose);
 		++niter;
-		
-		if( verbose ){
+
+		if (verbose)
+		{
 			std::cout << "--> Step No. " << std::setw(3) << niter << ", Max Err = " << err << std::endl;
 			std::cout << "-------------------------------------------------------------\n";
 		}
-			
-		if( (niter > 1) && ((err < acc) || (niter > 20)) )
+
+		if ((niter > 1) && ((err < acc) || (niter > 20)))
 			break;
-		
+
 		uhnew = *m_pu;
 		//*m_pf = fsave;
-	}		
-	
-	if( err > acc )
+	}
+
+	if (err > acc)
 		std::cout << "Error : no convergence in Poisson solver" << std::endl;
-	else if( verbose )
+	else if (verbose)
 		std::cout << " - Converged in " << niter << " steps to req. acc. of " << acc << std::endl;
 
-	
-	//uh = uhnew;
+	// uh = uhnew;
 	//*m_pf = fsave;
 	return err;
 }
 
-inline double interp2( double x1, double x2, double x3, double f1, double f2, double f3, double x )
+inline double interp2(double x1, double x2, double x3, double f1, double f2, double f3, double x)
 {
-	double a,b,c;	
+	double a, b, c;
 	a = (x1 * f3 - x3 * f1 - x2 * f3 - x1 * f2 + x2 * f1 + x3 * f2) / (x1 * x3 * x3 - x2 * x3 * x3 + x2 * x1 * x1 - x3 * x1 * x1 + x3 * x2 * x2 - x1 * x2 * x2);
 	b = -(x1 * x1 * f3 - x1 * x1 * f2 - f1 * x3 * x3 + f2 * x3 * x3 - x2 * x2 * f3 + f1 * x2 * x2) / (x1 - x2) / (x1 * x2 - x1 * x3 + x3 * x3 - x2 * x3);
 	c = (x1 * x1 * x2 * f3 - x1 * x1 * x3 * f2 - x2 * x2 * x1 * f3 + f2 * x1 * x3 * x3 + x2 * x2 * x3 * f1 - f1 * x2 * x3 * x3) / (x1 - x2) / (x1 * x2 - x1 * x3 + x3 * x3 - x2 * x3);
-	
-	return a*x*x+b*x+c;
+
+	return a * x * x + b * x + c;
 }
 
-inline double interp2( double fleft, double fcenter, double fright, double x )
+inline double interp2(double fleft, double fcenter, double fright, double x)
 {
-	double a,b,c;
-	a = 0.5*(fleft+fright)-fcenter;
-	b = 0.5*(fright-fleft);
+	double a, b, c;
+	a = 0.5 * (fleft + fright) - fcenter; // with b and c below, a*x*x + b*x + c equals fleft at x = -1, fcenter at x = 0 and fright at x = 1
+	b = 0.5 * (fright - fleft);
 	c = fcenter;
-	
-	return a*x*x+b*x+c;
+
+	return a * x * x + b * x + c;
 }
 
-
-inline double interp2left( double fleft, double fcenter, double fright )
+inline double interp2left(double fleft, double fcenter, double fright)
 {
-	double a,b,c;
-	a = (6.0*fright-10.0*fcenter+4.0*fleft)/15.0;
-	b = (-4.0*fleft+9.0*fright-5.0*fcenter)/15.0;
+	double a, b, c;
+	a = (6.0 * fright - 10.0 * fcenter + 4.0 * fleft) / 15.0;
+	b = (-4.0 * fleft + 9.0 * fright - 5.0 * fcenter) / 15.0;
 	c = fcenter;
-	
-	return a-b+c;
+
+	return a - b + c;
 }
 
-inline double interp2right( double fleft, double fcenter, double fright )
+inline double interp2right(double fleft, double fcenter, double fright)
 {
-	double a,b,c;
-	a = (6.0*fleft-10.0*fcenter+4.0*fright)/15.0;
-	b = (4.0*fright-9.0*fleft+5.0*fcenter)/15.0;
+	double a, b, c;
+	a = (6.0 * fleft - 10.0 * fcenter + 4.0 * fright) / 15.0;
+	b = (4.0 * fright - 9.0 * fleft + 5.0 * fcenter) / 15.0;
 	c = fcenter;
-	
-	return a+b+c;
+
+	return a + b + c;
 }
 
-template< class S, class O, typename T >
-void solver<S,O,T>::interp_cubic( MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine, int i, int j, int k, int itop, int jtop, int ktop )
+template <class S, class O>
+void solver<S, O>::interp_cubic(MeshvarBnd<real_t> &coarse, MeshvarBnd<real_t> &fine, int i, int j, int k, int itop, int jtop, int ktop)
 {
-	MeshvarBnd<T> &u    = fine;
-	MeshvarBnd<T> &utop = coarse;
-	
+	MeshvarBnd<real_t> &u = fine;
+	MeshvarBnd<real_t> &utop = coarse;
+
 	/*
-	u(i+0,j+0,k+0) = ( -125.*utop(itop-2,jtop-2,ktop-2) +875.*utop(itop-2,jtop-2,ktop-1) +2625.*utop(itop-2,jtop-2,ktop) 
-					  -175.*utop(itop-2,jtop-2,ktop+1) +875.*utop(itop-2,jtop-1,ktop-2) -6125.*utop(itop-2,jtop-1,ktop-1) 
-					  -18375.*utop(itop-2,jtop-1,ktop) +1225.*utop(itop-2,jtop-1,ktop+1) +2625.*utop(itop-2,jtop,ktop-2) 
-					  -18375.*utop(itop-2,jtop,ktop-1) -55125.*utop(itop-2,jtop,ktop) +3675.*utop(itop-2,jtop,ktop+1) 
-					  -175.*utop(itop-2,jtop+1,ktop-2) +1225.*utop(itop-2,jtop+1,ktop-1) +3675.*utop(itop-2,jtop+1,ktop) 
-					  -245.*utop(itop-2,jtop+1,ktop+1) +875.*utop(itop-1,jtop-2,ktop-2) -6125.*utop(itop-1,jtop-2,ktop-1) 
-					  -18375.*utop(itop-1,jtop-2,ktop) +1225.*utop(itop-1,jtop-2,ktop+1) -6125.*utop(itop-1,jtop-1,ktop-2) 
-					  +42875.*utop(itop-1,jtop-1,ktop-1) +128625.*utop(itop-1,jtop-1,ktop) -8575.*utop(itop-1,jtop-1,ktop+1) 
-					  -18375.*utop(itop-1,jtop,ktop-2) +128625.*utop(itop-1,jtop,ktop-1) +385875.*utop(itop-1,jtop,ktop) 
-					  -25725.*utop(itop-1,jtop,ktop+1) +1225.*utop(itop-1,jtop+1,ktop-2) -8575.*utop(itop-1,jtop+1,ktop-1) 
-					  -25725.*utop(itop-1,jtop+1,ktop) +1715.*utop(itop-1,jtop+1,ktop+1) +2625.*utop(itop,jtop-2,ktop-2) 
-					  -18375.*utop(itop,jtop-2,ktop-1) -55125.*utop(itop,jtop-2,ktop) +3675.*utop(itop,jtop-2,ktop+1) 
-					  -18375.*utop(itop,jtop-1,ktop-2) +128625.*utop(itop,jtop-1,ktop-1) +385875.*utop(itop,jtop-1,ktop) 
-					  -25725.*utop(itop,jtop-1,ktop+1) -55125.*utop(itop,jtop,ktop-2) +385875.*utop(itop,jtop,ktop-1) 
-					  +1157625.*utop(itop,jtop,ktop) -77175.*utop(itop,jtop,ktop+1) +3675.*utop(itop,jtop+1,ktop-2) 
-					  -25725.*utop(itop,jtop+1,ktop-1) -77175.*utop(itop,jtop+1,ktop) +5145.*utop(itop,jtop+1,ktop+1) 
-					  -175.*utop(itop+1,jtop-2,ktop-2) +1225.*utop(itop+1,jtop-2,ktop-1) +3675.*utop(itop+1,jtop-2,ktop) 
-					  -245.*utop(itop+1,jtop-2,ktop+1) +1225.*utop(itop+1,jtop-1,ktop-2) -8575.*utop(itop+1,jtop-1,ktop-1) 
-					  -25725.*utop(itop+1,jtop-1,ktop) +1715.*utop(itop+1,jtop-1,ktop+1) +3675.*utop(itop+1,jtop,ktop-2) 
-					  -25725.*utop(itop+1,jtop,ktop-1) -77175.*utop(itop+1,jtop,ktop) +5145.*utop(itop+1,jtop,ktop+1) 
-					  -245.*utop(itop+1,jtop+1,ktop-2) +1715.*utop(itop+1,jtop+1,ktop-1) +5145.*utop(itop+1,jtop+1,ktop) 
-					  -343.*utop(itop+1,jtop+1,ktop+1) )/2097152.;
-	u(i+0,j+0,k+1) = ( -175.*utop(itop-2,jtop-2,ktop-1) +2625.*utop(itop-2,jtop-2,ktop) +875.*utop(itop-2,jtop-2,ktop+1) 
-					  -125.*utop(itop-2,jtop-2,ktop+2) +1225.*utop(itop-2,jtop-1,ktop-1) -18375.*utop(itop-2,jtop-1,ktop) 
-					  -6125.*utop(itop-2,jtop-1,ktop+1) +875.*utop(itop-2,jtop-1,ktop+2) +3675.*utop(itop-2,jtop,ktop-1) 
-					  -55125.*utop(itop-2,jtop,ktop) -18375.*utop(itop-2,jtop,ktop+1) +2625.*utop(itop-2,jtop,ktop+2) 
-					  -245.*utop(itop-2,jtop+1,ktop-1) +3675.*utop(itop-2,jtop+1,ktop) +1225.*utop(itop-2,jtop+1,ktop+1) 
-					  -175.*utop(itop-2,jtop+1,ktop+2) +1225.*utop(itop-1,jtop-2,ktop-1) -18375.*utop(itop-1,jtop-2,ktop) 
-					  -6125.*utop(itop-1,jtop-2,ktop+1) +875.*utop(itop-1,jtop-2,ktop+2) -8575.*utop(itop-1,jtop-1,ktop-1) 
-					  +128625.*utop(itop-1,jtop-1,ktop) +42875.*utop(itop-1,jtop-1,ktop+1) -6125.*utop(itop-1,jtop-1,ktop+2) 
-					  -25725.*utop(itop-1,jtop,ktop-1) +385875.*utop(itop-1,jtop,ktop) +128625.*utop(itop-1,jtop,ktop+1) 
-					  -18375.*utop(itop-1,jtop,ktop+2) +1715.*utop(itop-1,jtop+1,ktop-1) -25725.*utop(itop-1,jtop+1,ktop) 
-					  -8575.*utop(itop-1,jtop+1,ktop+1) +1225.*utop(itop-1,jtop+1,ktop+2) +3675.*utop(itop,jtop-2,ktop-1) 
-					  -55125.*utop(itop,jtop-2,ktop) -18375.*utop(itop,jtop-2,ktop+1) +2625.*utop(itop,jtop-2,ktop+2) 
-					  -25725.*utop(itop,jtop-1,ktop-1) +385875.*utop(itop,jtop-1,ktop) +128625.*utop(itop,jtop-1,ktop+1) 
-					  -18375.*utop(itop,jtop-1,ktop+2) -77175.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop) 
-					  +385875.*utop(itop,jtop,ktop+1) -55125.*utop(itop,jtop,ktop+2) +5145.*utop(itop,jtop+1,ktop-1) 
-					  -77175.*utop(itop,jtop+1,ktop) -25725.*utop(itop,jtop+1,ktop+1) +3675.*utop(itop,jtop+1,ktop+2) 
-					  -245.*utop(itop+1,jtop-2,ktop-1) +3675.*utop(itop+1,jtop-2,ktop) +1225.*utop(itop+1,jtop-2,ktop+1) 
-					  -175.*utop(itop+1,jtop-2,ktop+2) +1715.*utop(itop+1,jtop-1,ktop-1) -25725.*utop(itop+1,jtop-1,ktop) 
-					  -8575.*utop(itop+1,jtop-1,ktop+1) +1225.*utop(itop+1,jtop-1,ktop+2) +5145.*utop(itop+1,jtop,ktop-1) 
-					  -77175.*utop(itop+1,jtop,ktop) -25725.*utop(itop+1,jtop,ktop+1) +3675.*utop(itop+1,jtop,ktop+2) 
-					  -343.*utop(itop+1,jtop+1,ktop-1) +5145.*utop(itop+1,jtop+1,ktop) +1715.*utop(itop+1,jtop+1,ktop+1) 
-					  -245.*utop(itop+1,jtop+1,ktop+2) )/2097152.;
-	u(i+0,j+1,k+0) = ( -175.*utop(itop-2,jtop-1,ktop-2) +1225.*utop(itop-2,jtop-1,ktop-1) +3675.*utop(itop-2,jtop-1,ktop) 
-					  -245.*utop(itop-2,jtop-1,ktop+1) +2625.*utop(itop-2,jtop,ktop-2) -18375.*utop(itop-2,jtop,ktop-1) 
-					  -55125.*utop(itop-2,jtop,ktop) +3675.*utop(itop-2,jtop,ktop+1) +875.*utop(itop-2,jtop+1,ktop-2) 
-					  -6125.*utop(itop-2,jtop+1,ktop-1) -18375.*utop(itop-2,jtop+1,ktop) +1225.*utop(itop-2,jtop+1,ktop+1) 
-					  -125.*utop(itop-2,jtop+2,ktop-2) +875.*utop(itop-2,jtop+2,ktop-1) +2625.*utop(itop-2,jtop+2,ktop) 
-					  -175.*utop(itop-2,jtop+2,ktop+1) +1225.*utop(itop-1,jtop-1,ktop-2) -8575.*utop(itop-1,jtop-1,ktop-1) 
-					  -25725.*utop(itop-1,jtop-1,ktop) +1715.*utop(itop-1,jtop-1,ktop+1) -18375.*utop(itop-1,jtop,ktop-2) 
-					  +128625.*utop(itop-1,jtop,ktop-1) +385875.*utop(itop-1,jtop,ktop) -25725.*utop(itop-1,jtop,ktop+1) 
-					  -6125.*utop(itop-1,jtop+1,ktop-2) +42875.*utop(itop-1,jtop+1,ktop-1) +128625.*utop(itop-1,jtop+1,ktop) 
-					  -8575.*utop(itop-1,jtop+1,ktop+1) +875.*utop(itop-1,jtop+2,ktop-2) -6125.*utop(itop-1,jtop+2,ktop-1) 
-					  -18375.*utop(itop-1,jtop+2,ktop) +1225.*utop(itop-1,jtop+2,ktop+1) +3675.*utop(itop,jtop-1,ktop-2) 
-					  -25725.*utop(itop,jtop-1,ktop-1) -77175.*utop(itop,jtop-1,ktop) +5145.*utop(itop,jtop-1,ktop+1) 
-					  -55125.*utop(itop,jtop,ktop-2) +385875.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop) 
-					  -77175.*utop(itop,jtop,ktop+1) -18375.*utop(itop,jtop+1,ktop-2) +128625.*utop(itop,jtop+1,ktop-1) 
-					  +385875.*utop(itop,jtop+1,ktop) -25725.*utop(itop,jtop+1,ktop+1) +2625.*utop(itop,jtop+2,ktop-2) 
-					  -18375.*utop(itop,jtop+2,ktop-1) -55125.*utop(itop,jtop+2,ktop) +3675.*utop(itop,jtop+2,ktop+1) 
-					  -245.*utop(itop+1,jtop-1,ktop-2) +1715.*utop(itop+1,jtop-1,ktop-1) +5145.*utop(itop+1,jtop-1,ktop) 
-					  -343.*utop(itop+1,jtop-1,ktop+1) +3675.*utop(itop+1,jtop,ktop-2) -25725.*utop(itop+1,jtop,ktop-1) 
-					  -77175.*utop(itop+1,jtop,ktop) +5145.*utop(itop+1,jtop,ktop+1) +1225.*utop(itop+1,jtop+1,ktop-2) 
-					  -8575.*utop(itop+1,jtop+1,ktop-1) -25725.*utop(itop+1,jtop+1,ktop) +1715.*utop(itop+1,jtop+1,ktop+1) 
-					  -175.*utop(itop+1,jtop+2,ktop-2) +1225.*utop(itop+1,jtop+2,ktop-1) +3675.*utop(itop+1,jtop+2,ktop) 
-					  -245.*utop(itop+1,jtop+2,ktop+1) )/2097152.;
-	u(i+0,j+1,k+1) = ( -245.*utop(itop-2,jtop-1,ktop-1) +3675.*utop(itop-2,jtop-1,ktop) +1225.*utop(itop-2,jtop-1,ktop+1) 
-					  -175.*utop(itop-2,jtop-1,ktop+2) +3675.*utop(itop-2,jtop,ktop-1) -55125.*utop(itop-2,jtop,ktop) 
-					  -18375.*utop(itop-2,jtop,ktop+1) +2625.*utop(itop-2,jtop,ktop+2) +1225.*utop(itop-2,jtop+1,ktop-1) 
-					  -18375.*utop(itop-2,jtop+1,ktop) -6125.*utop(itop-2,jtop+1,ktop+1) +875.*utop(itop-2,jtop+1,ktop+2) 
-					  -175.*utop(itop-2,jtop+2,ktop-1) +2625.*utop(itop-2,jtop+2,ktop) +875.*utop(itop-2,jtop+2,ktop+1) 
-					  -125.*utop(itop-2,jtop+2,ktop+2) +1715.*utop(itop-1,jtop-1,ktop-1) -25725.*utop(itop-1,jtop-1,ktop) 
-					  -8575.*utop(itop-1,jtop-1,ktop+1) +1225.*utop(itop-1,jtop-1,ktop+2) -25725.*utop(itop-1,jtop,ktop-1) 
-					  +385875.*utop(itop-1,jtop,ktop) +128625.*utop(itop-1,jtop,ktop+1) -18375.*utop(itop-1,jtop,ktop+2) 
-					  -8575.*utop(itop-1,jtop+1,ktop-1) +128625.*utop(itop-1,jtop+1,ktop) +42875.*utop(itop-1,jtop+1,ktop+1) 
-					  -6125.*utop(itop-1,jtop+1,ktop+2) +1225.*utop(itop-1,jtop+2,ktop-1) -18375.*utop(itop-1,jtop+2,ktop) 
-					  -6125.*utop(itop-1,jtop+2,ktop+1) +875.*utop(itop-1,jtop+2,ktop+2) +5145.*utop(itop,jtop-1,ktop-1) 
-					  -77175.*utop(itop,jtop-1,ktop) -25725.*utop(itop,jtop-1,ktop+1) +3675.*utop(itop,jtop-1,ktop+2) 
-					  -77175.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop) +385875.*utop(itop,jtop,ktop+1) 
-					  -55125.*utop(itop,jtop,ktop+2) -25725.*utop(itop,jtop+1,ktop-1) +385875.*utop(itop,jtop+1,ktop) 
-					  +128625.*utop(itop,jtop+1,ktop+1) -18375.*utop(itop,jtop+1,ktop+2) +3675.*utop(itop,jtop+2,ktop-1) 
-					  -55125.*utop(itop,jtop+2,ktop) -18375.*utop(itop,jtop+2,ktop+1) +2625.*utop(itop,jtop+2,ktop+2) 
-					  -343.*utop(itop+1,jtop-1,ktop-1) +5145.*utop(itop+1,jtop-1,ktop) +1715.*utop(itop+1,jtop-1,ktop+1) 
-					  -245.*utop(itop+1,jtop-1,ktop+2) +5145.*utop(itop+1,jtop,ktop-1) -77175.*utop(itop+1,jtop,ktop) 
-					  -25725.*utop(itop+1,jtop,ktop+1) +3675.*utop(itop+1,jtop,ktop+2) +1715.*utop(itop+1,jtop+1,ktop-1) 
-					  -25725.*utop(itop+1,jtop+1,ktop) -8575.*utop(itop+1,jtop+1,ktop+1) +1225.*utop(itop+1,jtop+1,ktop+2) 
-					  -245.*utop(itop+1,jtop+2,ktop-1) +3675.*utop(itop+1,jtop+2,ktop) +1225.*utop(itop+1,jtop+2,ktop+1) 
-					  -175.*utop(itop+1,jtop+2,ktop+2) )/2097152.;
-	u(i+1,j+0,k+0) = ( -175.*utop(itop-1,jtop-2,ktop-2) +1225.*utop(itop-1,jtop-2,ktop-1) +3675.*utop(itop-1,jtop-2,ktop) 
-					  -245.*utop(itop-1,jtop-2,ktop+1) +1225.*utop(itop-1,jtop-1,ktop-2) -8575.*utop(itop-1,jtop-1,ktop-1) 
-					  -25725.*utop(itop-1,jtop-1,ktop) +1715.*utop(itop-1,jtop-1,ktop+1) +3675.*utop(itop-1,jtop,ktop-2) 
-					  -25725.*utop(itop-1,jtop,ktop-1) -77175.*utop(itop-1,jtop,ktop) +5145.*utop(itop-1,jtop,ktop+1) 
-					  -245.*utop(itop-1,jtop+1,ktop-2) +1715.*utop(itop-1,jtop+1,ktop-1) +5145.*utop(itop-1,jtop+1,ktop) 
-					  -343.*utop(itop-1,jtop+1,ktop+1) +2625.*utop(itop,jtop-2,ktop-2) -18375.*utop(itop,jtop-2,ktop-1) 
-					  -55125.*utop(itop,jtop-2,ktop) +3675.*utop(itop,jtop-2,ktop+1) -18375.*utop(itop,jtop-1,ktop-2) 
-					  +128625.*utop(itop,jtop-1,ktop-1) +385875.*utop(itop,jtop-1,ktop) -25725.*utop(itop,jtop-1,ktop+1) 
-					  -55125.*utop(itop,jtop,ktop-2) +385875.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop) 
-					  -77175.*utop(itop,jtop,ktop+1) +3675.*utop(itop,jtop+1,ktop-2) -25725.*utop(itop,jtop+1,ktop-1) 
-					  -77175.*utop(itop,jtop+1,ktop) +5145.*utop(itop,jtop+1,ktop+1) +875.*utop(itop+1,jtop-2,ktop-2) 
-					  -6125.*utop(itop+1,jtop-2,ktop-1) -18375.*utop(itop+1,jtop-2,ktop) +1225.*utop(itop+1,jtop-2,ktop+1) 
-					  -6125.*utop(itop+1,jtop-1,ktop-2) +42875.*utop(itop+1,jtop-1,ktop-1) +128625.*utop(itop+1,jtop-1,ktop) 
-					  -8575.*utop(itop+1,jtop-1,ktop+1) -18375.*utop(itop+1,jtop,ktop-2) +128625.*utop(itop+1,jtop,ktop-1) 
-					  +385875.*utop(itop+1,jtop,ktop) -25725.*utop(itop+1,jtop,ktop+1) +1225.*utop(itop+1,jtop+1,ktop-2) 
-					  -8575.*utop(itop+1,jtop+1,ktop-1) -25725.*utop(itop+1,jtop+1,ktop) +1715.*utop(itop+1,jtop+1,ktop+1) 
-					  -125.*utop(itop+2,jtop-2,ktop-2) +875.*utop(itop+2,jtop-2,ktop-1) +2625.*utop(itop+2,jtop-2,ktop) 
-					  -175.*utop(itop+2,jtop-2,ktop+1) +875.*utop(itop+2,jtop-1,ktop-2) -6125.*utop(itop+2,jtop-1,ktop-1) 
-					  -18375.*utop(itop+2,jtop-1,ktop) +1225.*utop(itop+2,jtop-1,ktop+1) +2625.*utop(itop+2,jtop,ktop-2) 
-					  -18375.*utop(itop+2,jtop,ktop-1) -55125.*utop(itop+2,jtop,ktop) +3675.*utop(itop+2,jtop,ktop+1) 
-					  -175.*utop(itop+2,jtop+1,ktop-2) +1225.*utop(itop+2,jtop+1,ktop-1) +3675.*utop(itop+2,jtop+1,ktop) 
-					  -245.*utop(itop+2,jtop+1,ktop+1) )/2097152.;
+	u(i+0,j+0,k+0) = ( -125.*utop(itop-2,jtop-2,ktop-2) +875.*utop(itop-2,jtop-2,ktop-1) +2625.*utop(itop-2,jtop-2,ktop)
+						-175.*utop(itop-2,jtop-2,ktop+1) +875.*utop(itop-2,jtop-1,ktop-2) -6125.*utop(itop-2,jtop-1,ktop-1)
+						-18375.*utop(itop-2,jtop-1,ktop) +1225.*utop(itop-2,jtop-1,ktop+1) +2625.*utop(itop-2,jtop,ktop-2)
+						-18375.*utop(itop-2,jtop,ktop-1) -55125.*utop(itop-2,jtop,ktop) +3675.*utop(itop-2,jtop,ktop+1)
+						-175.*utop(itop-2,jtop+1,ktop-2) +1225.*utop(itop-2,jtop+1,ktop-1) +3675.*utop(itop-2,jtop+1,ktop)
+						-245.*utop(itop-2,jtop+1,ktop+1) +875.*utop(itop-1,jtop-2,ktop-2) -6125.*utop(itop-1,jtop-2,ktop-1)
+						-18375.*utop(itop-1,jtop-2,ktop) +1225.*utop(itop-1,jtop-2,ktop+1) -6125.*utop(itop-1,jtop-1,ktop-2)
+						+42875.*utop(itop-1,jtop-1,ktop-1) +128625.*utop(itop-1,jtop-1,ktop) -8575.*utop(itop-1,jtop-1,ktop+1)
+						-18375.*utop(itop-1,jtop,ktop-2) +128625.*utop(itop-1,jtop,ktop-1) +385875.*utop(itop-1,jtop,ktop)
+						-25725.*utop(itop-1,jtop,ktop+1) +1225.*utop(itop-1,jtop+1,ktop-2) -8575.*utop(itop-1,jtop+1,ktop-1)
+						-25725.*utop(itop-1,jtop+1,ktop) +1715.*utop(itop-1,jtop+1,ktop+1) +2625.*utop(itop,jtop-2,ktop-2)
+						-18375.*utop(itop,jtop-2,ktop-1) -55125.*utop(itop,jtop-2,ktop) +3675.*utop(itop,jtop-2,ktop+1)
+						-18375.*utop(itop,jtop-1,ktop-2) +128625.*utop(itop,jtop-1,ktop-1) +385875.*utop(itop,jtop-1,ktop)
+						-25725.*utop(itop,jtop-1,ktop+1) -55125.*utop(itop,jtop,ktop-2) +385875.*utop(itop,jtop,ktop-1)
+						+1157625.*utop(itop,jtop,ktop) -77175.*utop(itop,jtop,ktop+1) +3675.*utop(itop,jtop+1,ktop-2)
+						-25725.*utop(itop,jtop+1,ktop-1) -77175.*utop(itop,jtop+1,ktop) +5145.*utop(itop,jtop+1,ktop+1)
+						-175.*utop(itop+1,jtop-2,ktop-2) +1225.*utop(itop+1,jtop-2,ktop-1) +3675.*utop(itop+1,jtop-2,ktop)
+						-245.*utop(itop+1,jtop-2,ktop+1) +1225.*utop(itop+1,jtop-1,ktop-2) -8575.*utop(itop+1,jtop-1,ktop-1)
+						-25725.*utop(itop+1,jtop-1,ktop) +1715.*utop(itop+1,jtop-1,ktop+1) +3675.*utop(itop+1,jtop,ktop-2)
+						-25725.*utop(itop+1,jtop,ktop-1) -77175.*utop(itop+1,jtop,ktop) +5145.*utop(itop+1,jtop,ktop+1)
+						-245.*utop(itop+1,jtop+1,ktop-2) +1715.*utop(itop+1,jtop+1,ktop-1) +5145.*utop(itop+1,jtop+1,ktop)
+						-343.*utop(itop+1,jtop+1,ktop+1) )/2097152.;
+	u(i+0,j+0,k+1) = ( -175.*utop(itop-2,jtop-2,ktop-1) +2625.*utop(itop-2,jtop-2,ktop) +875.*utop(itop-2,jtop-2,ktop+1)
+						-125.*utop(itop-2,jtop-2,ktop+2) +1225.*utop(itop-2,jtop-1,ktop-1) -18375.*utop(itop-2,jtop-1,ktop)
+						-6125.*utop(itop-2,jtop-1,ktop+1) +875.*utop(itop-2,jtop-1,ktop+2) +3675.*utop(itop-2,jtop,ktop-1)
+						-55125.*utop(itop-2,jtop,ktop) -18375.*utop(itop-2,jtop,ktop+1) +2625.*utop(itop-2,jtop,ktop+2)
+						-245.*utop(itop-2,jtop+1,ktop-1) +3675.*utop(itop-2,jtop+1,ktop) +1225.*utop(itop-2,jtop+1,ktop+1)
+						-175.*utop(itop-2,jtop+1,ktop+2) +1225.*utop(itop-1,jtop-2,ktop-1) -18375.*utop(itop-1,jtop-2,ktop)
+						-6125.*utop(itop-1,jtop-2,ktop+1) +875.*utop(itop-1,jtop-2,ktop+2) -8575.*utop(itop-1,jtop-1,ktop-1)
+						+128625.*utop(itop-1,jtop-1,ktop) +42875.*utop(itop-1,jtop-1,ktop+1) -6125.*utop(itop-1,jtop-1,ktop+2)
+						-25725.*utop(itop-1,jtop,ktop-1) +385875.*utop(itop-1,jtop,ktop) +128625.*utop(itop-1,jtop,ktop+1)
+						-18375.*utop(itop-1,jtop,ktop+2) +1715.*utop(itop-1,jtop+1,ktop-1) -25725.*utop(itop-1,jtop+1,ktop)
+						-8575.*utop(itop-1,jtop+1,ktop+1) +1225.*utop(itop-1,jtop+1,ktop+2) +3675.*utop(itop,jtop-2,ktop-1)
+						-55125.*utop(itop,jtop-2,ktop) -18375.*utop(itop,jtop-2,ktop+1) +2625.*utop(itop,jtop-2,ktop+2)
+						-25725.*utop(itop,jtop-1,ktop-1) +385875.*utop(itop,jtop-1,ktop) +128625.*utop(itop,jtop-1,ktop+1)
+						-18375.*utop(itop,jtop-1,ktop+2) -77175.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop)
+						+385875.*utop(itop,jtop,ktop+1) -55125.*utop(itop,jtop,ktop+2) +5145.*utop(itop,jtop+1,ktop-1)
+						-77175.*utop(itop,jtop+1,ktop) -25725.*utop(itop,jtop+1,ktop+1) +3675.*utop(itop,jtop+1,ktop+2)
+						-245.*utop(itop+1,jtop-2,ktop-1) +3675.*utop(itop+1,jtop-2,ktop) +1225.*utop(itop+1,jtop-2,ktop+1)
+						-175.*utop(itop+1,jtop-2,ktop+2) +1715.*utop(itop+1,jtop-1,ktop-1) -25725.*utop(itop+1,jtop-1,ktop)
+						-8575.*utop(itop+1,jtop-1,ktop+1) +1225.*utop(itop+1,jtop-1,ktop+2) +5145.*utop(itop+1,jtop,ktop-1)
+						-77175.*utop(itop+1,jtop,ktop) -25725.*utop(itop+1,jtop,ktop+1) +3675.*utop(itop+1,jtop,ktop+2)
+						-343.*utop(itop+1,jtop+1,ktop-1) +5145.*utop(itop+1,jtop+1,ktop) +1715.*utop(itop+1,jtop+1,ktop+1)
+						-245.*utop(itop+1,jtop+1,ktop+2) )/2097152.;
+	u(i+0,j+1,k+0) = ( -175.*utop(itop-2,jtop-1,ktop-2) +1225.*utop(itop-2,jtop-1,ktop-1) +3675.*utop(itop-2,jtop-1,ktop)
+						-245.*utop(itop-2,jtop-1,ktop+1) +2625.*utop(itop-2,jtop,ktop-2) -18375.*utop(itop-2,jtop,ktop-1)
+						-55125.*utop(itop-2,jtop,ktop) +3675.*utop(itop-2,jtop,ktop+1) +875.*utop(itop-2,jtop+1,ktop-2)
+						-6125.*utop(itop-2,jtop+1,ktop-1) -18375.*utop(itop-2,jtop+1,ktop) +1225.*utop(itop-2,jtop+1,ktop+1)
+						-125.*utop(itop-2,jtop+2,ktop-2) +875.*utop(itop-2,jtop+2,ktop-1) +2625.*utop(itop-2,jtop+2,ktop)
+						-175.*utop(itop-2,jtop+2,ktop+1) +1225.*utop(itop-1,jtop-1,ktop-2) -8575.*utop(itop-1,jtop-1,ktop-1)
+						-25725.*utop(itop-1,jtop-1,ktop) +1715.*utop(itop-1,jtop-1,ktop+1) -18375.*utop(itop-1,jtop,ktop-2)
+						+128625.*utop(itop-1,jtop,ktop-1) +385875.*utop(itop-1,jtop,ktop) -25725.*utop(itop-1,jtop,ktop+1)
+						-6125.*utop(itop-1,jtop+1,ktop-2) +42875.*utop(itop-1,jtop+1,ktop-1) +128625.*utop(itop-1,jtop+1,ktop)
+						-8575.*utop(itop-1,jtop+1,ktop+1) +875.*utop(itop-1,jtop+2,ktop-2) -6125.*utop(itop-1,jtop+2,ktop-1)
+						-18375.*utop(itop-1,jtop+2,ktop) +1225.*utop(itop-1,jtop+2,ktop+1) +3675.*utop(itop,jtop-1,ktop-2)
+						-25725.*utop(itop,jtop-1,ktop-1) -77175.*utop(itop,jtop-1,ktop) +5145.*utop(itop,jtop-1,ktop+1)
+						-55125.*utop(itop,jtop,ktop-2) +385875.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop)
+						-77175.*utop(itop,jtop,ktop+1) -18375.*utop(itop,jtop+1,ktop-2) +128625.*utop(itop,jtop+1,ktop-1)
+						+385875.*utop(itop,jtop+1,ktop) -25725.*utop(itop,jtop+1,ktop+1) +2625.*utop(itop,jtop+2,ktop-2)
+						-18375.*utop(itop,jtop+2,ktop-1) -55125.*utop(itop,jtop+2,ktop) +3675.*utop(itop,jtop+2,ktop+1)
+						-245.*utop(itop+1,jtop-1,ktop-2) +1715.*utop(itop+1,jtop-1,ktop-1) +5145.*utop(itop+1,jtop-1,ktop)
+						-343.*utop(itop+1,jtop-1,ktop+1) +3675.*utop(itop+1,jtop,ktop-2) -25725.*utop(itop+1,jtop,ktop-1)
+						-77175.*utop(itop+1,jtop,ktop) +5145.*utop(itop+1,jtop,ktop+1) +1225.*utop(itop+1,jtop+1,ktop-2)
+						-8575.*utop(itop+1,jtop+1,ktop-1) -25725.*utop(itop+1,jtop+1,ktop) +1715.*utop(itop+1,jtop+1,ktop+1)
+						-175.*utop(itop+1,jtop+2,ktop-2) +1225.*utop(itop+1,jtop+2,ktop-1) +3675.*utop(itop+1,jtop+2,ktop)
+						-245.*utop(itop+1,jtop+2,ktop+1) )/2097152.;
+	u(i+0,j+1,k+1) = ( -245.*utop(itop-2,jtop-1,ktop-1) +3675.*utop(itop-2,jtop-1,ktop) +1225.*utop(itop-2,jtop-1,ktop+1)
+						-175.*utop(itop-2,jtop-1,ktop+2) +3675.*utop(itop-2,jtop,ktop-1) -55125.*utop(itop-2,jtop,ktop)
+						-18375.*utop(itop-2,jtop,ktop+1) +2625.*utop(itop-2,jtop,ktop+2) +1225.*utop(itop-2,jtop+1,ktop-1)
+						-18375.*utop(itop-2,jtop+1,ktop) -6125.*utop(itop-2,jtop+1,ktop+1) +875.*utop(itop-2,jtop+1,ktop+2)
+						-175.*utop(itop-2,jtop+2,ktop-1) +2625.*utop(itop-2,jtop+2,ktop) +875.*utop(itop-2,jtop+2,ktop+1)
+						-125.*utop(itop-2,jtop+2,ktop+2) +1715.*utop(itop-1,jtop-1,ktop-1) -25725.*utop(itop-1,jtop-1,ktop)
+						-8575.*utop(itop-1,jtop-1,ktop+1) +1225.*utop(itop-1,jtop-1,ktop+2) -25725.*utop(itop-1,jtop,ktop-1)
+						+385875.*utop(itop-1,jtop,ktop) +128625.*utop(itop-1,jtop,ktop+1) -18375.*utop(itop-1,jtop,ktop+2)
+						-8575.*utop(itop-1,jtop+1,ktop-1) +128625.*utop(itop-1,jtop+1,ktop) +42875.*utop(itop-1,jtop+1,ktop+1)
+						-6125.*utop(itop-1,jtop+1,ktop+2) +1225.*utop(itop-1,jtop+2,ktop-1) -18375.*utop(itop-1,jtop+2,ktop)
+						-6125.*utop(itop-1,jtop+2,ktop+1) +875.*utop(itop-1,jtop+2,ktop+2) +5145.*utop(itop,jtop-1,ktop-1)
+						-77175.*utop(itop,jtop-1,ktop) -25725.*utop(itop,jtop-1,ktop+1) +3675.*utop(itop,jtop-1,ktop+2)
+						-77175.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop) +385875.*utop(itop,jtop,ktop+1)
+						-55125.*utop(itop,jtop,ktop+2) -25725.*utop(itop,jtop+1,ktop-1) +385875.*utop(itop,jtop+1,ktop)
+						+128625.*utop(itop,jtop+1,ktop+1) -18375.*utop(itop,jtop+1,ktop+2) +3675.*utop(itop,jtop+2,ktop-1)
+						-55125.*utop(itop,jtop+2,ktop) -18375.*utop(itop,jtop+2,ktop+1) +2625.*utop(itop,jtop+2,ktop+2)
+						-343.*utop(itop+1,jtop-1,ktop-1) +5145.*utop(itop+1,jtop-1,ktop) +1715.*utop(itop+1,jtop-1,ktop+1)
+						-245.*utop(itop+1,jtop-1,ktop+2) +5145.*utop(itop+1,jtop,ktop-1) -77175.*utop(itop+1,jtop,ktop)
+						-25725.*utop(itop+1,jtop,ktop+1) +3675.*utop(itop+1,jtop,ktop+2) +1715.*utop(itop+1,jtop+1,ktop-1)
+						-25725.*utop(itop+1,jtop+1,ktop) -8575.*utop(itop+1,jtop+1,ktop+1) +1225.*utop(itop+1,jtop+1,ktop+2)
+						-245.*utop(itop+1,jtop+2,ktop-1) +3675.*utop(itop+1,jtop+2,ktop) +1225.*utop(itop+1,jtop+2,ktop+1)
+						-175.*utop(itop+1,jtop+2,ktop+2) )/2097152.;
+	u(i+1,j+0,k+0) = ( -175.*utop(itop-1,jtop-2,ktop-2) +1225.*utop(itop-1,jtop-2,ktop-1) +3675.*utop(itop-1,jtop-2,ktop)
+						-245.*utop(itop-1,jtop-2,ktop+1) +1225.*utop(itop-1,jtop-1,ktop-2) -8575.*utop(itop-1,jtop-1,ktop-1)
+						-25725.*utop(itop-1,jtop-1,ktop) +1715.*utop(itop-1,jtop-1,ktop+1) +3675.*utop(itop-1,jtop,ktop-2)
+						-25725.*utop(itop-1,jtop,ktop-1) -77175.*utop(itop-1,jtop,ktop) +5145.*utop(itop-1,jtop,ktop+1)
+						-245.*utop(itop-1,jtop+1,ktop-2) +1715.*utop(itop-1,jtop+1,ktop-1) +5145.*utop(itop-1,jtop+1,ktop)
+						-343.*utop(itop-1,jtop+1,ktop+1) +2625.*utop(itop,jtop-2,ktop-2) -18375.*utop(itop,jtop-2,ktop-1)
+						-55125.*utop(itop,jtop-2,ktop) +3675.*utop(itop,jtop-2,ktop+1) -18375.*utop(itop,jtop-1,ktop-2)
+						+128625.*utop(itop,jtop-1,ktop-1) +385875.*utop(itop,jtop-1,ktop) -25725.*utop(itop,jtop-1,ktop+1)
+						-55125.*utop(itop,jtop,ktop-2) +385875.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop)
+						-77175.*utop(itop,jtop,ktop+1) +3675.*utop(itop,jtop+1,ktop-2) -25725.*utop(itop,jtop+1,ktop-1)
+						-77175.*utop(itop,jtop+1,ktop) +5145.*utop(itop,jtop+1,ktop+1) +875.*utop(itop+1,jtop-2,ktop-2)
+						-6125.*utop(itop+1,jtop-2,ktop-1) -18375.*utop(itop+1,jtop-2,ktop) +1225.*utop(itop+1,jtop-2,ktop+1)
+						-6125.*utop(itop+1,jtop-1,ktop-2) +42875.*utop(itop+1,jtop-1,ktop-1) +128625.*utop(itop+1,jtop-1,ktop)
+						-8575.*utop(itop+1,jtop-1,ktop+1) -18375.*utop(itop+1,jtop,ktop-2) +128625.*utop(itop+1,jtop,ktop-1)
+						+385875.*utop(itop+1,jtop,ktop) -25725.*utop(itop+1,jtop,ktop+1) +1225.*utop(itop+1,jtop+1,ktop-2)
+						-8575.*utop(itop+1,jtop+1,ktop-1) -25725.*utop(itop+1,jtop+1,ktop) +1715.*utop(itop+1,jtop+1,ktop+1)
+						-125.*utop(itop+2,jtop-2,ktop-2) +875.*utop(itop+2,jtop-2,ktop-1) +2625.*utop(itop+2,jtop-2,ktop)
+						-175.*utop(itop+2,jtop-2,ktop+1) +875.*utop(itop+2,jtop-1,ktop-2) -6125.*utop(itop+2,jtop-1,ktop-1)
+						-18375.*utop(itop+2,jtop-1,ktop) +1225.*utop(itop+2,jtop-1,ktop+1) +2625.*utop(itop+2,jtop,ktop-2)
+						-18375.*utop(itop+2,jtop,ktop-1) -55125.*utop(itop+2,jtop,ktop) +3675.*utop(itop+2,jtop,ktop+1)
+						-175.*utop(itop+2,jtop+1,ktop-2) +1225.*utop(itop+2,jtop+1,ktop-1) +3675.*utop(itop+2,jtop+1,ktop)
+						-245.*utop(itop+2,jtop+1,ktop+1) )/2097152.;
 	u(i+1,j+0,k+1) = ( -245.*utop(itop-1,jtop-2,ktop-1) +3675.*utop(itop-1,jtop-2,ktop) +1225.*utop(itop-1,jtop-2,ktop+1) -175.*utop(itop-1,jtop-2,ktop+2) +1715.*utop(itop-1,jtop-1,ktop-1) -25725.*utop(itop-1,jtop-1,ktop) -8575.*utop(itop-1,jtop-1,ktop+1) +1225.*utop(itop-1,jtop-1,ktop+2) +5145.*utop(itop-1,jtop,ktop-1) -77175.*utop(itop-1,jtop,ktop) -25725.*utop(itop-1,jtop,ktop+1) +3675.*utop(itop-1,jtop,ktop+2) -343.*utop(itop-1,jtop+1,ktop-1) +5145.*utop(itop-1,jtop+1,ktop) +1715.*utop(itop-1,jtop+1,ktop+1) -245.*utop(itop-1,jtop+1,ktop+2) +3675.*utop(itop,jtop-2,ktop-1) -55125.*utop(itop,jtop-2,ktop) -18375.*utop(itop,jtop-2,ktop+1) +2625.*utop(itop,jtop-2,ktop+2) -25725.*utop(itop,jtop-1,ktop-1) +385875.*utop(itop,jtop-1,ktop) +128625.*utop(itop,jtop-1,ktop+1) -18375.*utop(itop,jtop-1,ktop+2) -77175.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop) +385875.*utop(itop,jtop,ktop+1) -55125.*utop(itop,jtop,ktop+2) +5145.*utop(itop,jtop+1,ktop-1) -77175.*utop(itop,jtop+1,ktop) -25725.*utop(itop,jtop+1,ktop+1) +3675.*utop(itop,jtop+1,ktop+2) +1225.*utop(itop+1,jtop-2,ktop-1) -18375.*utop(itop+1,jtop-2,ktop) -6125.*utop(itop+1,jtop-2,ktop+1) +875.*utop(itop+1,jtop-2,ktop+2) -8575.*utop(itop+1,jtop-1,ktop-1) +128625.*utop(itop+1,jtop-1,ktop) +42875.*utop(itop+1,jtop-1,ktop+1) -6125.*utop(itop+1,jtop-1,ktop+2) -25725.*utop(itop+1,jtop,ktop-1) +385875.*utop(itop+1,jtop,ktop) +128625.*utop(itop+1,jtop,ktop+1) -18375.*utop(itop+1,jtop,ktop+2) +1715.*utop(itop+1,jtop+1,ktop-1) -25725.*utop(itop+1,jtop+1,ktop) -8575.*utop(itop+1,jtop+1,ktop+1) +1225.*utop(itop+1,jtop+1,ktop+2) -175.*utop(itop+2,jtop-2,ktop-1) +2625.*utop(itop+2,jtop-2,ktop) +875.*utop(itop+2,jtop-2,ktop+1) -125.*utop(itop+2,jtop-2,ktop+2) +1225.*utop(itop+2,jtop-1,ktop-1) -18375.*utop(itop+2,jtop-1,ktop) -6125.*utop(itop+2,jtop-1,ktop+1) +875.*utop(itop+2,jtop-1,ktop+2) +3675.*utop(itop+2,jtop,ktop-1) -55125.*utop(itop+2,jtop,ktop) -18375.*utop(itop+2,jtop,ktop+1) +2625.*utop(itop+2,jtop,ktop+2) 
-245.*utop(itop+2,jtop+1,ktop-1) +3675.*utop(itop+2,jtop+1,ktop) +1225.*utop(itop+2,jtop+1,ktop+1) -175.*utop(itop+2,jtop+1,ktop+2) )/2097152.;
 	u(i+1,j+1,k+0) = ( -245.*utop(itop-1,jtop-1,ktop-2) +1715.*utop(itop-1,jtop-1,ktop-1) +5145.*utop(itop-1,jtop-1,ktop) -343.*utop(itop-1,jtop-1,ktop+1) +3675.*utop(itop-1,jtop,ktop-2) -25725.*utop(itop-1,jtop,ktop-1) -77175.*utop(itop-1,jtop,ktop) +5145.*utop(itop-1,jtop,ktop+1) +1225.*utop(itop-1,jtop+1,ktop-2) -8575.*utop(itop-1,jtop+1,ktop-1) -25725.*utop(itop-1,jtop+1,ktop) +1715.*utop(itop-1,jtop+1,ktop+1) -175.*utop(itop-1,jtop+2,ktop-2) +1225.*utop(itop-1,jtop+2,ktop-1) +3675.*utop(itop-1,jtop+2,ktop) -245.*utop(itop-1,jtop+2,ktop+1) +3675.*utop(itop,jtop-1,ktop-2) -25725.*utop(itop,jtop-1,ktop-1) -77175.*utop(itop,jtop-1,ktop) +5145.*utop(itop,jtop-1,ktop+1) -55125.*utop(itop,jtop,ktop-2) +385875.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop) -77175.*utop(itop,jtop,ktop+1) -18375.*utop(itop,jtop+1,ktop-2) +128625.*utop(itop,jtop+1,ktop-1) +385875.*utop(itop,jtop+1,ktop) -25725.*utop(itop,jtop+1,ktop+1) +2625.*utop(itop,jtop+2,ktop-2) -18375.*utop(itop,jtop+2,ktop-1) -55125.*utop(itop,jtop+2,ktop) +3675.*utop(itop,jtop+2,ktop+1) +1225.*utop(itop+1,jtop-1,ktop-2) -8575.*utop(itop+1,jtop-1,ktop-1) -25725.*utop(itop+1,jtop-1,ktop) +1715.*utop(itop+1,jtop-1,ktop+1) -18375.*utop(itop+1,jtop,ktop-2) +128625.*utop(itop+1,jtop,ktop-1) +385875.*utop(itop+1,jtop,ktop) -25725.*utop(itop+1,jtop,ktop+1) -6125.*utop(itop+1,jtop+1,ktop-2) +42875.*utop(itop+1,jtop+1,ktop-1) +128625.*utop(itop+1,jtop+1,ktop) -8575.*utop(itop+1,jtop+1,ktop+1) +875.*utop(itop+1,jtop+2,ktop-2) -6125.*utop(itop+1,jtop+2,ktop-1) -18375.*utop(itop+1,jtop+2,ktop) +1225.*utop(itop+1,jtop+2,ktop+1) -175.*utop(itop+2,jtop-1,ktop-2) +1225.*utop(itop+2,jtop-1,ktop-1) +3675.*utop(itop+2,jtop-1,ktop) -245.*utop(itop+2,jtop-1,ktop+1) +2625.*utop(itop+2,jtop,ktop-2) -18375.*utop(itop+2,jtop,ktop-1) -55125.*utop(itop+2,jtop,ktop) +3675.*utop(itop+2,jtop,ktop+1) +875.*utop(itop+2,jtop+1,ktop-2) -6125.*utop(itop+2,jtop+1,ktop-1) -18375.*utop(itop+2,jtop+1,ktop) +1225.*utop(itop+2,jtop+1,ktop+1) 
-125.*utop(itop+2,jtop+2,ktop-2) +875.*utop(itop+2,jtop+2,ktop-1) +2625.*utop(itop+2,jtop+2,ktop) -175.*utop(itop+2,jtop+2,ktop+1) )/2097152.;
 	u(i+1,j+1,k+1) = ( -343.*utop(itop-1,jtop-1,ktop-1) +5145.*utop(itop-1,jtop-1,ktop) +1715.*utop(itop-1,jtop-1,ktop+1) -245.*utop(itop-1,jtop-1,ktop+2) +5145.*utop(itop-1,jtop,ktop-1) -77175.*utop(itop-1,jtop,ktop) -25725.*utop(itop-1,jtop,ktop+1) +3675.*utop(itop-1,jtop,ktop+2) +1715.*utop(itop-1,jtop+1,ktop-1) -25725.*utop(itop-1,jtop+1,ktop) -8575.*utop(itop-1,jtop+1,ktop+1) +1225.*utop(itop-1,jtop+1,ktop+2) -245.*utop(itop-1,jtop+2,ktop-1) +3675.*utop(itop-1,jtop+2,ktop) +1225.*utop(itop-1,jtop+2,ktop+1) -175.*utop(itop-1,jtop+2,ktop+2) +5145.*utop(itop,jtop-1,ktop-1) -77175.*utop(itop,jtop-1,ktop) -25725.*utop(itop,jtop-1,ktop+1) +3675.*utop(itop,jtop-1,ktop+2) -77175.*utop(itop,jtop,ktop-1) +1157625.*utop(itop,jtop,ktop) +385875.*utop(itop,jtop,ktop+1) -55125.*utop(itop,jtop,ktop+2) -25725.*utop(itop,jtop+1,ktop-1) +385875.*utop(itop,jtop+1,ktop) +128625.*utop(itop,jtop+1,ktop+1) -18375.*utop(itop,jtop+1,ktop+2) +3675.*utop(itop,jtop+2,ktop-1) -55125.*utop(itop,jtop+2,ktop) -18375.*utop(itop,jtop+2,ktop+1) +2625.*utop(itop,jtop+2,ktop+2) +1715.*utop(itop+1,jtop-1,ktop-1) -25725.*utop(itop+1,jtop-1,ktop) -8575.*utop(itop+1,jtop-1,ktop+1) +1225.*utop(itop+1,jtop-1,ktop+2) -25725.*utop(itop+1,jtop,ktop-1) +385875.*utop(itop+1,jtop,ktop) +128625.*utop(itop+1,jtop,ktop+1) -18375.*utop(itop+1,jtop,ktop+2) -8575.*utop(itop+1,jtop+1,ktop-1) +128625.*utop(itop+1,jtop+1,ktop) +42875.*utop(itop+1,jtop+1,ktop+1) -6125.*utop(itop+1,jtop+1,ktop+2) +1225.*utop(itop+1,jtop+2,ktop-1) -18375.*utop(itop+1,jtop+2,ktop) -6125.*utop(itop+1,jtop+2,ktop+1) +875.*utop(itop+1,jtop+2,ktop+2) -245.*utop(itop+2,jtop-1,ktop-1) +3675.*utop(itop+2,jtop-1,ktop) +1225.*utop(itop+2,jtop-1,ktop+1) -175.*utop(itop+2,jtop-1,ktop+2) +3675.*utop(itop+2,jtop,ktop-1) -55125.*utop(itop+2,jtop,ktop) -18375.*utop(itop+2,jtop,ktop+1) +2625.*utop(itop+2,jtop,ktop+2) +1225.*utop(itop+2,jtop+1,ktop-1) -18375.*utop(itop+2,jtop+1,ktop) -6125.*utop(itop+2,jtop+1,ktop+1) +875.*utop(itop+2,jtop+1,ktop+2) 
-175.*utop(itop+2,jtop+2,ktop-1) +2625.*utop(itop+2,jtop+2,ktop) +875.*utop(itop+2,jtop+2,ktop+1) -125.*utop(itop+2,jtop+2,ktop+2) )/2097152.;
 	*/
-	
-	u(i+0,j+0,k+0) = ( -1.060835e-05*utop(itop-2,jtop-2,ktop-2) +9.901123e-05*utop(itop-2,jtop-2,ktop-1) +4.455505e-04*utop(itop-2,jtop-2,ktop) -5.940674e-05*utop(itop-2,jtop-2,ktop+1) +8.250936e-06*utop(itop-2,jtop-2,ktop+2) +9.901123e-05*utop(itop-2,jtop-1,ktop-2) -9.241048e-04*utop(itop-2,jtop-1,ktop-1) -4.158472e-03*utop(itop-2,jtop-1,ktop) +5.544629e-04*utop(itop-2,jtop-1,ktop+1) -7.700874e-05*utop(itop-2,jtop-1,ktop+2) +4.455505e-04*utop(itop-2,jtop,ktop-2) -4.158472e-03*utop(itop-2,jtop,ktop-1) -1.871312e-02*utop(itop-2,jtop,ktop) +2.495083e-03*utop(itop-2,jtop,ktop+1) -3.465393e-04*utop(itop-2,jtop,ktop+2) -5.940674e-05*utop(itop-2,jtop+1,ktop-2) +5.544629e-04*utop(itop-2,jtop+1,ktop-1) +2.495083e-03*utop(itop-2,jtop+1,ktop) -3.326777e-04*utop(itop-2,jtop+1,ktop+1) +4.620524e-05*utop(itop-2,jtop+1,ktop+2) +8.250936e-06*utop(itop-2,jtop+2,ktop-2) -7.700874e-05*utop(itop-2,jtop+2,ktop-1) -3.465393e-04*utop(itop-2,jtop+2,ktop) +4.620524e-05*utop(itop-2,jtop+2,ktop+1) -6.417395e-06*utop(itop-2,jtop+2,ktop+2) +9.901123e-05*utop(itop-1,jtop-2,ktop-2) -9.241048e-04*utop(itop-1,jtop-2,ktop-1) -4.158472e-03*utop(itop-1,jtop-2,ktop) +5.544629e-04*utop(itop-1,jtop-2,ktop+1) -7.700874e-05*utop(itop-1,jtop-2,ktop+2) -9.241048e-04*utop(itop-1,jtop-1,ktop-2) +8.624978e-03*utop(itop-1,jtop-1,ktop-1) +3.881240e-02*utop(itop-1,jtop-1,ktop) -5.174987e-03*utop(itop-1,jtop-1,ktop+1) +7.187482e-04*utop(itop-1,jtop-1,ktop+2) -4.158472e-03*utop(itop-1,jtop,ktop-2) +3.881240e-02*utop(itop-1,jtop,ktop-1) +1.746558e-01*utop(itop-1,jtop,ktop) -2.328744e-02*utop(itop-1,jtop,ktop+1) +3.234367e-03*utop(itop-1,jtop,ktop+2) +5.544629e-04*utop(itop-1,jtop+1,ktop-2) -5.174987e-03*utop(itop-1,jtop+1,ktop-1) -2.328744e-02*utop(itop-1,jtop+1,ktop) +3.104992e-03*utop(itop-1,jtop+1,ktop+1) -4.312489e-04*utop(itop-1,jtop+1,ktop+2) -7.700874e-05*utop(itop-1,jtop+2,ktop-2) +7.187482e-04*utop(itop-1,jtop+2,ktop-1) +3.234367e-03*utop(itop-1,jtop+2,ktop) -4.312489e-04*utop(itop-1,jtop+2,ktop+1) 
+5.989568e-05*utop(itop-1,jtop+2,ktop+2) +4.455505e-04*utop(itop,jtop-2,ktop-2) -4.158472e-03*utop(itop,jtop-2,ktop-1) -1.871312e-02*utop(itop,jtop-2,ktop) +2.495083e-03*utop(itop,jtop-2,ktop+1) -3.465393e-04*utop(itop,jtop-2,ktop+2) -4.158472e-03*utop(itop,jtop-1,ktop-2) +3.881240e-02*utop(itop,jtop-1,ktop-1) +1.746558e-01*utop(itop,jtop-1,ktop) -2.328744e-02*utop(itop,jtop-1,ktop+1) +3.234367e-03*utop(itop,jtop-1,ktop+2) -1.871312e-02*utop(itop,jtop,ktop-2) +1.746558e-01*utop(itop,jtop,ktop-1) +7.859512e-01*utop(itop,jtop,ktop) -1.047935e-01*utop(itop,jtop,ktop+1) +1.455465e-02*utop(itop,jtop,ktop+2) +2.495083e-03*utop(itop,jtop+1,ktop-2) -2.328744e-02*utop(itop,jtop+1,ktop-1) -1.047935e-01*utop(itop,jtop+1,ktop) +1.397246e-02*utop(itop,jtop+1,ktop+1) -1.940620e-03*utop(itop,jtop+1,ktop+2) -3.465393e-04*utop(itop,jtop+2,ktop-2) +3.234367e-03*utop(itop,jtop+2,ktop-1) +1.455465e-02*utop(itop,jtop+2,ktop) -1.940620e-03*utop(itop,jtop+2,ktop+1) +2.695306e-04*utop(itop,jtop+2,ktop+2) -5.940674e-05*utop(itop+1,jtop-2,ktop-2) +5.544629e-04*utop(itop+1,jtop-2,ktop-1) +2.495083e-03*utop(itop+1,jtop-2,ktop) -3.326777e-04*utop(itop+1,jtop-2,ktop+1) +4.620524e-05*utop(itop+1,jtop-2,ktop+2) +5.544629e-04*utop(itop+1,jtop-1,ktop-2) -5.174987e-03*utop(itop+1,jtop-1,ktop-1) -2.328744e-02*utop(itop+1,jtop-1,ktop) +3.104992e-03*utop(itop+1,jtop-1,ktop+1) -4.312489e-04*utop(itop+1,jtop-1,ktop+2) +2.495083e-03*utop(itop+1,jtop,ktop-2) -2.328744e-02*utop(itop+1,jtop,ktop-1) -1.047935e-01*utop(itop+1,jtop,ktop) +1.397246e-02*utop(itop+1,jtop,ktop+1) -1.940620e-03*utop(itop+1,jtop,ktop+2) -3.326777e-04*utop(itop+1,jtop+1,ktop-2) +3.104992e-03*utop(itop+1,jtop+1,ktop-1) +1.397246e-02*utop(itop+1,jtop+1,ktop) -1.862995e-03*utop(itop+1,jtop+1,ktop+1) +2.587494e-04*utop(itop+1,jtop+1,ktop+2) +4.620524e-05*utop(itop+1,jtop+2,ktop-2) -4.312489e-04*utop(itop+1,jtop+2,ktop-1) -1.940620e-03*utop(itop+1,jtop+2,ktop) +2.587494e-04*utop(itop+1,jtop+2,ktop+1) 
-3.593741e-05*utop(itop+1,jtop+2,ktop+2) +8.250936e-06*utop(itop+2,jtop-2,ktop-2) -7.700874e-05*utop(itop+2,jtop-2,ktop-1) -3.465393e-04*utop(itop+2,jtop-2,ktop) +4.620524e-05*utop(itop+2,jtop-2,ktop+1) -6.417395e-06*utop(itop+2,jtop-2,ktop+2) -7.700874e-05*utop(itop+2,jtop-1,ktop-2) +7.187482e-04*utop(itop+2,jtop-1,ktop-1) +3.234367e-03*utop(itop+2,jtop-1,ktop) -4.312489e-04*utop(itop+2,jtop-1,ktop+1) +5.989568e-05*utop(itop+2,jtop-1,ktop+2) -3.465393e-04*utop(itop+2,jtop,ktop-2) +3.234367e-03*utop(itop+2,jtop,ktop-1) +1.455465e-02*utop(itop+2,jtop,ktop) -1.940620e-03*utop(itop+2,jtop,ktop+1) +2.695306e-04*utop(itop+2,jtop,ktop+2) +4.620524e-05*utop(itop+2,jtop+1,ktop-2) -4.312489e-04*utop(itop+2,jtop+1,ktop-1) -1.940620e-03*utop(itop+2,jtop+1,ktop) +2.587494e-04*utop(itop+2,jtop+1,ktop+1) -3.593741e-05*utop(itop+2,jtop+1,ktop+2) -6.417395e-06*utop(itop+2,jtop+2,ktop-2) +5.989568e-05*utop(itop+2,jtop+2,ktop-1) +2.695306e-04*utop(itop+2,jtop+2,ktop) -3.593741e-05*utop(itop+2,jtop+2,ktop+1) +4.991307e-06*utop(itop+2,jtop+2,ktop+2));
-	u(i+0,j+0,k+1) = ( +8.250936e-06*utop(itop-2,jtop-2,ktop-2) -5.940674e-05*utop(itop-2,jtop-2,ktop-1) +4.455505e-04*utop(itop-2,jtop-2,ktop) +9.901123e-05*utop(itop-2,jtop-2,ktop+1) -1.060835e-05*utop(itop-2,jtop-2,ktop+2) -7.700874e-05*utop(itop-2,jtop-1,ktop-2) +5.544629e-04*utop(itop-2,jtop-1,ktop-1) -4.158472e-03*utop(itop-2,jtop-1,ktop) -9.241048e-04*utop(itop-2,jtop-1,ktop+1) +9.901123e-05*utop(itop-2,jtop-1,ktop+2) -3.465393e-04*utop(itop-2,jtop,ktop-2) +2.495083e-03*utop(itop-2,jtop,ktop-1) -1.871312e-02*utop(itop-2,jtop,ktop) -4.158472e-03*utop(itop-2,jtop,ktop+1) +4.455505e-04*utop(itop-2,jtop,ktop+2) +4.620524e-05*utop(itop-2,jtop+1,ktop-2) -3.326777e-04*utop(itop-2,jtop+1,ktop-1) +2.495083e-03*utop(itop-2,jtop+1,ktop) +5.544629e-04*utop(itop-2,jtop+1,ktop+1) -5.940674e-05*utop(itop-2,jtop+1,ktop+2) -6.417395e-06*utop(itop-2,jtop+2,ktop-2) +4.620524e-05*utop(itop-2,jtop+2,ktop-1) -3.465393e-04*utop(itop-2,jtop+2,ktop) -7.700874e-05*utop(itop-2,jtop+2,ktop+1) +8.250936e-06*utop(itop-2,jtop+2,ktop+2) -7.700874e-05*utop(itop-1,jtop-2,ktop-2) +5.544629e-04*utop(itop-1,jtop-2,ktop-1) -4.158472e-03*utop(itop-1,jtop-2,ktop) -9.241048e-04*utop(itop-1,jtop-2,ktop+1) +9.901123e-05*utop(itop-1,jtop-2,ktop+2) +7.187482e-04*utop(itop-1,jtop-1,ktop-2) -5.174987e-03*utop(itop-1,jtop-1,ktop-1) +3.881240e-02*utop(itop-1,jtop-1,ktop) +8.624978e-03*utop(itop-1,jtop-1,ktop+1) -9.241048e-04*utop(itop-1,jtop-1,ktop+2) +3.234367e-03*utop(itop-1,jtop,ktop-2) -2.328744e-02*utop(itop-1,jtop,ktop-1) +1.746558e-01*utop(itop-1,jtop,ktop) +3.881240e-02*utop(itop-1,jtop,ktop+1) -4.158472e-03*utop(itop-1,jtop,ktop+2) -4.312489e-04*utop(itop-1,jtop+1,ktop-2) +3.104992e-03*utop(itop-1,jtop+1,ktop-1) -2.328744e-02*utop(itop-1,jtop+1,ktop) -5.174987e-03*utop(itop-1,jtop+1,ktop+1) +5.544629e-04*utop(itop-1,jtop+1,ktop+2) +5.989568e-05*utop(itop-1,jtop+2,ktop-2) -4.312489e-04*utop(itop-1,jtop+2,ktop-1) +3.234367e-03*utop(itop-1,jtop+2,ktop) +7.187482e-04*utop(itop-1,jtop+2,ktop+1) 
-7.700874e-05*utop(itop-1,jtop+2,ktop+2) -3.465393e-04*utop(itop,jtop-2,ktop-2) +2.495083e-03*utop(itop,jtop-2,ktop-1) -1.871312e-02*utop(itop,jtop-2,ktop) -4.158472e-03*utop(itop,jtop-2,ktop+1) +4.455505e-04*utop(itop,jtop-2,ktop+2) +3.234367e-03*utop(itop,jtop-1,ktop-2) -2.328744e-02*utop(itop,jtop-1,ktop-1) +1.746558e-01*utop(itop,jtop-1,ktop) +3.881240e-02*utop(itop,jtop-1,ktop+1) -4.158472e-03*utop(itop,jtop-1,ktop+2) +1.455465e-02*utop(itop,jtop,ktop-2) -1.047935e-01*utop(itop,jtop,ktop-1) +7.859512e-01*utop(itop,jtop,ktop) +1.746558e-01*utop(itop,jtop,ktop+1) -1.871312e-02*utop(itop,jtop,ktop+2) -1.940620e-03*utop(itop,jtop+1,ktop-2) +1.397246e-02*utop(itop,jtop+1,ktop-1) -1.047935e-01*utop(itop,jtop+1,ktop) -2.328744e-02*utop(itop,jtop+1,ktop+1) +2.495083e-03*utop(itop,jtop+1,ktop+2) +2.695306e-04*utop(itop,jtop+2,ktop-2) -1.940620e-03*utop(itop,jtop+2,ktop-1) +1.455465e-02*utop(itop,jtop+2,ktop) +3.234367e-03*utop(itop,jtop+2,ktop+1) -3.465393e-04*utop(itop,jtop+2,ktop+2) +4.620524e-05*utop(itop+1,jtop-2,ktop-2) -3.326777e-04*utop(itop+1,jtop-2,ktop-1) +2.495083e-03*utop(itop+1,jtop-2,ktop) +5.544629e-04*utop(itop+1,jtop-2,ktop+1) -5.940674e-05*utop(itop+1,jtop-2,ktop+2) -4.312489e-04*utop(itop+1,jtop-1,ktop-2) +3.104992e-03*utop(itop+1,jtop-1,ktop-1) -2.328744e-02*utop(itop+1,jtop-1,ktop) -5.174987e-03*utop(itop+1,jtop-1,ktop+1) +5.544629e-04*utop(itop+1,jtop-1,ktop+2) -1.940620e-03*utop(itop+1,jtop,ktop-2) +1.397246e-02*utop(itop+1,jtop,ktop-1) -1.047935e-01*utop(itop+1,jtop,ktop) -2.328744e-02*utop(itop+1,jtop,ktop+1) +2.495083e-03*utop(itop+1,jtop,ktop+2) +2.587494e-04*utop(itop+1,jtop+1,ktop-2) -1.862995e-03*utop(itop+1,jtop+1,ktop-1) +1.397246e-02*utop(itop+1,jtop+1,ktop) +3.104992e-03*utop(itop+1,jtop+1,ktop+1) -3.326777e-04*utop(itop+1,jtop+1,ktop+2) -3.593741e-05*utop(itop+1,jtop+2,ktop-2) +2.587494e-04*utop(itop+1,jtop+2,ktop-1) -1.940620e-03*utop(itop+1,jtop+2,ktop) -4.312489e-04*utop(itop+1,jtop+2,ktop+1) 
+4.620524e-05*utop(itop+1,jtop+2,ktop+2) -6.417395e-06*utop(itop+2,jtop-2,ktop-2) +4.620524e-05*utop(itop+2,jtop-2,ktop-1) -3.465393e-04*utop(itop+2,jtop-2,ktop) -7.700874e-05*utop(itop+2,jtop-2,ktop+1) +8.250936e-06*utop(itop+2,jtop-2,ktop+2) +5.989568e-05*utop(itop+2,jtop-1,ktop-2) -4.312489e-04*utop(itop+2,jtop-1,ktop-1) +3.234367e-03*utop(itop+2,jtop-1,ktop) +7.187482e-04*utop(itop+2,jtop-1,ktop+1) -7.700874e-05*utop(itop+2,jtop-1,ktop+2) +2.695306e-04*utop(itop+2,jtop,ktop-2) -1.940620e-03*utop(itop+2,jtop,ktop-1) +1.455465e-02*utop(itop+2,jtop,ktop) +3.234367e-03*utop(itop+2,jtop,ktop+1) -3.465393e-04*utop(itop+2,jtop,ktop+2) -3.593741e-05*utop(itop+2,jtop+1,ktop-2) +2.587494e-04*utop(itop+2,jtop+1,ktop-1) -1.940620e-03*utop(itop+2,jtop+1,ktop) -4.312489e-04*utop(itop+2,jtop+1,ktop+1) +4.620524e-05*utop(itop+2,jtop+1,ktop+2) +4.991307e-06*utop(itop+2,jtop+2,ktop-2) -3.593741e-05*utop(itop+2,jtop+2,ktop-1) +2.695306e-04*utop(itop+2,jtop+2,ktop) +5.989568e-05*utop(itop+2,jtop+2,ktop+1) -6.417395e-06*utop(itop+2,jtop+2,ktop+2));
-	u(i+0,j+1,k+0) = ( +8.250936e-06*utop(itop-2,jtop-2,ktop-2) -7.700874e-05*utop(itop-2,jtop-2,ktop-1) -3.465393e-04*utop(itop-2,jtop-2,ktop) +4.620524e-05*utop(itop-2,jtop-2,ktop+1) -6.417395e-06*utop(itop-2,jtop-2,ktop+2) -5.940674e-05*utop(itop-2,jtop-1,ktop-2) +5.544629e-04*utop(itop-2,jtop-1,ktop-1) +2.495083e-03*utop(itop-2,jtop-1,ktop) -3.326777e-04*utop(itop-2,jtop-1,ktop+1) +4.620524e-05*utop(itop-2,jtop-1,ktop+2) +4.455505e-04*utop(itop-2,jtop,ktop-2) -4.158472e-03*utop(itop-2,jtop,ktop-1) -1.871312e-02*utop(itop-2,jtop,ktop) +2.495083e-03*utop(itop-2,jtop,ktop+1) -3.465393e-04*utop(itop-2,jtop,ktop+2) +9.901123e-05*utop(itop-2,jtop+1,ktop-2) -9.241048e-04*utop(itop-2,jtop+1,ktop-1) -4.158472e-03*utop(itop-2,jtop+1,ktop) +5.544629e-04*utop(itop-2,jtop+1,ktop+1) -7.700874e-05*utop(itop-2,jtop+1,ktop+2) -1.060835e-05*utop(itop-2,jtop+2,ktop-2) +9.901123e-05*utop(itop-2,jtop+2,ktop-1) +4.455505e-04*utop(itop-2,jtop+2,ktop) -5.940674e-05*utop(itop-2,jtop+2,ktop+1) +8.250936e-06*utop(itop-2,jtop+2,ktop+2) -7.700874e-05*utop(itop-1,jtop-2,ktop-2) +7.187482e-04*utop(itop-1,jtop-2,ktop-1) +3.234367e-03*utop(itop-1,jtop-2,ktop) -4.312489e-04*utop(itop-1,jtop-2,ktop+1) +5.989568e-05*utop(itop-1,jtop-2,ktop+2) +5.544629e-04*utop(itop-1,jtop-1,ktop-2) -5.174987e-03*utop(itop-1,jtop-1,ktop-1) -2.328744e-02*utop(itop-1,jtop-1,ktop) +3.104992e-03*utop(itop-1,jtop-1,ktop+1) -4.312489e-04*utop(itop-1,jtop-1,ktop+2) -4.158472e-03*utop(itop-1,jtop,ktop-2) +3.881240e-02*utop(itop-1,jtop,ktop-1) +1.746558e-01*utop(itop-1,jtop,ktop) -2.328744e-02*utop(itop-1,jtop,ktop+1) +3.234367e-03*utop(itop-1,jtop,ktop+2) -9.241048e-04*utop(itop-1,jtop+1,ktop-2) +8.624978e-03*utop(itop-1,jtop+1,ktop-1) +3.881240e-02*utop(itop-1,jtop+1,ktop) -5.174987e-03*utop(itop-1,jtop+1,ktop+1) +7.187482e-04*utop(itop-1,jtop+1,ktop+2) +9.901123e-05*utop(itop-1,jtop+2,ktop-2) -9.241048e-04*utop(itop-1,jtop+2,ktop-1) -4.158472e-03*utop(itop-1,jtop+2,ktop) +5.544629e-04*utop(itop-1,jtop+2,ktop+1) 
-7.700874e-05*utop(itop-1,jtop+2,ktop+2) -3.465393e-04*utop(itop,jtop-2,ktop-2) +3.234367e-03*utop(itop,jtop-2,ktop-1) +1.455465e-02*utop(itop,jtop-2,ktop) -1.940620e-03*utop(itop,jtop-2,ktop+1) +2.695306e-04*utop(itop,jtop-2,ktop+2) +2.495083e-03*utop(itop,jtop-1,ktop-2) -2.328744e-02*utop(itop,jtop-1,ktop-1) -1.047935e-01*utop(itop,jtop-1,ktop) +1.397246e-02*utop(itop,jtop-1,ktop+1) -1.940620e-03*utop(itop,jtop-1,ktop+2) -1.871312e-02*utop(itop,jtop,ktop-2) +1.746558e-01*utop(itop,jtop,ktop-1) +7.859512e-01*utop(itop,jtop,ktop) -1.047935e-01*utop(itop,jtop,ktop+1) +1.455465e-02*utop(itop,jtop,ktop+2) -4.158472e-03*utop(itop,jtop+1,ktop-2) +3.881240e-02*utop(itop,jtop+1,ktop-1) +1.746558e-01*utop(itop,jtop+1,ktop) -2.328744e-02*utop(itop,jtop+1,ktop+1) +3.234367e-03*utop(itop,jtop+1,ktop+2) +4.455505e-04*utop(itop,jtop+2,ktop-2) -4.158472e-03*utop(itop,jtop+2,ktop-1) -1.871312e-02*utop(itop,jtop+2,ktop) +2.495083e-03*utop(itop,jtop+2,ktop+1) -3.465393e-04*utop(itop,jtop+2,ktop+2) +4.620524e-05*utop(itop+1,jtop-2,ktop-2) -4.312489e-04*utop(itop+1,jtop-2,ktop-1) -1.940620e-03*utop(itop+1,jtop-2,ktop) +2.587494e-04*utop(itop+1,jtop-2,ktop+1) -3.593741e-05*utop(itop+1,jtop-2,ktop+2) -3.326777e-04*utop(itop+1,jtop-1,ktop-2) +3.104992e-03*utop(itop+1,jtop-1,ktop-1) +1.397246e-02*utop(itop+1,jtop-1,ktop) -1.862995e-03*utop(itop+1,jtop-1,ktop+1) +2.587494e-04*utop(itop+1,jtop-1,ktop+2) +2.495083e-03*utop(itop+1,jtop,ktop-2) -2.328744e-02*utop(itop+1,jtop,ktop-1) -1.047935e-01*utop(itop+1,jtop,ktop) +1.397246e-02*utop(itop+1,jtop,ktop+1) -1.940620e-03*utop(itop+1,jtop,ktop+2) +5.544629e-04*utop(itop+1,jtop+1,ktop-2) -5.174987e-03*utop(itop+1,jtop+1,ktop-1) -2.328744e-02*utop(itop+1,jtop+1,ktop) +3.104992e-03*utop(itop+1,jtop+1,ktop+1) -4.312489e-04*utop(itop+1,jtop+1,ktop+2) -5.940674e-05*utop(itop+1,jtop+2,ktop-2) +5.544629e-04*utop(itop+1,jtop+2,ktop-1) +2.495083e-03*utop(itop+1,jtop+2,ktop) -3.326777e-04*utop(itop+1,jtop+2,ktop+1) 
+4.620524e-05*utop(itop+1,jtop+2,ktop+2) -6.417395e-06*utop(itop+2,jtop-2,ktop-2) +5.989568e-05*utop(itop+2,jtop-2,ktop-1) +2.695306e-04*utop(itop+2,jtop-2,ktop) -3.593741e-05*utop(itop+2,jtop-2,ktop+1) +4.991307e-06*utop(itop+2,jtop-2,ktop+2) +4.620524e-05*utop(itop+2,jtop-1,ktop-2) -4.312489e-04*utop(itop+2,jtop-1,ktop-1) -1.940620e-03*utop(itop+2,jtop-1,ktop) +2.587494e-04*utop(itop+2,jtop-1,ktop+1) -3.593741e-05*utop(itop+2,jtop-1,ktop+2) -3.465393e-04*utop(itop+2,jtop,ktop-2) +3.234367e-03*utop(itop+2,jtop,ktop-1) +1.455465e-02*utop(itop+2,jtop,ktop) -1.940620e-03*utop(itop+2,jtop,ktop+1) +2.695306e-04*utop(itop+2,jtop,ktop+2) -7.700874e-05*utop(itop+2,jtop+1,ktop-2) +7.187482e-04*utop(itop+2,jtop+1,ktop-1) +3.234367e-03*utop(itop+2,jtop+1,ktop) -4.312489e-04*utop(itop+2,jtop+1,ktop+1) +5.989568e-05*utop(itop+2,jtop+1,ktop+2) +8.250936e-06*utop(itop+2,jtop+2,ktop-2) -7.700874e-05*utop(itop+2,jtop+2,ktop-1) -3.465393e-04*utop(itop+2,jtop+2,ktop) +4.620524e-05*utop(itop+2,jtop+2,ktop+1) -6.417395e-06*utop(itop+2,jtop+2,ktop+2));
-	u(i+0,j+1,k+1) = ( -6.417395e-06*utop(itop-2,jtop-2,ktop-2) +4.620524e-05*utop(itop-2,jtop-2,ktop-1) -3.465393e-04*utop(itop-2,jtop-2,ktop) -7.700874e-05*utop(itop-2,jtop-2,ktop+1) +8.250936e-06*utop(itop-2,jtop-2,ktop+2) +4.620524e-05*utop(itop-2,jtop-1,ktop-2) -3.326777e-04*utop(itop-2,jtop-1,ktop-1) +2.495083e-03*utop(itop-2,jtop-1,ktop) +5.544629e-04*utop(itop-2,jtop-1,ktop+1) -5.940674e-05*utop(itop-2,jtop-1,ktop+2) -3.465393e-04*utop(itop-2,jtop,ktop-2) +2.495083e-03*utop(itop-2,jtop,ktop-1) -1.871312e-02*utop(itop-2,jtop,ktop) -4.158472e-03*utop(itop-2,jtop,ktop+1) +4.455505e-04*utop(itop-2,jtop,ktop+2) -7.700874e-05*utop(itop-2,jtop+1,ktop-2) +5.544629e-04*utop(itop-2,jtop+1,ktop-1) -4.158472e-03*utop(itop-2,jtop+1,ktop) -9.241048e-04*utop(itop-2,jtop+1,ktop+1) +9.901123e-05*utop(itop-2,jtop+1,ktop+2) +8.250936e-06*utop(itop-2,jtop+2,ktop-2) -5.940674e-05*utop(itop-2,jtop+2,ktop-1) +4.455505e-04*utop(itop-2,jtop+2,ktop) +9.901123e-05*utop(itop-2,jtop+2,ktop+1) -1.060835e-05*utop(itop-2,jtop+2,ktop+2) +5.989568e-05*utop(itop-1,jtop-2,ktop-2) -4.312489e-04*utop(itop-1,jtop-2,ktop-1) +3.234367e-03*utop(itop-1,jtop-2,ktop) +7.187482e-04*utop(itop-1,jtop-2,ktop+1) -7.700874e-05*utop(itop-1,jtop-2,ktop+2) -4.312489e-04*utop(itop-1,jtop-1,ktop-2) +3.104992e-03*utop(itop-1,jtop-1,ktop-1) -2.328744e-02*utop(itop-1,jtop-1,ktop) -5.174987e-03*utop(itop-1,jtop-1,ktop+1) +5.544629e-04*utop(itop-1,jtop-1,ktop+2) +3.234367e-03*utop(itop-1,jtop,ktop-2) -2.328744e-02*utop(itop-1,jtop,ktop-1) +1.746558e-01*utop(itop-1,jtop,ktop) +3.881240e-02*utop(itop-1,jtop,ktop+1) -4.158472e-03*utop(itop-1,jtop,ktop+2) +7.187482e-04*utop(itop-1,jtop+1,ktop-2) -5.174987e-03*utop(itop-1,jtop+1,ktop-1) +3.881240e-02*utop(itop-1,jtop+1,ktop) +8.624978e-03*utop(itop-1,jtop+1,ktop+1) -9.241048e-04*utop(itop-1,jtop+1,ktop+2) -7.700874e-05*utop(itop-1,jtop+2,ktop-2) +5.544629e-04*utop(itop-1,jtop+2,ktop-1) -4.158472e-03*utop(itop-1,jtop+2,ktop) -9.241048e-04*utop(itop-1,jtop+2,ktop+1) 
+9.901123e-05*utop(itop-1,jtop+2,ktop+2) +2.695306e-04*utop(itop,jtop-2,ktop-2) -1.940620e-03*utop(itop,jtop-2,ktop-1) +1.455465e-02*utop(itop,jtop-2,ktop) +3.234367e-03*utop(itop,jtop-2,ktop+1) -3.465393e-04*utop(itop,jtop-2,ktop+2) -1.940620e-03*utop(itop,jtop-1,ktop-2) +1.397246e-02*utop(itop,jtop-1,ktop-1) -1.047935e-01*utop(itop,jtop-1,ktop) -2.328744e-02*utop(itop,jtop-1,ktop+1) +2.495083e-03*utop(itop,jtop-1,ktop+2) +1.455465e-02*utop(itop,jtop,ktop-2) -1.047935e-01*utop(itop,jtop,ktop-1) +7.859512e-01*utop(itop,jtop,ktop) +1.746558e-01*utop(itop,jtop,ktop+1) -1.871312e-02*utop(itop,jtop,ktop+2) +3.234367e-03*utop(itop,jtop+1,ktop-2) -2.328744e-02*utop(itop,jtop+1,ktop-1) +1.746558e-01*utop(itop,jtop+1,ktop) +3.881240e-02*utop(itop,jtop+1,ktop+1) -4.158472e-03*utop(itop,jtop+1,ktop+2) -3.465393e-04*utop(itop,jtop+2,ktop-2) +2.495083e-03*utop(itop,jtop+2,ktop-1) -1.871312e-02*utop(itop,jtop+2,ktop) -4.158472e-03*utop(itop,jtop+2,ktop+1) +4.455505e-04*utop(itop,jtop+2,ktop+2) -3.593741e-05*utop(itop+1,jtop-2,ktop-2) +2.587494e-04*utop(itop+1,jtop-2,ktop-1) -1.940620e-03*utop(itop+1,jtop-2,ktop) -4.312489e-04*utop(itop+1,jtop-2,ktop+1) +4.620524e-05*utop(itop+1,jtop-2,ktop+2) +2.587494e-04*utop(itop+1,jtop-1,ktop-2) -1.862995e-03*utop(itop+1,jtop-1,ktop-1) +1.397246e-02*utop(itop+1,jtop-1,ktop) +3.104992e-03*utop(itop+1,jtop-1,ktop+1) -3.326777e-04*utop(itop+1,jtop-1,ktop+2) -1.940620e-03*utop(itop+1,jtop,ktop-2) +1.397246e-02*utop(itop+1,jtop,ktop-1) -1.047935e-01*utop(itop+1,jtop,ktop) -2.328744e-02*utop(itop+1,jtop,ktop+1) +2.495083e-03*utop(itop+1,jtop,ktop+2) -4.312489e-04*utop(itop+1,jtop+1,ktop-2) +3.104992e-03*utop(itop+1,jtop+1,ktop-1) -2.328744e-02*utop(itop+1,jtop+1,ktop) -5.174987e-03*utop(itop+1,jtop+1,ktop+1) +5.544629e-04*utop(itop+1,jtop+1,ktop+2) +4.620524e-05*utop(itop+1,jtop+2,ktop-2) -3.326777e-04*utop(itop+1,jtop+2,ktop-1) +2.495083e-03*utop(itop+1,jtop+2,ktop) +5.544629e-04*utop(itop+1,jtop+2,ktop+1) 
-5.940674e-05*utop(itop+1,jtop+2,ktop+2) +4.991307e-06*utop(itop+2,jtop-2,ktop-2) -3.593741e-05*utop(itop+2,jtop-2,ktop-1) +2.695306e-04*utop(itop+2,jtop-2,ktop) +5.989568e-05*utop(itop+2,jtop-2,ktop+1) -6.417395e-06*utop(itop+2,jtop-2,ktop+2) -3.593741e-05*utop(itop+2,jtop-1,ktop-2) +2.587494e-04*utop(itop+2,jtop-1,ktop-1) -1.940620e-03*utop(itop+2,jtop-1,ktop) -4.312489e-04*utop(itop+2,jtop-1,ktop+1) +4.620524e-05*utop(itop+2,jtop-1,ktop+2) +2.695306e-04*utop(itop+2,jtop,ktop-2) -1.940620e-03*utop(itop+2,jtop,ktop-1) +1.455465e-02*utop(itop+2,jtop,ktop) +3.234367e-03*utop(itop+2,jtop,ktop+1) -3.465393e-04*utop(itop+2,jtop,ktop+2) +5.989568e-05*utop(itop+2,jtop+1,ktop-2) -4.312489e-04*utop(itop+2,jtop+1,ktop-1) +3.234367e-03*utop(itop+2,jtop+1,ktop) +7.187482e-04*utop(itop+2,jtop+1,ktop+1) -7.700874e-05*utop(itop+2,jtop+1,ktop+2) -6.417395e-06*utop(itop+2,jtop+2,ktop-2) +4.620524e-05*utop(itop+2,jtop+2,ktop-1) -3.465393e-04*utop(itop+2,jtop+2,ktop) -7.700874e-05*utop(itop+2,jtop+2,ktop+1) +8.250936e-06*utop(itop+2,jtop+2,ktop+2));
-	u(i+1,j+0,k+0) = ( +8.250936e-06*utop(itop-2,jtop-2,ktop-2) -7.700874e-05*utop(itop-2,jtop-2,ktop-1) -3.465393e-04*utop(itop-2,jtop-2,ktop) +4.620524e-05*utop(itop-2,jtop-2,ktop+1) -6.417395e-06*utop(itop-2,jtop-2,ktop+2) -7.700874e-05*utop(itop-2,jtop-1,ktop-2) +7.187482e-04*utop(itop-2,jtop-1,ktop-1) +3.234367e-03*utop(itop-2,jtop-1,ktop) -4.312489e-04*utop(itop-2,jtop-1,ktop+1) +5.989568e-05*utop(itop-2,jtop-1,ktop+2) -3.465393e-04*utop(itop-2,jtop,ktop-2) +3.234367e-03*utop(itop-2,jtop,ktop-1) +1.455465e-02*utop(itop-2,jtop,ktop) -1.940620e-03*utop(itop-2,jtop,ktop+1) +2.695306e-04*utop(itop-2,jtop,ktop+2) +4.620524e-05*utop(itop-2,jtop+1,ktop-2) -4.312489e-04*utop(itop-2,jtop+1,ktop-1) -1.940620e-03*utop(itop-2,jtop+1,ktop) +2.587494e-04*utop(itop-2,jtop+1,ktop+1) -3.593741e-05*utop(itop-2,jtop+1,ktop+2) -6.417395e-06*utop(itop-2,jtop+2,ktop-2) +5.989568e-05*utop(itop-2,jtop+2,ktop-1) +2.695306e-04*utop(itop-2,jtop+2,ktop) -3.593741e-05*utop(itop-2,jtop+2,ktop+1) +4.991307e-06*utop(itop-2,jtop+2,ktop+2) -5.940674e-05*utop(itop-1,jtop-2,ktop-2) +5.544629e-04*utop(itop-1,jtop-2,ktop-1) +2.495083e-03*utop(itop-1,jtop-2,ktop) -3.326777e-04*utop(itop-1,jtop-2,ktop+1) +4.620524e-05*utop(itop-1,jtop-2,ktop+2) +5.544629e-04*utop(itop-1,jtop-1,ktop-2) -5.174987e-03*utop(itop-1,jtop-1,ktop-1) -2.328744e-02*utop(itop-1,jtop-1,ktop) +3.104992e-03*utop(itop-1,jtop-1,ktop+1) -4.312489e-04*utop(itop-1,jtop-1,ktop+2) +2.495083e-03*utop(itop-1,jtop,ktop-2) -2.328744e-02*utop(itop-1,jtop,ktop-1) -1.047935e-01*utop(itop-1,jtop,ktop) +1.397246e-02*utop(itop-1,jtop,ktop+1) -1.940620e-03*utop(itop-1,jtop,ktop+2) -3.326777e-04*utop(itop-1,jtop+1,ktop-2) +3.104992e-03*utop(itop-1,jtop+1,ktop-1) +1.397246e-02*utop(itop-1,jtop+1,ktop) -1.862995e-03*utop(itop-1,jtop+1,ktop+1) +2.587494e-04*utop(itop-1,jtop+1,ktop+2) +4.620524e-05*utop(itop-1,jtop+2,ktop-2) -4.312489e-04*utop(itop-1,jtop+2,ktop-1) -1.940620e-03*utop(itop-1,jtop+2,ktop) +2.587494e-04*utop(itop-1,jtop+2,ktop+1) 
-3.593741e-05*utop(itop-1,jtop+2,ktop+2) +4.455505e-04*utop(itop,jtop-2,ktop-2) -4.158472e-03*utop(itop,jtop-2,ktop-1) -1.871312e-02*utop(itop,jtop-2,ktop) +2.495083e-03*utop(itop,jtop-2,ktop+1) -3.465393e-04*utop(itop,jtop-2,ktop+2) -4.158472e-03*utop(itop,jtop-1,ktop-2) +3.881240e-02*utop(itop,jtop-1,ktop-1) +1.746558e-01*utop(itop,jtop-1,ktop) -2.328744e-02*utop(itop,jtop-1,ktop+1) +3.234367e-03*utop(itop,jtop-1,ktop+2) -1.871312e-02*utop(itop,jtop,ktop-2) +1.746558e-01*utop(itop,jtop,ktop-1) +7.859512e-01*utop(itop,jtop,ktop) -1.047935e-01*utop(itop,jtop,ktop+1) +1.455465e-02*utop(itop,jtop,ktop+2) +2.495083e-03*utop(itop,jtop+1,ktop-2) -2.328744e-02*utop(itop,jtop+1,ktop-1) -1.047935e-01*utop(itop,jtop+1,ktop) +1.397246e-02*utop(itop,jtop+1,ktop+1) -1.940620e-03*utop(itop,jtop+1,ktop+2) -3.465393e-04*utop(itop,jtop+2,ktop-2) +3.234367e-03*utop(itop,jtop+2,ktop-1) +1.455465e-02*utop(itop,jtop+2,ktop) -1.940620e-03*utop(itop,jtop+2,ktop+1) +2.695306e-04*utop(itop,jtop+2,ktop+2) +9.901123e-05*utop(itop+1,jtop-2,ktop-2) -9.241048e-04*utop(itop+1,jtop-2,ktop-1) -4.158472e-03*utop(itop+1,jtop-2,ktop) +5.544629e-04*utop(itop+1,jtop-2,ktop+1) -7.700874e-05*utop(itop+1,jtop-2,ktop+2) -9.241048e-04*utop(itop+1,jtop-1,ktop-2) +8.624978e-03*utop(itop+1,jtop-1,ktop-1) +3.881240e-02*utop(itop+1,jtop-1,ktop) -5.174987e-03*utop(itop+1,jtop-1,ktop+1) +7.187482e-04*utop(itop+1,jtop-1,ktop+2) -4.158472e-03*utop(itop+1,jtop,ktop-2) +3.881240e-02*utop(itop+1,jtop,ktop-1) +1.746558e-01*utop(itop+1,jtop,ktop) -2.328744e-02*utop(itop+1,jtop,ktop+1) +3.234367e-03*utop(itop+1,jtop,ktop+2) +5.544629e-04*utop(itop+1,jtop+1,ktop-2) -5.174987e-03*utop(itop+1,jtop+1,ktop-1) -2.328744e-02*utop(itop+1,jtop+1,ktop) +3.104992e-03*utop(itop+1,jtop+1,ktop+1) -4.312489e-04*utop(itop+1,jtop+1,ktop+2) -7.700874e-05*utop(itop+1,jtop+2,ktop-2) +7.187482e-04*utop(itop+1,jtop+2,ktop-1) +3.234367e-03*utop(itop+1,jtop+2,ktop) -4.312489e-04*utop(itop+1,jtop+2,ktop+1) 
+5.989568e-05*utop(itop+1,jtop+2,ktop+2) -1.060835e-05*utop(itop+2,jtop-2,ktop-2) +9.901123e-05*utop(itop+2,jtop-2,ktop-1) +4.455505e-04*utop(itop+2,jtop-2,ktop) -5.940674e-05*utop(itop+2,jtop-2,ktop+1) +8.250936e-06*utop(itop+2,jtop-2,ktop+2) +9.901123e-05*utop(itop+2,jtop-1,ktop-2) -9.241048e-04*utop(itop+2,jtop-1,ktop-1) -4.158472e-03*utop(itop+2,jtop-1,ktop) +5.544629e-04*utop(itop+2,jtop-1,ktop+1) -7.700874e-05*utop(itop+2,jtop-1,ktop+2) +4.455505e-04*utop(itop+2,jtop,ktop-2) -4.158472e-03*utop(itop+2,jtop,ktop-1) -1.871312e-02*utop(itop+2,jtop,ktop) +2.495083e-03*utop(itop+2,jtop,ktop+1) -3.465393e-04*utop(itop+2,jtop,ktop+2) -5.940674e-05*utop(itop+2,jtop+1,ktop-2) +5.544629e-04*utop(itop+2,jtop+1,ktop-1) +2.495083e-03*utop(itop+2,jtop+1,ktop) -3.326777e-04*utop(itop+2,jtop+1,ktop+1) +4.620524e-05*utop(itop+2,jtop+1,ktop+2) +8.250936e-06*utop(itop+2,jtop+2,ktop-2) -7.700874e-05*utop(itop+2,jtop+2,ktop-1) -3.465393e-04*utop(itop+2,jtop+2,ktop) +4.620524e-05*utop(itop+2,jtop+2,ktop+1) -6.417395e-06*utop(itop+2,jtop+2,ktop+2));
-	u(i+1,j+0,k+1) = ( -6.417395e-06*utop(itop-2,jtop-2,ktop-2) +4.620524e-05*utop(itop-2,jtop-2,ktop-1) -3.465393e-04*utop(itop-2,jtop-2,ktop) -7.700874e-05*utop(itop-2,jtop-2,ktop+1) +8.250936e-06*utop(itop-2,jtop-2,ktop+2) +5.989568e-05*utop(itop-2,jtop-1,ktop-2) -4.312489e-04*utop(itop-2,jtop-1,ktop-1) +3.234367e-03*utop(itop-2,jtop-1,ktop) +7.187482e-04*utop(itop-2,jtop-1,ktop+1) -7.700874e-05*utop(itop-2,jtop-1,ktop+2) +2.695306e-04*utop(itop-2,jtop,ktop-2) -1.940620e-03*utop(itop-2,jtop,ktop-1) +1.455465e-02*utop(itop-2,jtop,ktop) +3.234367e-03*utop(itop-2,jtop,ktop+1) -3.465393e-04*utop(itop-2,jtop,ktop+2) -3.593741e-05*utop(itop-2,jtop+1,ktop-2) +2.587494e-04*utop(itop-2,jtop+1,ktop-1) -1.940620e-03*utop(itop-2,jtop+1,ktop) -4.312489e-04*utop(itop-2,jtop+1,ktop+1) +4.620524e-05*utop(itop-2,jtop+1,ktop+2) +4.991307e-06*utop(itop-2,jtop+2,ktop-2) -3.593741e-05*utop(itop-2,jtop+2,ktop-1) +2.695306e-04*utop(itop-2,jtop+2,ktop) +5.989568e-05*utop(itop-2,jtop+2,ktop+1) -6.417395e-06*utop(itop-2,jtop+2,ktop+2) +4.620524e-05*utop(itop-1,jtop-2,ktop-2) -3.326777e-04*utop(itop-1,jtop-2,ktop-1) +2.495083e-03*utop(itop-1,jtop-2,ktop) +5.544629e-04*utop(itop-1,jtop-2,ktop+1) -5.940674e-05*utop(itop-1,jtop-2,ktop+2) -4.312489e-04*utop(itop-1,jtop-1,ktop-2) +3.104992e-03*utop(itop-1,jtop-1,ktop-1) -2.328744e-02*utop(itop-1,jtop-1,ktop) -5.174987e-03*utop(itop-1,jtop-1,ktop+1) +5.544629e-04*utop(itop-1,jtop-1,ktop+2) -1.940620e-03*utop(itop-1,jtop,ktop-2) +1.397246e-02*utop(itop-1,jtop,ktop-1) -1.047935e-01*utop(itop-1,jtop,ktop) -2.328744e-02*utop(itop-1,jtop,ktop+1) +2.495083e-03*utop(itop-1,jtop,ktop+2) +2.587494e-04*utop(itop-1,jtop+1,ktop-2) -1.862995e-03*utop(itop-1,jtop+1,ktop-1) +1.397246e-02*utop(itop-1,jtop+1,ktop) +3.104992e-03*utop(itop-1,jtop+1,ktop+1) -3.326777e-04*utop(itop-1,jtop+1,ktop+2) -3.593741e-05*utop(itop-1,jtop+2,ktop-2) +2.587494e-04*utop(itop-1,jtop+2,ktop-1) -1.940620e-03*utop(itop-1,jtop+2,ktop) -4.312489e-04*utop(itop-1,jtop+2,ktop+1) 
+4.620524e-05*utop(itop-1,jtop+2,ktop+2) -3.465393e-04*utop(itop,jtop-2,ktop-2) +2.495083e-03*utop(itop,jtop-2,ktop-1) -1.871312e-02*utop(itop,jtop-2,ktop) -4.158472e-03*utop(itop,jtop-2,ktop+1) +4.455505e-04*utop(itop,jtop-2,ktop+2) +3.234367e-03*utop(itop,jtop-1,ktop-2) -2.328744e-02*utop(itop,jtop-1,ktop-1) +1.746558e-01*utop(itop,jtop-1,ktop) +3.881240e-02*utop(itop,jtop-1,ktop+1) -4.158472e-03*utop(itop,jtop-1,ktop+2) +1.455465e-02*utop(itop,jtop,ktop-2) -1.047935e-01*utop(itop,jtop,ktop-1) +7.859512e-01*utop(itop,jtop,ktop) +1.746558e-01*utop(itop,jtop,ktop+1) -1.871312e-02*utop(itop,jtop,ktop+2) -1.940620e-03*utop(itop,jtop+1,ktop-2) +1.397246e-02*utop(itop,jtop+1,ktop-1) -1.047935e-01*utop(itop,jtop+1,ktop) -2.328744e-02*utop(itop,jtop+1,ktop+1) +2.495083e-03*utop(itop,jtop+1,ktop+2) +2.695306e-04*utop(itop,jtop+2,ktop-2) -1.940620e-03*utop(itop,jtop+2,ktop-1) +1.455465e-02*utop(itop,jtop+2,ktop) +3.234367e-03*utop(itop,jtop+2,ktop+1) -3.465393e-04*utop(itop,jtop+2,ktop+2) -7.700874e-05*utop(itop+1,jtop-2,ktop-2) +5.544629e-04*utop(itop+1,jtop-2,ktop-1) -4.158472e-03*utop(itop+1,jtop-2,ktop) -9.241048e-04*utop(itop+1,jtop-2,ktop+1) +9.901123e-05*utop(itop+1,jtop-2,ktop+2) +7.187482e-04*utop(itop+1,jtop-1,ktop-2) -5.174987e-03*utop(itop+1,jtop-1,ktop-1) +3.881240e-02*utop(itop+1,jtop-1,ktop) +8.624978e-03*utop(itop+1,jtop-1,ktop+1) -9.241048e-04*utop(itop+1,jtop-1,ktop+2) +3.234367e-03*utop(itop+1,jtop,ktop-2) -2.328744e-02*utop(itop+1,jtop,ktop-1) +1.746558e-01*utop(itop+1,jtop,ktop) +3.881240e-02*utop(itop+1,jtop,ktop+1) -4.158472e-03*utop(itop+1,jtop,ktop+2) -4.312489e-04*utop(itop+1,jtop+1,ktop-2) +3.104992e-03*utop(itop+1,jtop+1,ktop-1) -2.328744e-02*utop(itop+1,jtop+1,ktop) -5.174987e-03*utop(itop+1,jtop+1,ktop+1) +5.544629e-04*utop(itop+1,jtop+1,ktop+2) +5.989568e-05*utop(itop+1,jtop+2,ktop-2) -4.312489e-04*utop(itop+1,jtop+2,ktop-1) +3.234367e-03*utop(itop+1,jtop+2,ktop) +7.187482e-04*utop(itop+1,jtop+2,ktop+1) 
-7.700874e-05*utop(itop+1,jtop+2,ktop+2) +8.250936e-06*utop(itop+2,jtop-2,ktop-2) -5.940674e-05*utop(itop+2,jtop-2,ktop-1) +4.455505e-04*utop(itop+2,jtop-2,ktop) +9.901123e-05*utop(itop+2,jtop-2,ktop+1) -1.060835e-05*utop(itop+2,jtop-2,ktop+2) -7.700874e-05*utop(itop+2,jtop-1,ktop-2) +5.544629e-04*utop(itop+2,jtop-1,ktop-1) -4.158472e-03*utop(itop+2,jtop-1,ktop) -9.241048e-04*utop(itop+2,jtop-1,ktop+1) +9.901123e-05*utop(itop+2,jtop-1,ktop+2) -3.465393e-04*utop(itop+2,jtop,ktop-2) +2.495083e-03*utop(itop+2,jtop,ktop-1) -1.871312e-02*utop(itop+2,jtop,ktop) -4.158472e-03*utop(itop+2,jtop,ktop+1) +4.455505e-04*utop(itop+2,jtop,ktop+2) +4.620524e-05*utop(itop+2,jtop+1,ktop-2) -3.326777e-04*utop(itop+2,jtop+1,ktop-1) +2.495083e-03*utop(itop+2,jtop+1,ktop) +5.544629e-04*utop(itop+2,jtop+1,ktop+1) -5.940674e-05*utop(itop+2,jtop+1,ktop+2) -6.417395e-06*utop(itop+2,jtop+2,ktop-2) +4.620524e-05*utop(itop+2,jtop+2,ktop-1) -3.465393e-04*utop(itop+2,jtop+2,ktop) -7.700874e-05*utop(itop+2,jtop+2,ktop+1) +8.250936e-06*utop(itop+2,jtop+2,ktop+2));
-	u(i+1,j+1,k+0) = ( -6.417395e-06*utop(itop-2,jtop-2,ktop-2) +5.989568e-05*utop(itop-2,jtop-2,ktop-1) +2.695306e-04*utop(itop-2,jtop-2,ktop) -3.593741e-05*utop(itop-2,jtop-2,ktop+1) +4.991307e-06*utop(itop-2,jtop-2,ktop+2) +4.620524e-05*utop(itop-2,jtop-1,ktop-2) -4.312489e-04*utop(itop-2,jtop-1,ktop-1) -1.940620e-03*utop(itop-2,jtop-1,ktop) +2.587494e-04*utop(itop-2,jtop-1,ktop+1) -3.593741e-05*utop(itop-2,jtop-1,ktop+2) -3.465393e-04*utop(itop-2,jtop,ktop-2) +3.234367e-03*utop(itop-2,jtop,ktop-1) +1.455465e-02*utop(itop-2,jtop,ktop) -1.940620e-03*utop(itop-2,jtop,ktop+1) +2.695306e-04*utop(itop-2,jtop,ktop+2) -7.700874e-05*utop(itop-2,jtop+1,ktop-2) +7.187482e-04*utop(itop-2,jtop+1,ktop-1) +3.234367e-03*utop(itop-2,jtop+1,ktop) -4.312489e-04*utop(itop-2,jtop+1,ktop+1) +5.989568e-05*utop(itop-2,jtop+1,ktop+2) +8.250936e-06*utop(itop-2,jtop+2,ktop-2) -7.700874e-05*utop(itop-2,jtop+2,ktop-1) -3.465393e-04*utop(itop-2,jtop+2,ktop) +4.620524e-05*utop(itop-2,jtop+2,ktop+1) -6.417395e-06*utop(itop-2,jtop+2,ktop+2) +4.620524e-05*utop(itop-1,jtop-2,ktop-2) -4.312489e-04*utop(itop-1,jtop-2,ktop-1) -1.940620e-03*utop(itop-1,jtop-2,ktop) +2.587494e-04*utop(itop-1,jtop-2,ktop+1) -3.593741e-05*utop(itop-1,jtop-2,ktop+2) -3.326777e-04*utop(itop-1,jtop-1,ktop-2) +3.104992e-03*utop(itop-1,jtop-1,ktop-1) +1.397246e-02*utop(itop-1,jtop-1,ktop) -1.862995e-03*utop(itop-1,jtop-1,ktop+1) +2.587494e-04*utop(itop-1,jtop-1,ktop+2) +2.495083e-03*utop(itop-1,jtop,ktop-2) -2.328744e-02*utop(itop-1,jtop,ktop-1) -1.047935e-01*utop(itop-1,jtop,ktop) +1.397246e-02*utop(itop-1,jtop,ktop+1) -1.940620e-03*utop(itop-1,jtop,ktop+2) +5.544629e-04*utop(itop-1,jtop+1,ktop-2) -5.174987e-03*utop(itop-1,jtop+1,ktop-1) -2.328744e-02*utop(itop-1,jtop+1,ktop) +3.104992e-03*utop(itop-1,jtop+1,ktop+1) -4.312489e-04*utop(itop-1,jtop+1,ktop+2) -5.940674e-05*utop(itop-1,jtop+2,ktop-2) +5.544629e-04*utop(itop-1,jtop+2,ktop-1) +2.495083e-03*utop(itop-1,jtop+2,ktop) -3.326777e-04*utop(itop-1,jtop+2,ktop+1) 
+4.620524e-05*utop(itop-1,jtop+2,ktop+2) -3.465393e-04*utop(itop,jtop-2,ktop-2) +3.234367e-03*utop(itop,jtop-2,ktop-1) +1.455465e-02*utop(itop,jtop-2,ktop) -1.940620e-03*utop(itop,jtop-2,ktop+1) +2.695306e-04*utop(itop,jtop-2,ktop+2) +2.495083e-03*utop(itop,jtop-1,ktop-2) -2.328744e-02*utop(itop,jtop-1,ktop-1) -1.047935e-01*utop(itop,jtop-1,ktop) +1.397246e-02*utop(itop,jtop-1,ktop+1) -1.940620e-03*utop(itop,jtop-1,ktop+2) -1.871312e-02*utop(itop,jtop,ktop-2) +1.746558e-01*utop(itop,jtop,ktop-1) +7.859512e-01*utop(itop,jtop,ktop) -1.047935e-01*utop(itop,jtop,ktop+1) +1.455465e-02*utop(itop,jtop,ktop+2) -4.158472e-03*utop(itop,jtop+1,ktop-2) +3.881240e-02*utop(itop,jtop+1,ktop-1) +1.746558e-01*utop(itop,jtop+1,ktop) -2.328744e-02*utop(itop,jtop+1,ktop+1) +3.234367e-03*utop(itop,jtop+1,ktop+2) +4.455505e-04*utop(itop,jtop+2,ktop-2) -4.158472e-03*utop(itop,jtop+2,ktop-1) -1.871312e-02*utop(itop,jtop+2,ktop) +2.495083e-03*utop(itop,jtop+2,ktop+1) -3.465393e-04*utop(itop,jtop+2,ktop+2) -7.700874e-05*utop(itop+1,jtop-2,ktop-2) +7.187482e-04*utop(itop+1,jtop-2,ktop-1) +3.234367e-03*utop(itop+1,jtop-2,ktop) -4.312489e-04*utop(itop+1,jtop-2,ktop+1) +5.989568e-05*utop(itop+1,jtop-2,ktop+2) +5.544629e-04*utop(itop+1,jtop-1,ktop-2) -5.174987e-03*utop(itop+1,jtop-1,ktop-1) -2.328744e-02*utop(itop+1,jtop-1,ktop) +3.104992e-03*utop(itop+1,jtop-1,ktop+1) -4.312489e-04*utop(itop+1,jtop-1,ktop+2) -4.158472e-03*utop(itop+1,jtop,ktop-2) +3.881240e-02*utop(itop+1,jtop,ktop-1) +1.746558e-01*utop(itop+1,jtop,ktop) -2.328744e-02*utop(itop+1,jtop,ktop+1) +3.234367e-03*utop(itop+1,jtop,ktop+2) -9.241048e-04*utop(itop+1,jtop+1,ktop-2) +8.624978e-03*utop(itop+1,jtop+1,ktop-1) +3.881240e-02*utop(itop+1,jtop+1,ktop) -5.174987e-03*utop(itop+1,jtop+1,ktop+1) +7.187482e-04*utop(itop+1,jtop+1,ktop+2) +9.901123e-05*utop(itop+1,jtop+2,ktop-2) -9.241048e-04*utop(itop+1,jtop+2,ktop-1) -4.158472e-03*utop(itop+1,jtop+2,ktop) +5.544629e-04*utop(itop+1,jtop+2,ktop+1) 
-7.700874e-05*utop(itop+1,jtop+2,ktop+2) +8.250936e-06*utop(itop+2,jtop-2,ktop-2) -7.700874e-05*utop(itop+2,jtop-2,ktop-1) -3.465393e-04*utop(itop+2,jtop-2,ktop) +4.620524e-05*utop(itop+2,jtop-2,ktop+1) -6.417395e-06*utop(itop+2,jtop-2,ktop+2) -5.940674e-05*utop(itop+2,jtop-1,ktop-2) +5.544629e-04*utop(itop+2,jtop-1,ktop-1) +2.495083e-03*utop(itop+2,jtop-1,ktop) -3.326777e-04*utop(itop+2,jtop-1,ktop+1) +4.620524e-05*utop(itop+2,jtop-1,ktop+2) +4.455505e-04*utop(itop+2,jtop,ktop-2) -4.158472e-03*utop(itop+2,jtop,ktop-1) -1.871312e-02*utop(itop+2,jtop,ktop) +2.495083e-03*utop(itop+2,jtop,ktop+1) -3.465393e-04*utop(itop+2,jtop,ktop+2) +9.901123e-05*utop(itop+2,jtop+1,ktop-2) -9.241048e-04*utop(itop+2,jtop+1,ktop-1) -4.158472e-03*utop(itop+2,jtop+1,ktop) +5.544629e-04*utop(itop+2,jtop+1,ktop+1) -7.700874e-05*utop(itop+2,jtop+1,ktop+2) -1.060835e-05*utop(itop+2,jtop+2,ktop-2) +9.901123e-05*utop(itop+2,jtop+2,ktop-1) +4.455505e-04*utop(itop+2,jtop+2,ktop) -5.940674e-05*utop(itop+2,jtop+2,ktop+1) +8.250936e-06*utop(itop+2,jtop+2,ktop+2));
-	u(i+1,j+1,k+1) = ( +4.991307e-06*utop(itop-2,jtop-2,ktop-2) -3.593741e-05*utop(itop-2,jtop-2,ktop-1) +2.695306e-04*utop(itop-2,jtop-2,ktop) +5.989568e-05*utop(itop-2,jtop-2,ktop+1) -6.417395e-06*utop(itop-2,jtop-2,ktop+2) -3.593741e-05*utop(itop-2,jtop-1,ktop-2) +2.587494e-04*utop(itop-2,jtop-1,ktop-1) -1.940620e-03*utop(itop-2,jtop-1,ktop) -4.312489e-04*utop(itop-2,jtop-1,ktop+1) +4.620524e-05*utop(itop-2,jtop-1,ktop+2) +2.695306e-04*utop(itop-2,jtop,ktop-2) -1.940620e-03*utop(itop-2,jtop,ktop-1) +1.455465e-02*utop(itop-2,jtop,ktop) +3.234367e-03*utop(itop-2,jtop,ktop+1) -3.465393e-04*utop(itop-2,jtop,ktop+2) +5.989568e-05*utop(itop-2,jtop+1,ktop-2) -4.312489e-04*utop(itop-2,jtop+1,ktop-1) +3.234367e-03*utop(itop-2,jtop+1,ktop) +7.187482e-04*utop(itop-2,jtop+1,ktop+1) -7.700874e-05*utop(itop-2,jtop+1,ktop+2) -6.417395e-06*utop(itop-2,jtop+2,ktop-2) +4.620524e-05*utop(itop-2,jtop+2,ktop-1) -3.465393e-04*utop(itop-2,jtop+2,ktop) -7.700874e-05*utop(itop-2,jtop+2,ktop+1) +8.250936e-06*utop(itop-2,jtop+2,ktop+2) -3.593741e-05*utop(itop-1,jtop-2,ktop-2) +2.587494e-04*utop(itop-1,jtop-2,ktop-1) -1.940620e-03*utop(itop-1,jtop-2,ktop) -4.312489e-04*utop(itop-1,jtop-2,ktop+1) +4.620524e-05*utop(itop-1,jtop-2,ktop+2) +2.587494e-04*utop(itop-1,jtop-1,ktop-2) -1.862995e-03*utop(itop-1,jtop-1,ktop-1) +1.397246e-02*utop(itop-1,jtop-1,ktop) +3.104992e-03*utop(itop-1,jtop-1,ktop+1) -3.326777e-04*utop(itop-1,jtop-1,ktop+2) -1.940620e-03*utop(itop-1,jtop,ktop-2) +1.397246e-02*utop(itop-1,jtop,ktop-1) -1.047935e-01*utop(itop-1,jtop,ktop) -2.328744e-02*utop(itop-1,jtop,ktop+1) +2.495083e-03*utop(itop-1,jtop,ktop+2) -4.312489e-04*utop(itop-1,jtop+1,ktop-2) +3.104992e-03*utop(itop-1,jtop+1,ktop-1) -2.328744e-02*utop(itop-1,jtop+1,ktop) -5.174987e-03*utop(itop-1,jtop+1,ktop+1) +5.544629e-04*utop(itop-1,jtop+1,ktop+2) +4.620524e-05*utop(itop-1,jtop+2,ktop-2) -3.326777e-04*utop(itop-1,jtop+2,ktop-1) +2.495083e-03*utop(itop-1,jtop+2,ktop) +5.544629e-04*utop(itop-1,jtop+2,ktop+1) 
-5.940674e-05*utop(itop-1,jtop+2,ktop+2) +2.695306e-04*utop(itop,jtop-2,ktop-2) -1.940620e-03*utop(itop,jtop-2,ktop-1) +1.455465e-02*utop(itop,jtop-2,ktop) +3.234367e-03*utop(itop,jtop-2,ktop+1) -3.465393e-04*utop(itop,jtop-2,ktop+2) -1.940620e-03*utop(itop,jtop-1,ktop-2) +1.397246e-02*utop(itop,jtop-1,ktop-1) -1.047935e-01*utop(itop,jtop-1,ktop) -2.328744e-02*utop(itop,jtop-1,ktop+1) +2.495083e-03*utop(itop,jtop-1,ktop+2) +1.455465e-02*utop(itop,jtop,ktop-2) -1.047935e-01*utop(itop,jtop,ktop-1) +7.859512e-01*utop(itop,jtop,ktop) +1.746558e-01*utop(itop,jtop,ktop+1) -1.871312e-02*utop(itop,jtop,ktop+2) +3.234367e-03*utop(itop,jtop+1,ktop-2) -2.328744e-02*utop(itop,jtop+1,ktop-1) +1.746558e-01*utop(itop,jtop+1,ktop) +3.881240e-02*utop(itop,jtop+1,ktop+1) -4.158472e-03*utop(itop,jtop+1,ktop+2) -3.465393e-04*utop(itop,jtop+2,ktop-2) +2.495083e-03*utop(itop,jtop+2,ktop-1) -1.871312e-02*utop(itop,jtop+2,ktop) -4.158472e-03*utop(itop,jtop+2,ktop+1) +4.455505e-04*utop(itop,jtop+2,ktop+2) +5.989568e-05*utop(itop+1,jtop-2,ktop-2) -4.312489e-04*utop(itop+1,jtop-2,ktop-1) +3.234367e-03*utop(itop+1,jtop-2,ktop) +7.187482e-04*utop(itop+1,jtop-2,ktop+1) -7.700874e-05*utop(itop+1,jtop-2,ktop+2) -4.312489e-04*utop(itop+1,jtop-1,ktop-2) +3.104992e-03*utop(itop+1,jtop-1,ktop-1) -2.328744e-02*utop(itop+1,jtop-1,ktop) -5.174987e-03*utop(itop+1,jtop-1,ktop+1) +5.544629e-04*utop(itop+1,jtop-1,ktop+2) +3.234367e-03*utop(itop+1,jtop,ktop-2) -2.328744e-02*utop(itop+1,jtop,ktop-1) +1.746558e-01*utop(itop+1,jtop,ktop) +3.881240e-02*utop(itop+1,jtop,ktop+1) -4.158472e-03*utop(itop+1,jtop,ktop+2) +7.187482e-04*utop(itop+1,jtop+1,ktop-2) -5.174987e-03*utop(itop+1,jtop+1,ktop-1) +3.881240e-02*utop(itop+1,jtop+1,ktop) +8.624978e-03*utop(itop+1,jtop+1,ktop+1) -9.241048e-04*utop(itop+1,jtop+1,ktop+2) -7.700874e-05*utop(itop+1,jtop+2,ktop-2) +5.544629e-04*utop(itop+1,jtop+2,ktop-1) -4.158472e-03*utop(itop+1,jtop+2,ktop) -9.241048e-04*utop(itop+1,jtop+2,ktop+1) 
+9.901123e-05*utop(itop+1,jtop+2,ktop+2) -6.417395e-06*utop(itop+2,jtop-2,ktop-2) +4.620524e-05*utop(itop+2,jtop-2,ktop-1) -3.465393e-04*utop(itop+2,jtop-2,ktop) -7.700874e-05*utop(itop+2,jtop-2,ktop+1) +8.250936e-06*utop(itop+2,jtop-2,ktop+2) +4.620524e-05*utop(itop+2,jtop-1,ktop-2) -3.326777e-04*utop(itop+2,jtop-1,ktop-1) +2.495083e-03*utop(itop+2,jtop-1,ktop) +5.544629e-04*utop(itop+2,jtop-1,ktop+1) -5.940674e-05*utop(itop+2,jtop-1,ktop+2) -3.465393e-04*utop(itop+2,jtop,ktop-2) +2.495083e-03*utop(itop+2,jtop,ktop-1) -1.871312e-02*utop(itop+2,jtop,ktop) -4.158472e-03*utop(itop+2,jtop,ktop+1) +4.455505e-04*utop(itop+2,jtop,ktop+2) -7.700874e-05*utop(itop+2,jtop+1,ktop-2) +5.544629e-04*utop(itop+2,jtop+1,ktop-1) -4.158472e-03*utop(itop+2,jtop+1,ktop) -9.241048e-04*utop(itop+2,jtop+1,ktop+1) +9.901123e-05*utop(itop+2,jtop+1,ktop+2) +8.250936e-06*utop(itop+2,jtop+2,ktop-2) -5.940674e-05*utop(itop+2,jtop+2,ktop-1) +4.455505e-04*utop(itop+2,jtop+2,ktop) +9.901123e-05*utop(itop+2,jtop+2,ktop+1) -1.060835e-05*utop(itop+2,jtop+2,ktop+2));
-	
 
+	u(i + 0, j + 0, k + 0) = (-1.060835e-05 * utop(itop - 2, jtop - 2, ktop - 2) + 9.901123e-05 * utop(itop - 2, jtop - 2, ktop - 1) + 4.455505e-04 * utop(itop - 2, jtop - 2, ktop) - 5.940674e-05 * utop(itop - 2, jtop - 2, ktop + 1) + 8.250936e-06 * utop(itop - 2, jtop - 2, ktop + 2) + 9.901123e-05 * utop(itop - 2, jtop - 1, ktop - 2) - 9.241048e-04 * utop(itop - 2, jtop - 1, ktop - 1) - 4.158472e-03 * utop(itop - 2, jtop - 1, ktop) + 5.544629e-04 * utop(itop - 2, jtop - 1, ktop + 1) - 7.700874e-05 * utop(itop - 2, jtop - 1, ktop + 2) + 4.455505e-04 * utop(itop - 2, jtop, ktop - 2) - 4.158472e-03 * utop(itop - 2, jtop, ktop - 1) - 1.871312e-02 * utop(itop - 2, jtop, ktop) + 2.495083e-03 * utop(itop - 2, jtop, ktop + 1) - 3.465393e-04 * utop(itop - 2, jtop, ktop + 2) - 5.940674e-05 * utop(itop - 2, jtop + 1, ktop - 2) + 5.544629e-04 * utop(itop - 2, jtop + 1, ktop - 1) + 2.495083e-03 * utop(itop - 2, jtop + 1, ktop) - 3.326777e-04 * utop(itop - 2, jtop + 1, ktop + 1) + 4.620524e-05 * utop(itop - 2, jtop + 1, ktop + 2) + 8.250936e-06 * utop(itop - 2, jtop + 2, ktop - 2) - 7.700874e-05 * utop(itop - 2, jtop + 2, ktop - 1) - 3.465393e-04 * utop(itop - 2, jtop + 2, ktop) + 4.620524e-05 * utop(itop - 2, jtop + 2, ktop + 1) - 6.417395e-06 * utop(itop - 2, jtop + 2, ktop + 2) + 9.901123e-05 * utop(itop - 1, jtop - 2, ktop - 2) - 9.241048e-04 * utop(itop - 1, jtop - 2, ktop - 1) - 4.158472e-03 * utop(itop - 1, jtop - 2, ktop) + 5.544629e-04 * utop(itop - 1, jtop - 2, ktop + 1) - 7.700874e-05 * utop(itop - 1, jtop - 2, ktop + 2) - 9.241048e-04 * utop(itop - 1, jtop - 1, ktop - 2) + 8.624978e-03 * utop(itop - 1, jtop - 1, ktop - 1) + 3.881240e-02 * utop(itop - 1, jtop - 1, ktop) - 5.174987e-03 * utop(itop - 1, jtop - 1, ktop + 1) + 7.187482e-04 * utop(itop - 1, jtop - 1, ktop + 2) - 4.158472e-03 * utop(itop - 1, jtop, ktop - 2) + 3.881240e-02 * utop(itop - 1, jtop, ktop - 1) + 1.746558e-01 * utop(itop - 1, jtop, ktop) - 2.328744e-02 * utop(itop - 1, jtop, ktop + 1) + 
3.234367e-03 * utop(itop - 1, jtop, ktop + 2) + 5.544629e-04 * utop(itop - 1, jtop + 1, ktop - 2) - 5.174987e-03 * utop(itop - 1, jtop + 1, ktop - 1) - 2.328744e-02 * utop(itop - 1, jtop + 1, ktop) + 3.104992e-03 * utop(itop - 1, jtop + 1, ktop + 1) - 4.312489e-04 * utop(itop - 1, jtop + 1, ktop + 2) - 7.700874e-05 * utop(itop - 1, jtop + 2, ktop - 2) + 7.187482e-04 * utop(itop - 1, jtop + 2, ktop - 1) + 3.234367e-03 * utop(itop - 1, jtop + 2, ktop) - 4.312489e-04 * utop(itop - 1, jtop + 2, ktop + 1) + 5.989568e-05 * utop(itop - 1, jtop + 2, ktop + 2) + 4.455505e-04 * utop(itop, jtop - 2, ktop - 2) - 4.158472e-03 * utop(itop, jtop - 2, ktop - 1) - 1.871312e-02 * utop(itop, jtop - 2, ktop) + 2.495083e-03 * utop(itop, jtop - 2, ktop + 1) - 3.465393e-04 * utop(itop, jtop - 2, ktop + 2) - 4.158472e-03 * utop(itop, jtop - 1, ktop - 2) + 3.881240e-02 * utop(itop, jtop - 1, ktop - 1) + 1.746558e-01 * utop(itop, jtop - 1, ktop) - 2.328744e-02 * utop(itop, jtop - 1, ktop + 1) + 3.234367e-03 * utop(itop, jtop - 1, ktop + 2) - 1.871312e-02 * utop(itop, jtop, ktop - 2) + 1.746558e-01 * utop(itop, jtop, ktop - 1) + 7.859512e-01 * utop(itop, jtop, ktop) - 1.047935e-01 * utop(itop, jtop, ktop + 1) + 1.455465e-02 * utop(itop, jtop, ktop + 2) + 2.495083e-03 * utop(itop, jtop + 1, ktop - 2) - 2.328744e-02 * utop(itop, jtop + 1, ktop - 1) - 1.047935e-01 * utop(itop, jtop + 1, ktop) + 1.397246e-02 * utop(itop, jtop + 1, ktop + 1) - 1.940620e-03 * utop(itop, jtop + 1, ktop + 2) - 3.465393e-04 * utop(itop, jtop + 2, ktop - 2) + 3.234367e-03 * utop(itop, jtop + 2, ktop - 1) + 1.455465e-02 * utop(itop, jtop + 2, ktop) - 1.940620e-03 * utop(itop, jtop + 2, ktop + 1) + 2.695306e-04 * utop(itop, jtop + 2, ktop + 2) - 5.940674e-05 * utop(itop + 1, jtop - 2, ktop - 2) + 5.544629e-04 * utop(itop + 1, jtop - 2, ktop - 1) + 2.495083e-03 * utop(itop + 1, jtop - 2, ktop) - 3.326777e-04 * utop(itop + 1, jtop - 2, ktop + 1) + 4.620524e-05 * utop(itop + 1, jtop - 2, ktop + 2) + 5.544629e-04 * 
utop(itop + 1, jtop - 1, ktop - 2) - 5.174987e-03 * utop(itop + 1, jtop - 1, ktop - 1) - 2.328744e-02 * utop(itop + 1, jtop - 1, ktop) + 3.104992e-03 * utop(itop + 1, jtop - 1, ktop + 1) - 4.312489e-04 * utop(itop + 1, jtop - 1, ktop + 2) + 2.495083e-03 * utop(itop + 1, jtop, ktop - 2) - 2.328744e-02 * utop(itop + 1, jtop, ktop - 1) - 1.047935e-01 * utop(itop + 1, jtop, ktop) + 1.397246e-02 * utop(itop + 1, jtop, ktop + 1) - 1.940620e-03 * utop(itop + 1, jtop, ktop + 2) - 3.326777e-04 * utop(itop + 1, jtop + 1, ktop - 2) + 3.104992e-03 * utop(itop + 1, jtop + 1, ktop - 1) + 1.397246e-02 * utop(itop + 1, jtop + 1, ktop) - 1.862995e-03 * utop(itop + 1, jtop + 1, ktop + 1) + 2.587494e-04 * utop(itop + 1, jtop + 1, ktop + 2) + 4.620524e-05 * utop(itop + 1, jtop + 2, ktop - 2) - 4.312489e-04 * utop(itop + 1, jtop + 2, ktop - 1) - 1.940620e-03 * utop(itop + 1, jtop + 2, ktop) + 2.587494e-04 * utop(itop + 1, jtop + 2, ktop + 1) - 3.593741e-05 * utop(itop + 1, jtop + 2, ktop + 2) + 8.250936e-06 * utop(itop + 2, jtop - 2, ktop - 2) - 7.700874e-05 * utop(itop + 2, jtop - 2, ktop - 1) - 3.465393e-04 * utop(itop + 2, jtop - 2, ktop) + 4.620524e-05 * utop(itop + 2, jtop - 2, ktop + 1) - 6.417395e-06 * utop(itop + 2, jtop - 2, ktop + 2) - 7.700874e-05 * utop(itop + 2, jtop - 1, ktop - 2) + 7.187482e-04 * utop(itop + 2, jtop - 1, ktop - 1) + 3.234367e-03 * utop(itop + 2, jtop - 1, ktop) - 4.312489e-04 * utop(itop + 2, jtop - 1, ktop + 1) + 5.989568e-05 * utop(itop + 2, jtop - 1, ktop + 2) - 3.465393e-04 * utop(itop + 2, jtop, ktop - 2) + 3.234367e-03 * utop(itop + 2, jtop, ktop - 1) + 1.455465e-02 * utop(itop + 2, jtop, ktop) - 1.940620e-03 * utop(itop + 2, jtop, ktop + 1) + 2.695306e-04 * utop(itop + 2, jtop, ktop + 2) + 4.620524e-05 * utop(itop + 2, jtop + 1, ktop - 2) - 4.312489e-04 * utop(itop + 2, jtop + 1, ktop - 1) - 1.940620e-03 * utop(itop + 2, jtop + 1, ktop) + 2.587494e-04 * utop(itop + 2, jtop + 1, ktop + 1) - 3.593741e-05 * utop(itop + 2, jtop + 1, ktop + 2) - 
6.417395e-06 * utop(itop + 2, jtop + 2, ktop - 2) + 5.989568e-05 * utop(itop + 2, jtop + 2, ktop - 1) + 2.695306e-04 * utop(itop + 2, jtop + 2, ktop) - 3.593741e-05 * utop(itop + 2, jtop + 2, ktop + 1) + 4.991307e-06 * utop(itop + 2, jtop + 2, ktop + 2));
+	u(i + 0, j + 0, k + 1) = (+8.250936e-06 * utop(itop - 2, jtop - 2, ktop - 2) - 5.940674e-05 * utop(itop - 2, jtop - 2, ktop - 1) + 4.455505e-04 * utop(itop - 2, jtop - 2, ktop) + 9.901123e-05 * utop(itop - 2, jtop - 2, ktop + 1) - 1.060835e-05 * utop(itop - 2, jtop - 2, ktop + 2) - 7.700874e-05 * utop(itop - 2, jtop - 1, ktop - 2) + 5.544629e-04 * utop(itop - 2, jtop - 1, ktop - 1) - 4.158472e-03 * utop(itop - 2, jtop - 1, ktop) - 9.241048e-04 * utop(itop - 2, jtop - 1, ktop + 1) + 9.901123e-05 * utop(itop - 2, jtop - 1, ktop + 2) - 3.465393e-04 * utop(itop - 2, jtop, ktop - 2) + 2.495083e-03 * utop(itop - 2, jtop, ktop - 1) - 1.871312e-02 * utop(itop - 2, jtop, ktop) - 4.158472e-03 * utop(itop - 2, jtop, ktop + 1) + 4.455505e-04 * utop(itop - 2, jtop, ktop + 2) + 4.620524e-05 * utop(itop - 2, jtop + 1, ktop - 2) - 3.326777e-04 * utop(itop - 2, jtop + 1, ktop - 1) + 2.495083e-03 * utop(itop - 2, jtop + 1, ktop) + 5.544629e-04 * utop(itop - 2, jtop + 1, ktop + 1) - 5.940674e-05 * utop(itop - 2, jtop + 1, ktop + 2) - 6.417395e-06 * utop(itop - 2, jtop + 2, ktop - 2) + 4.620524e-05 * utop(itop - 2, jtop + 2, ktop - 1) - 3.465393e-04 * utop(itop - 2, jtop + 2, ktop) - 7.700874e-05 * utop(itop - 2, jtop + 2, ktop + 1) + 8.250936e-06 * utop(itop - 2, jtop + 2, ktop + 2) - 7.700874e-05 * utop(itop - 1, jtop - 2, ktop - 2) + 5.544629e-04 * utop(itop - 1, jtop - 2, ktop - 1) - 4.158472e-03 * utop(itop - 1, jtop - 2, ktop) - 9.241048e-04 * utop(itop - 1, jtop - 2, ktop + 1) + 9.901123e-05 * utop(itop - 1, jtop - 2, ktop + 2) + 7.187482e-04 * utop(itop - 1, jtop - 1, ktop - 2) - 5.174987e-03 * utop(itop - 1, jtop - 1, ktop - 1) + 3.881240e-02 * utop(itop - 1, jtop - 1, ktop) + 8.624978e-03 * utop(itop - 1, jtop - 1, ktop + 1) - 9.241048e-04 * utop(itop - 1, jtop - 1, ktop + 2) + 3.234367e-03 * utop(itop - 1, jtop, ktop - 2) - 2.328744e-02 * utop(itop - 1, jtop, ktop - 1) + 1.746558e-01 * utop(itop - 1, jtop, ktop) + 3.881240e-02 * utop(itop - 1, jtop, ktop + 1) - 
4.158472e-03 * utop(itop - 1, jtop, ktop + 2) - 4.312489e-04 * utop(itop - 1, jtop + 1, ktop - 2) + 3.104992e-03 * utop(itop - 1, jtop + 1, ktop - 1) - 2.328744e-02 * utop(itop - 1, jtop + 1, ktop) - 5.174987e-03 * utop(itop - 1, jtop + 1, ktop + 1) + 5.544629e-04 * utop(itop - 1, jtop + 1, ktop + 2) + 5.989568e-05 * utop(itop - 1, jtop + 2, ktop - 2) - 4.312489e-04 * utop(itop - 1, jtop + 2, ktop - 1) + 3.234367e-03 * utop(itop - 1, jtop + 2, ktop) + 7.187482e-04 * utop(itop - 1, jtop + 2, ktop + 1) - 7.700874e-05 * utop(itop - 1, jtop + 2, ktop + 2) - 3.465393e-04 * utop(itop, jtop - 2, ktop - 2) + 2.495083e-03 * utop(itop, jtop - 2, ktop - 1) - 1.871312e-02 * utop(itop, jtop - 2, ktop) - 4.158472e-03 * utop(itop, jtop - 2, ktop + 1) + 4.455505e-04 * utop(itop, jtop - 2, ktop + 2) + 3.234367e-03 * utop(itop, jtop - 1, ktop - 2) - 2.328744e-02 * utop(itop, jtop - 1, ktop - 1) + 1.746558e-01 * utop(itop, jtop - 1, ktop) + 3.881240e-02 * utop(itop, jtop - 1, ktop + 1) - 4.158472e-03 * utop(itop, jtop - 1, ktop + 2) + 1.455465e-02 * utop(itop, jtop, ktop - 2) - 1.047935e-01 * utop(itop, jtop, ktop - 1) + 7.859512e-01 * utop(itop, jtop, ktop) + 1.746558e-01 * utop(itop, jtop, ktop + 1) - 1.871312e-02 * utop(itop, jtop, ktop + 2) - 1.940620e-03 * utop(itop, jtop + 1, ktop - 2) + 1.397246e-02 * utop(itop, jtop + 1, ktop - 1) - 1.047935e-01 * utop(itop, jtop + 1, ktop) - 2.328744e-02 * utop(itop, jtop + 1, ktop + 1) + 2.495083e-03 * utop(itop, jtop + 1, ktop + 2) + 2.695306e-04 * utop(itop, jtop + 2, ktop - 2) - 1.940620e-03 * utop(itop, jtop + 2, ktop - 1) + 1.455465e-02 * utop(itop, jtop + 2, ktop) + 3.234367e-03 * utop(itop, jtop + 2, ktop + 1) - 3.465393e-04 * utop(itop, jtop + 2, ktop + 2) + 4.620524e-05 * utop(itop + 1, jtop - 2, ktop - 2) - 3.326777e-04 * utop(itop + 1, jtop - 2, ktop - 1) + 2.495083e-03 * utop(itop + 1, jtop - 2, ktop) + 5.544629e-04 * utop(itop + 1, jtop - 2, ktop + 1) - 5.940674e-05 * utop(itop + 1, jtop - 2, ktop + 2) - 4.312489e-04 * 
utop(itop + 1, jtop - 1, ktop - 2) + 3.104992e-03 * utop(itop + 1, jtop - 1, ktop - 1) - 2.328744e-02 * utop(itop + 1, jtop - 1, ktop) - 5.174987e-03 * utop(itop + 1, jtop - 1, ktop + 1) + 5.544629e-04 * utop(itop + 1, jtop - 1, ktop + 2) - 1.940620e-03 * utop(itop + 1, jtop, ktop - 2) + 1.397246e-02 * utop(itop + 1, jtop, ktop - 1) - 1.047935e-01 * utop(itop + 1, jtop, ktop) - 2.328744e-02 * utop(itop + 1, jtop, ktop + 1) + 2.495083e-03 * utop(itop + 1, jtop, ktop + 2) + 2.587494e-04 * utop(itop + 1, jtop + 1, ktop - 2) - 1.862995e-03 * utop(itop + 1, jtop + 1, ktop - 1) + 1.397246e-02 * utop(itop + 1, jtop + 1, ktop) + 3.104992e-03 * utop(itop + 1, jtop + 1, ktop + 1) - 3.326777e-04 * utop(itop + 1, jtop + 1, ktop + 2) - 3.593741e-05 * utop(itop + 1, jtop + 2, ktop - 2) + 2.587494e-04 * utop(itop + 1, jtop + 2, ktop - 1) - 1.940620e-03 * utop(itop + 1, jtop + 2, ktop) - 4.312489e-04 * utop(itop + 1, jtop + 2, ktop + 1) + 4.620524e-05 * utop(itop + 1, jtop + 2, ktop + 2) - 6.417395e-06 * utop(itop + 2, jtop - 2, ktop - 2) + 4.620524e-05 * utop(itop + 2, jtop - 2, ktop - 1) - 3.465393e-04 * utop(itop + 2, jtop - 2, ktop) - 7.700874e-05 * utop(itop + 2, jtop - 2, ktop + 1) + 8.250936e-06 * utop(itop + 2, jtop - 2, ktop + 2) + 5.989568e-05 * utop(itop + 2, jtop - 1, ktop - 2) - 4.312489e-04 * utop(itop + 2, jtop - 1, ktop - 1) + 3.234367e-03 * utop(itop + 2, jtop - 1, ktop) + 7.187482e-04 * utop(itop + 2, jtop - 1, ktop + 1) - 7.700874e-05 * utop(itop + 2, jtop - 1, ktop + 2) + 2.695306e-04 * utop(itop + 2, jtop, ktop - 2) - 1.940620e-03 * utop(itop + 2, jtop, ktop - 1) + 1.455465e-02 * utop(itop + 2, jtop, ktop) + 3.234367e-03 * utop(itop + 2, jtop, ktop + 1) - 3.465393e-04 * utop(itop + 2, jtop, ktop + 2) - 3.593741e-05 * utop(itop + 2, jtop + 1, ktop - 2) + 2.587494e-04 * utop(itop + 2, jtop + 1, ktop - 1) - 1.940620e-03 * utop(itop + 2, jtop + 1, ktop) - 4.312489e-04 * utop(itop + 2, jtop + 1, ktop + 1) + 4.620524e-05 * utop(itop + 2, jtop + 1, ktop + 2) + 
4.991307e-06 * utop(itop + 2, jtop + 2, ktop - 2) - 3.593741e-05 * utop(itop + 2, jtop + 2, ktop - 1) + 2.695306e-04 * utop(itop + 2, jtop + 2, ktop) + 5.989568e-05 * utop(itop + 2, jtop + 2, ktop + 1) - 6.417395e-06 * utop(itop + 2, jtop + 2, ktop + 2));
+	u(i + 0, j + 1, k + 0) = (+8.250936e-06 * utop(itop - 2, jtop - 2, ktop - 2) - 7.700874e-05 * utop(itop - 2, jtop - 2, ktop - 1) - 3.465393e-04 * utop(itop - 2, jtop - 2, ktop) + 4.620524e-05 * utop(itop - 2, jtop - 2, ktop + 1) - 6.417395e-06 * utop(itop - 2, jtop - 2, ktop + 2) - 5.940674e-05 * utop(itop - 2, jtop - 1, ktop - 2) + 5.544629e-04 * utop(itop - 2, jtop - 1, ktop - 1) + 2.495083e-03 * utop(itop - 2, jtop - 1, ktop) - 3.326777e-04 * utop(itop - 2, jtop - 1, ktop + 1) + 4.620524e-05 * utop(itop - 2, jtop - 1, ktop + 2) + 4.455505e-04 * utop(itop - 2, jtop, ktop - 2) - 4.158472e-03 * utop(itop - 2, jtop, ktop - 1) - 1.871312e-02 * utop(itop - 2, jtop, ktop) + 2.495083e-03 * utop(itop - 2, jtop, ktop + 1) - 3.465393e-04 * utop(itop - 2, jtop, ktop + 2) + 9.901123e-05 * utop(itop - 2, jtop + 1, ktop - 2) - 9.241048e-04 * utop(itop - 2, jtop + 1, ktop - 1) - 4.158472e-03 * utop(itop - 2, jtop + 1, ktop) + 5.544629e-04 * utop(itop - 2, jtop + 1, ktop + 1) - 7.700874e-05 * utop(itop - 2, jtop + 1, ktop + 2) - 1.060835e-05 * utop(itop - 2, jtop + 2, ktop - 2) + 9.901123e-05 * utop(itop - 2, jtop + 2, ktop - 1) + 4.455505e-04 * utop(itop - 2, jtop + 2, ktop) - 5.940674e-05 * utop(itop - 2, jtop + 2, ktop + 1) + 8.250936e-06 * utop(itop - 2, jtop + 2, ktop + 2) - 7.700874e-05 * utop(itop - 1, jtop - 2, ktop - 2) + 7.187482e-04 * utop(itop - 1, jtop - 2, ktop - 1) + 3.234367e-03 * utop(itop - 1, jtop - 2, ktop) - 4.312489e-04 * utop(itop - 1, jtop - 2, ktop + 1) + 5.989568e-05 * utop(itop - 1, jtop - 2, ktop + 2) + 5.544629e-04 * utop(itop - 1, jtop - 1, ktop - 2) - 5.174987e-03 * utop(itop - 1, jtop - 1, ktop - 1) - 2.328744e-02 * utop(itop - 1, jtop - 1, ktop) + 3.104992e-03 * utop(itop - 1, jtop - 1, ktop + 1) - 4.312489e-04 * utop(itop - 1, jtop - 1, ktop + 2) - 4.158472e-03 * utop(itop - 1, jtop, ktop - 2) + 3.881240e-02 * utop(itop - 1, jtop, ktop - 1) + 1.746558e-01 * utop(itop - 1, jtop, ktop) - 2.328744e-02 * utop(itop - 1, jtop, ktop + 1) + 
3.234367e-03 * utop(itop - 1, jtop, ktop + 2) - 9.241048e-04 * utop(itop - 1, jtop + 1, ktop - 2) + 8.624978e-03 * utop(itop - 1, jtop + 1, ktop - 1) + 3.881240e-02 * utop(itop - 1, jtop + 1, ktop) - 5.174987e-03 * utop(itop - 1, jtop + 1, ktop + 1) + 7.187482e-04 * utop(itop - 1, jtop + 1, ktop + 2) + 9.901123e-05 * utop(itop - 1, jtop + 2, ktop - 2) - 9.241048e-04 * utop(itop - 1, jtop + 2, ktop - 1) - 4.158472e-03 * utop(itop - 1, jtop + 2, ktop) + 5.544629e-04 * utop(itop - 1, jtop + 2, ktop + 1) - 7.700874e-05 * utop(itop - 1, jtop + 2, ktop + 2) - 3.465393e-04 * utop(itop, jtop - 2, ktop - 2) + 3.234367e-03 * utop(itop, jtop - 2, ktop - 1) + 1.455465e-02 * utop(itop, jtop - 2, ktop) - 1.940620e-03 * utop(itop, jtop - 2, ktop + 1) + 2.695306e-04 * utop(itop, jtop - 2, ktop + 2) + 2.495083e-03 * utop(itop, jtop - 1, ktop - 2) - 2.328744e-02 * utop(itop, jtop - 1, ktop - 1) - 1.047935e-01 * utop(itop, jtop - 1, ktop) + 1.397246e-02 * utop(itop, jtop - 1, ktop + 1) - 1.940620e-03 * utop(itop, jtop - 1, ktop + 2) - 1.871312e-02 * utop(itop, jtop, ktop - 2) + 1.746558e-01 * utop(itop, jtop, ktop - 1) + 7.859512e-01 * utop(itop, jtop, ktop) - 1.047935e-01 * utop(itop, jtop, ktop + 1) + 1.455465e-02 * utop(itop, jtop, ktop + 2) - 4.158472e-03 * utop(itop, jtop + 1, ktop - 2) + 3.881240e-02 * utop(itop, jtop + 1, ktop - 1) + 1.746558e-01 * utop(itop, jtop + 1, ktop) - 2.328744e-02 * utop(itop, jtop + 1, ktop + 1) + 3.234367e-03 * utop(itop, jtop + 1, ktop + 2) + 4.455505e-04 * utop(itop, jtop + 2, ktop - 2) - 4.158472e-03 * utop(itop, jtop + 2, ktop - 1) - 1.871312e-02 * utop(itop, jtop + 2, ktop) + 2.495083e-03 * utop(itop, jtop + 2, ktop + 1) - 3.465393e-04 * utop(itop, jtop + 2, ktop + 2) + 4.620524e-05 * utop(itop + 1, jtop - 2, ktop - 2) - 4.312489e-04 * utop(itop + 1, jtop - 2, ktop - 1) - 1.940620e-03 * utop(itop + 1, jtop - 2, ktop) + 2.587494e-04 * utop(itop + 1, jtop - 2, ktop + 1) - 3.593741e-05 * utop(itop + 1, jtop - 2, ktop + 2) - 3.326777e-04 * 
utop(itop + 1, jtop - 1, ktop - 2) + 3.104992e-03 * utop(itop + 1, jtop - 1, ktop - 1) + 1.397246e-02 * utop(itop + 1, jtop - 1, ktop) - 1.862995e-03 * utop(itop + 1, jtop - 1, ktop + 1) + 2.587494e-04 * utop(itop + 1, jtop - 1, ktop + 2) + 2.495083e-03 * utop(itop + 1, jtop, ktop - 2) - 2.328744e-02 * utop(itop + 1, jtop, ktop - 1) - 1.047935e-01 * utop(itop + 1, jtop, ktop) + 1.397246e-02 * utop(itop + 1, jtop, ktop + 1) - 1.940620e-03 * utop(itop + 1, jtop, ktop + 2) + 5.544629e-04 * utop(itop + 1, jtop + 1, ktop - 2) - 5.174987e-03 * utop(itop + 1, jtop + 1, ktop - 1) - 2.328744e-02 * utop(itop + 1, jtop + 1, ktop) + 3.104992e-03 * utop(itop + 1, jtop + 1, ktop + 1) - 4.312489e-04 * utop(itop + 1, jtop + 1, ktop + 2) - 5.940674e-05 * utop(itop + 1, jtop + 2, ktop - 2) + 5.544629e-04 * utop(itop + 1, jtop + 2, ktop - 1) + 2.495083e-03 * utop(itop + 1, jtop + 2, ktop) - 3.326777e-04 * utop(itop + 1, jtop + 2, ktop + 1) + 4.620524e-05 * utop(itop + 1, jtop + 2, ktop + 2) - 6.417395e-06 * utop(itop + 2, jtop - 2, ktop - 2) + 5.989568e-05 * utop(itop + 2, jtop - 2, ktop - 1) + 2.695306e-04 * utop(itop + 2, jtop - 2, ktop) - 3.593741e-05 * utop(itop + 2, jtop - 2, ktop + 1) + 4.991307e-06 * utop(itop + 2, jtop - 2, ktop + 2) + 4.620524e-05 * utop(itop + 2, jtop - 1, ktop - 2) - 4.312489e-04 * utop(itop + 2, jtop - 1, ktop - 1) - 1.940620e-03 * utop(itop + 2, jtop - 1, ktop) + 2.587494e-04 * utop(itop + 2, jtop - 1, ktop + 1) - 3.593741e-05 * utop(itop + 2, jtop - 1, ktop + 2) - 3.465393e-04 * utop(itop + 2, jtop, ktop - 2) + 3.234367e-03 * utop(itop + 2, jtop, ktop - 1) + 1.455465e-02 * utop(itop + 2, jtop, ktop) - 1.940620e-03 * utop(itop + 2, jtop, ktop + 1) + 2.695306e-04 * utop(itop + 2, jtop, ktop + 2) - 7.700874e-05 * utop(itop + 2, jtop + 1, ktop - 2) + 7.187482e-04 * utop(itop + 2, jtop + 1, ktop - 1) + 3.234367e-03 * utop(itop + 2, jtop + 1, ktop) - 4.312489e-04 * utop(itop + 2, jtop + 1, ktop + 1) + 5.989568e-05 * utop(itop + 2, jtop + 1, ktop + 2) + 
8.250936e-06 * utop(itop + 2, jtop + 2, ktop - 2) - 7.700874e-05 * utop(itop + 2, jtop + 2, ktop - 1) - 3.465393e-04 * utop(itop + 2, jtop + 2, ktop) + 4.620524e-05 * utop(itop + 2, jtop + 2, ktop + 1) - 6.417395e-06 * utop(itop + 2, jtop + 2, ktop + 2));
+	u(i + 0, j + 1, k + 1) = (-6.417395e-06 * utop(itop - 2, jtop - 2, ktop - 2) + 4.620524e-05 * utop(itop - 2, jtop - 2, ktop - 1) - 3.465393e-04 * utop(itop - 2, jtop - 2, ktop) - 7.700874e-05 * utop(itop - 2, jtop - 2, ktop + 1) + 8.250936e-06 * utop(itop - 2, jtop - 2, ktop + 2) + 4.620524e-05 * utop(itop - 2, jtop - 1, ktop - 2) - 3.326777e-04 * utop(itop - 2, jtop - 1, ktop - 1) + 2.495083e-03 * utop(itop - 2, jtop - 1, ktop) + 5.544629e-04 * utop(itop - 2, jtop - 1, ktop + 1) - 5.940674e-05 * utop(itop - 2, jtop - 1, ktop + 2) - 3.465393e-04 * utop(itop - 2, jtop, ktop - 2) + 2.495083e-03 * utop(itop - 2, jtop, ktop - 1) - 1.871312e-02 * utop(itop - 2, jtop, ktop) - 4.158472e-03 * utop(itop - 2, jtop, ktop + 1) + 4.455505e-04 * utop(itop - 2, jtop, ktop + 2) - 7.700874e-05 * utop(itop - 2, jtop + 1, ktop - 2) + 5.544629e-04 * utop(itop - 2, jtop + 1, ktop - 1) - 4.158472e-03 * utop(itop - 2, jtop + 1, ktop) - 9.241048e-04 * utop(itop - 2, jtop + 1, ktop + 1) + 9.901123e-05 * utop(itop - 2, jtop + 1, ktop + 2) + 8.250936e-06 * utop(itop - 2, jtop + 2, ktop - 2) - 5.940674e-05 * utop(itop - 2, jtop + 2, ktop - 1) + 4.455505e-04 * utop(itop - 2, jtop + 2, ktop) + 9.901123e-05 * utop(itop - 2, jtop + 2, ktop + 1) - 1.060835e-05 * utop(itop - 2, jtop + 2, ktop + 2) + 5.989568e-05 * utop(itop - 1, jtop - 2, ktop - 2) - 4.312489e-04 * utop(itop - 1, jtop - 2, ktop - 1) + 3.234367e-03 * utop(itop - 1, jtop - 2, ktop) + 7.187482e-04 * utop(itop - 1, jtop - 2, ktop + 1) - 7.700874e-05 * utop(itop - 1, jtop - 2, ktop + 2) - 4.312489e-04 * utop(itop - 1, jtop - 1, ktop - 2) + 3.104992e-03 * utop(itop - 1, jtop - 1, ktop - 1) - 2.328744e-02 * utop(itop - 1, jtop - 1, ktop) - 5.174987e-03 * utop(itop - 1, jtop - 1, ktop + 1) + 5.544629e-04 * utop(itop - 1, jtop - 1, ktop + 2) + 3.234367e-03 * utop(itop - 1, jtop, ktop - 2) - 2.328744e-02 * utop(itop - 1, jtop, ktop - 1) + 1.746558e-01 * utop(itop - 1, jtop, ktop) + 3.881240e-02 * utop(itop - 1, jtop, ktop + 1) - 
4.158472e-03 * utop(itop - 1, jtop, ktop + 2) + 7.187482e-04 * utop(itop - 1, jtop + 1, ktop - 2) - 5.174987e-03 * utop(itop - 1, jtop + 1, ktop - 1) + 3.881240e-02 * utop(itop - 1, jtop + 1, ktop) + 8.624978e-03 * utop(itop - 1, jtop + 1, ktop + 1) - 9.241048e-04 * utop(itop - 1, jtop + 1, ktop + 2) - 7.700874e-05 * utop(itop - 1, jtop + 2, ktop - 2) + 5.544629e-04 * utop(itop - 1, jtop + 2, ktop - 1) - 4.158472e-03 * utop(itop - 1, jtop + 2, ktop) - 9.241048e-04 * utop(itop - 1, jtop + 2, ktop + 1) + 9.901123e-05 * utop(itop - 1, jtop + 2, ktop + 2) + 2.695306e-04 * utop(itop, jtop - 2, ktop - 2) - 1.940620e-03 * utop(itop, jtop - 2, ktop - 1) + 1.455465e-02 * utop(itop, jtop - 2, ktop) + 3.234367e-03 * utop(itop, jtop - 2, ktop + 1) - 3.465393e-04 * utop(itop, jtop - 2, ktop + 2) - 1.940620e-03 * utop(itop, jtop - 1, ktop - 2) + 1.397246e-02 * utop(itop, jtop - 1, ktop - 1) - 1.047935e-01 * utop(itop, jtop - 1, ktop) - 2.328744e-02 * utop(itop, jtop - 1, ktop + 1) + 2.495083e-03 * utop(itop, jtop - 1, ktop + 2) + 1.455465e-02 * utop(itop, jtop, ktop - 2) - 1.047935e-01 * utop(itop, jtop, ktop - 1) + 7.859512e-01 * utop(itop, jtop, ktop) + 1.746558e-01 * utop(itop, jtop, ktop + 1) - 1.871312e-02 * utop(itop, jtop, ktop + 2) + 3.234367e-03 * utop(itop, jtop + 1, ktop - 2) - 2.328744e-02 * utop(itop, jtop + 1, ktop - 1) + 1.746558e-01 * utop(itop, jtop + 1, ktop) + 3.881240e-02 * utop(itop, jtop + 1, ktop + 1) - 4.158472e-03 * utop(itop, jtop + 1, ktop + 2) - 3.465393e-04 * utop(itop, jtop + 2, ktop - 2) + 2.495083e-03 * utop(itop, jtop + 2, ktop - 1) - 1.871312e-02 * utop(itop, jtop + 2, ktop) - 4.158472e-03 * utop(itop, jtop + 2, ktop + 1) + 4.455505e-04 * utop(itop, jtop + 2, ktop + 2) - 3.593741e-05 * utop(itop + 1, jtop - 2, ktop - 2) + 2.587494e-04 * utop(itop + 1, jtop - 2, ktop - 1) - 1.940620e-03 * utop(itop + 1, jtop - 2, ktop) - 4.312489e-04 * utop(itop + 1, jtop - 2, ktop + 1) + 4.620524e-05 * utop(itop + 1, jtop - 2, ktop + 2) + 2.587494e-04 * 
utop(itop + 1, jtop - 1, ktop - 2) - 1.862995e-03 * utop(itop + 1, jtop - 1, ktop - 1) + 1.397246e-02 * utop(itop + 1, jtop - 1, ktop) + 3.104992e-03 * utop(itop + 1, jtop - 1, ktop + 1) - 3.326777e-04 * utop(itop + 1, jtop - 1, ktop + 2) - 1.940620e-03 * utop(itop + 1, jtop, ktop - 2) + 1.397246e-02 * utop(itop + 1, jtop, ktop - 1) - 1.047935e-01 * utop(itop + 1, jtop, ktop) - 2.328744e-02 * utop(itop + 1, jtop, ktop + 1) + 2.495083e-03 * utop(itop + 1, jtop, ktop + 2) - 4.312489e-04 * utop(itop + 1, jtop + 1, ktop - 2) + 3.104992e-03 * utop(itop + 1, jtop + 1, ktop - 1) - 2.328744e-02 * utop(itop + 1, jtop + 1, ktop) - 5.174987e-03 * utop(itop + 1, jtop + 1, ktop + 1) + 5.544629e-04 * utop(itop + 1, jtop + 1, ktop + 2) + 4.620524e-05 * utop(itop + 1, jtop + 2, ktop - 2) - 3.326777e-04 * utop(itop + 1, jtop + 2, ktop - 1) + 2.495083e-03 * utop(itop + 1, jtop + 2, ktop) + 5.544629e-04 * utop(itop + 1, jtop + 2, ktop + 1) - 5.940674e-05 * utop(itop + 1, jtop + 2, ktop + 2) + 4.991307e-06 * utop(itop + 2, jtop - 2, ktop - 2) - 3.593741e-05 * utop(itop + 2, jtop - 2, ktop - 1) + 2.695306e-04 * utop(itop + 2, jtop - 2, ktop) + 5.989568e-05 * utop(itop + 2, jtop - 2, ktop + 1) - 6.417395e-06 * utop(itop + 2, jtop - 2, ktop + 2) - 3.593741e-05 * utop(itop + 2, jtop - 1, ktop - 2) + 2.587494e-04 * utop(itop + 2, jtop - 1, ktop - 1) - 1.940620e-03 * utop(itop + 2, jtop - 1, ktop) - 4.312489e-04 * utop(itop + 2, jtop - 1, ktop + 1) + 4.620524e-05 * utop(itop + 2, jtop - 1, ktop + 2) + 2.695306e-04 * utop(itop + 2, jtop, ktop - 2) - 1.940620e-03 * utop(itop + 2, jtop, ktop - 1) + 1.455465e-02 * utop(itop + 2, jtop, ktop) + 3.234367e-03 * utop(itop + 2, jtop, ktop + 1) - 3.465393e-04 * utop(itop + 2, jtop, ktop + 2) + 5.989568e-05 * utop(itop + 2, jtop + 1, ktop - 2) - 4.312489e-04 * utop(itop + 2, jtop + 1, ktop - 1) + 3.234367e-03 * utop(itop + 2, jtop + 1, ktop) + 7.187482e-04 * utop(itop + 2, jtop + 1, ktop + 1) - 7.700874e-05 * utop(itop + 2, jtop + 1, ktop + 2) - 
6.417395e-06 * utop(itop + 2, jtop + 2, ktop - 2) + 4.620524e-05 * utop(itop + 2, jtop + 2, ktop - 1) - 3.465393e-04 * utop(itop + 2, jtop + 2, ktop) - 7.700874e-05 * utop(itop + 2, jtop + 2, ktop + 1) + 8.250936e-06 * utop(itop + 2, jtop + 2, ktop + 2));
+	u(i + 1, j + 0, k + 0) = (+8.250936e-06 * utop(itop - 2, jtop - 2, ktop - 2) - 7.700874e-05 * utop(itop - 2, jtop - 2, ktop - 1) - 3.465393e-04 * utop(itop - 2, jtop - 2, ktop) + 4.620524e-05 * utop(itop - 2, jtop - 2, ktop + 1) - 6.417395e-06 * utop(itop - 2, jtop - 2, ktop + 2) - 7.700874e-05 * utop(itop - 2, jtop - 1, ktop - 2) + 7.187482e-04 * utop(itop - 2, jtop - 1, ktop - 1) + 3.234367e-03 * utop(itop - 2, jtop - 1, ktop) - 4.312489e-04 * utop(itop - 2, jtop - 1, ktop + 1) + 5.989568e-05 * utop(itop - 2, jtop - 1, ktop + 2) - 3.465393e-04 * utop(itop - 2, jtop, ktop - 2) + 3.234367e-03 * utop(itop - 2, jtop, ktop - 1) + 1.455465e-02 * utop(itop - 2, jtop, ktop) - 1.940620e-03 * utop(itop - 2, jtop, ktop + 1) + 2.695306e-04 * utop(itop - 2, jtop, ktop + 2) + 4.620524e-05 * utop(itop - 2, jtop + 1, ktop - 2) - 4.312489e-04 * utop(itop - 2, jtop + 1, ktop - 1) - 1.940620e-03 * utop(itop - 2, jtop + 1, ktop) + 2.587494e-04 * utop(itop - 2, jtop + 1, ktop + 1) - 3.593741e-05 * utop(itop - 2, jtop + 1, ktop + 2) - 6.417395e-06 * utop(itop - 2, jtop + 2, ktop - 2) + 5.989568e-05 * utop(itop - 2, jtop + 2, ktop - 1) + 2.695306e-04 * utop(itop - 2, jtop + 2, ktop) - 3.593741e-05 * utop(itop - 2, jtop + 2, ktop + 1) + 4.991307e-06 * utop(itop - 2, jtop + 2, ktop + 2) - 5.940674e-05 * utop(itop - 1, jtop - 2, ktop - 2) + 5.544629e-04 * utop(itop - 1, jtop - 2, ktop - 1) + 2.495083e-03 * utop(itop - 1, jtop - 2, ktop) - 3.326777e-04 * utop(itop - 1, jtop - 2, ktop + 1) + 4.620524e-05 * utop(itop - 1, jtop - 2, ktop + 2) + 5.544629e-04 * utop(itop - 1, jtop - 1, ktop - 2) - 5.174987e-03 * utop(itop - 1, jtop - 1, ktop - 1) - 2.328744e-02 * utop(itop - 1, jtop - 1, ktop) + 3.104992e-03 * utop(itop - 1, jtop - 1, ktop + 1) - 4.312489e-04 * utop(itop - 1, jtop - 1, ktop + 2) + 2.495083e-03 * utop(itop - 1, jtop, ktop - 2) - 2.328744e-02 * utop(itop - 1, jtop, ktop - 1) - 1.047935e-01 * utop(itop - 1, jtop, ktop) + 1.397246e-02 * utop(itop - 1, jtop, ktop + 1) - 
1.940620e-03 * utop(itop - 1, jtop, ktop + 2) - 3.326777e-04 * utop(itop - 1, jtop + 1, ktop - 2) + 3.104992e-03 * utop(itop - 1, jtop + 1, ktop - 1) + 1.397246e-02 * utop(itop - 1, jtop + 1, ktop) - 1.862995e-03 * utop(itop - 1, jtop + 1, ktop + 1) + 2.587494e-04 * utop(itop - 1, jtop + 1, ktop + 2) + 4.620524e-05 * utop(itop - 1, jtop + 2, ktop - 2) - 4.312489e-04 * utop(itop - 1, jtop + 2, ktop - 1) - 1.940620e-03 * utop(itop - 1, jtop + 2, ktop) + 2.587494e-04 * utop(itop - 1, jtop + 2, ktop + 1) - 3.593741e-05 * utop(itop - 1, jtop + 2, ktop + 2) + 4.455505e-04 * utop(itop, jtop - 2, ktop - 2) - 4.158472e-03 * utop(itop, jtop - 2, ktop - 1) - 1.871312e-02 * utop(itop, jtop - 2, ktop) + 2.495083e-03 * utop(itop, jtop - 2, ktop + 1) - 3.465393e-04 * utop(itop, jtop - 2, ktop + 2) - 4.158472e-03 * utop(itop, jtop - 1, ktop - 2) + 3.881240e-02 * utop(itop, jtop - 1, ktop - 1) + 1.746558e-01 * utop(itop, jtop - 1, ktop) - 2.328744e-02 * utop(itop, jtop - 1, ktop + 1) + 3.234367e-03 * utop(itop, jtop - 1, ktop + 2) - 1.871312e-02 * utop(itop, jtop, ktop - 2) + 1.746558e-01 * utop(itop, jtop, ktop - 1) + 7.859512e-01 * utop(itop, jtop, ktop) - 1.047935e-01 * utop(itop, jtop, ktop + 1) + 1.455465e-02 * utop(itop, jtop, ktop + 2) + 2.495083e-03 * utop(itop, jtop + 1, ktop - 2) - 2.328744e-02 * utop(itop, jtop + 1, ktop - 1) - 1.047935e-01 * utop(itop, jtop + 1, ktop) + 1.397246e-02 * utop(itop, jtop + 1, ktop + 1) - 1.940620e-03 * utop(itop, jtop + 1, ktop + 2) - 3.465393e-04 * utop(itop, jtop + 2, ktop - 2) + 3.234367e-03 * utop(itop, jtop + 2, ktop - 1) + 1.455465e-02 * utop(itop, jtop + 2, ktop) - 1.940620e-03 * utop(itop, jtop + 2, ktop + 1) + 2.695306e-04 * utop(itop, jtop + 2, ktop + 2) + 9.901123e-05 * utop(itop + 1, jtop - 2, ktop - 2) - 9.241048e-04 * utop(itop + 1, jtop - 2, ktop - 1) - 4.158472e-03 * utop(itop + 1, jtop - 2, ktop) + 5.544629e-04 * utop(itop + 1, jtop - 2, ktop + 1) - 7.700874e-05 * utop(itop + 1, jtop - 2, ktop + 2) - 9.241048e-04 * 
utop(itop + 1, jtop - 1, ktop - 2) + 8.624978e-03 * utop(itop + 1, jtop - 1, ktop - 1) + 3.881240e-02 * utop(itop + 1, jtop - 1, ktop) - 5.174987e-03 * utop(itop + 1, jtop - 1, ktop + 1) + 7.187482e-04 * utop(itop + 1, jtop - 1, ktop + 2) - 4.158472e-03 * utop(itop + 1, jtop, ktop - 2) + 3.881240e-02 * utop(itop + 1, jtop, ktop - 1) + 1.746558e-01 * utop(itop + 1, jtop, ktop) - 2.328744e-02 * utop(itop + 1, jtop, ktop + 1) + 3.234367e-03 * utop(itop + 1, jtop, ktop + 2) + 5.544629e-04 * utop(itop + 1, jtop + 1, ktop - 2) - 5.174987e-03 * utop(itop + 1, jtop + 1, ktop - 1) - 2.328744e-02 * utop(itop + 1, jtop + 1, ktop) + 3.104992e-03 * utop(itop + 1, jtop + 1, ktop + 1) - 4.312489e-04 * utop(itop + 1, jtop + 1, ktop + 2) - 7.700874e-05 * utop(itop + 1, jtop + 2, ktop - 2) + 7.187482e-04 * utop(itop + 1, jtop + 2, ktop - 1) + 3.234367e-03 * utop(itop + 1, jtop + 2, ktop) - 4.312489e-04 * utop(itop + 1, jtop + 2, ktop + 1) + 5.989568e-05 * utop(itop + 1, jtop + 2, ktop + 2) - 1.060835e-05 * utop(itop + 2, jtop - 2, ktop - 2) + 9.901123e-05 * utop(itop + 2, jtop - 2, ktop - 1) + 4.455505e-04 * utop(itop + 2, jtop - 2, ktop) - 5.940674e-05 * utop(itop + 2, jtop - 2, ktop + 1) + 8.250936e-06 * utop(itop + 2, jtop - 2, ktop + 2) + 9.901123e-05 * utop(itop + 2, jtop - 1, ktop - 2) - 9.241048e-04 * utop(itop + 2, jtop - 1, ktop - 1) - 4.158472e-03 * utop(itop + 2, jtop - 1, ktop) + 5.544629e-04 * utop(itop + 2, jtop - 1, ktop + 1) - 7.700874e-05 * utop(itop + 2, jtop - 1, ktop + 2) + 4.455505e-04 * utop(itop + 2, jtop, ktop - 2) - 4.158472e-03 * utop(itop + 2, jtop, ktop - 1) - 1.871312e-02 * utop(itop + 2, jtop, ktop) + 2.495083e-03 * utop(itop + 2, jtop, ktop + 1) - 3.465393e-04 * utop(itop + 2, jtop, ktop + 2) - 5.940674e-05 * utop(itop + 2, jtop + 1, ktop - 2) + 5.544629e-04 * utop(itop + 2, jtop + 1, ktop - 1) + 2.495083e-03 * utop(itop + 2, jtop + 1, ktop) - 3.326777e-04 * utop(itop + 2, jtop + 1, ktop + 1) + 4.620524e-05 * utop(itop + 2, jtop + 1, ktop + 2) + 
8.250936e-06 * utop(itop + 2, jtop + 2, ktop - 2) - 7.700874e-05 * utop(itop + 2, jtop + 2, ktop - 1) - 3.465393e-04 * utop(itop + 2, jtop + 2, ktop) + 4.620524e-05 * utop(itop + 2, jtop + 2, ktop + 1) - 6.417395e-06 * utop(itop + 2, jtop + 2, ktop + 2));
+	u(i + 1, j + 0, k + 1) = (-6.417395e-06 * utop(itop - 2, jtop - 2, ktop - 2) + 4.620524e-05 * utop(itop - 2, jtop - 2, ktop - 1) - 3.465393e-04 * utop(itop - 2, jtop - 2, ktop) - 7.700874e-05 * utop(itop - 2, jtop - 2, ktop + 1) + 8.250936e-06 * utop(itop - 2, jtop - 2, ktop + 2) + 5.989568e-05 * utop(itop - 2, jtop - 1, ktop - 2) - 4.312489e-04 * utop(itop - 2, jtop - 1, ktop - 1) + 3.234367e-03 * utop(itop - 2, jtop - 1, ktop) + 7.187482e-04 * utop(itop - 2, jtop - 1, ktop + 1) - 7.700874e-05 * utop(itop - 2, jtop - 1, ktop + 2) + 2.695306e-04 * utop(itop - 2, jtop, ktop - 2) - 1.940620e-03 * utop(itop - 2, jtop, ktop - 1) + 1.455465e-02 * utop(itop - 2, jtop, ktop) + 3.234367e-03 * utop(itop - 2, jtop, ktop + 1) - 3.465393e-04 * utop(itop - 2, jtop, ktop + 2) - 3.593741e-05 * utop(itop - 2, jtop + 1, ktop - 2) + 2.587494e-04 * utop(itop - 2, jtop + 1, ktop - 1) - 1.940620e-03 * utop(itop - 2, jtop + 1, ktop) - 4.312489e-04 * utop(itop - 2, jtop + 1, ktop + 1) + 4.620524e-05 * utop(itop - 2, jtop + 1, ktop + 2) + 4.991307e-06 * utop(itop - 2, jtop + 2, ktop - 2) - 3.593741e-05 * utop(itop - 2, jtop + 2, ktop - 1) + 2.695306e-04 * utop(itop - 2, jtop + 2, ktop) + 5.989568e-05 * utop(itop - 2, jtop + 2, ktop + 1) - 6.417395e-06 * utop(itop - 2, jtop + 2, ktop + 2) + 4.620524e-05 * utop(itop - 1, jtop - 2, ktop - 2) - 3.326777e-04 * utop(itop - 1, jtop - 2, ktop - 1) + 2.495083e-03 * utop(itop - 1, jtop - 2, ktop) + 5.544629e-04 * utop(itop - 1, jtop - 2, ktop + 1) - 5.940674e-05 * utop(itop - 1, jtop - 2, ktop + 2) - 4.312489e-04 * utop(itop - 1, jtop - 1, ktop - 2) + 3.104992e-03 * utop(itop - 1, jtop - 1, ktop - 1) - 2.328744e-02 * utop(itop - 1, jtop - 1, ktop) - 5.174987e-03 * utop(itop - 1, jtop - 1, ktop + 1) + 5.544629e-04 * utop(itop - 1, jtop - 1, ktop + 2) - 1.940620e-03 * utop(itop - 1, jtop, ktop - 2) + 1.397246e-02 * utop(itop - 1, jtop, ktop - 1) - 1.047935e-01 * utop(itop - 1, jtop, ktop) - 2.328744e-02 * utop(itop - 1, jtop, ktop + 1) + 
2.495083e-03 * utop(itop - 1, jtop, ktop + 2) + 2.587494e-04 * utop(itop - 1, jtop + 1, ktop - 2) - 1.862995e-03 * utop(itop - 1, jtop + 1, ktop - 1) + 1.397246e-02 * utop(itop - 1, jtop + 1, ktop) + 3.104992e-03 * utop(itop - 1, jtop + 1, ktop + 1) - 3.326777e-04 * utop(itop - 1, jtop + 1, ktop + 2) - 3.593741e-05 * utop(itop - 1, jtop + 2, ktop - 2) + 2.587494e-04 * utop(itop - 1, jtop + 2, ktop - 1) - 1.940620e-03 * utop(itop - 1, jtop + 2, ktop) - 4.312489e-04 * utop(itop - 1, jtop + 2, ktop + 1) + 4.620524e-05 * utop(itop - 1, jtop + 2, ktop + 2) - 3.465393e-04 * utop(itop, jtop - 2, ktop - 2) + 2.495083e-03 * utop(itop, jtop - 2, ktop - 1) - 1.871312e-02 * utop(itop, jtop - 2, ktop) - 4.158472e-03 * utop(itop, jtop - 2, ktop + 1) + 4.455505e-04 * utop(itop, jtop - 2, ktop + 2) + 3.234367e-03 * utop(itop, jtop - 1, ktop - 2) - 2.328744e-02 * utop(itop, jtop - 1, ktop - 1) + 1.746558e-01 * utop(itop, jtop - 1, ktop) + 3.881240e-02 * utop(itop, jtop - 1, ktop + 1) - 4.158472e-03 * utop(itop, jtop - 1, ktop + 2) + 1.455465e-02 * utop(itop, jtop, ktop - 2) - 1.047935e-01 * utop(itop, jtop, ktop - 1) + 7.859512e-01 * utop(itop, jtop, ktop) + 1.746558e-01 * utop(itop, jtop, ktop + 1) - 1.871312e-02 * utop(itop, jtop, ktop + 2) - 1.940620e-03 * utop(itop, jtop + 1, ktop - 2) + 1.397246e-02 * utop(itop, jtop + 1, ktop - 1) - 1.047935e-01 * utop(itop, jtop + 1, ktop) - 2.328744e-02 * utop(itop, jtop + 1, ktop + 1) + 2.495083e-03 * utop(itop, jtop + 1, ktop + 2) + 2.695306e-04 * utop(itop, jtop + 2, ktop - 2) - 1.940620e-03 * utop(itop, jtop + 2, ktop - 1) + 1.455465e-02 * utop(itop, jtop + 2, ktop) + 3.234367e-03 * utop(itop, jtop + 2, ktop + 1) - 3.465393e-04 * utop(itop, jtop + 2, ktop + 2) - 7.700874e-05 * utop(itop + 1, jtop - 2, ktop - 2) + 5.544629e-04 * utop(itop + 1, jtop - 2, ktop - 1) - 4.158472e-03 * utop(itop + 1, jtop - 2, ktop) - 9.241048e-04 * utop(itop + 1, jtop - 2, ktop + 1) + 9.901123e-05 * utop(itop + 1, jtop - 2, ktop + 2) + 7.187482e-04 * 
utop(itop + 1, jtop - 1, ktop - 2) - 5.174987e-03 * utop(itop + 1, jtop - 1, ktop - 1) + 3.881240e-02 * utop(itop + 1, jtop - 1, ktop) + 8.624978e-03 * utop(itop + 1, jtop - 1, ktop + 1) - 9.241048e-04 * utop(itop + 1, jtop - 1, ktop + 2) + 3.234367e-03 * utop(itop + 1, jtop, ktop - 2) - 2.328744e-02 * utop(itop + 1, jtop, ktop - 1) + 1.746558e-01 * utop(itop + 1, jtop, ktop) + 3.881240e-02 * utop(itop + 1, jtop, ktop + 1) - 4.158472e-03 * utop(itop + 1, jtop, ktop + 2) - 4.312489e-04 * utop(itop + 1, jtop + 1, ktop - 2) + 3.104992e-03 * utop(itop + 1, jtop + 1, ktop - 1) - 2.328744e-02 * utop(itop + 1, jtop + 1, ktop) - 5.174987e-03 * utop(itop + 1, jtop + 1, ktop + 1) + 5.544629e-04 * utop(itop + 1, jtop + 1, ktop + 2) + 5.989568e-05 * utop(itop + 1, jtop + 2, ktop - 2) - 4.312489e-04 * utop(itop + 1, jtop + 2, ktop - 1) + 3.234367e-03 * utop(itop + 1, jtop + 2, ktop) + 7.187482e-04 * utop(itop + 1, jtop + 2, ktop + 1) - 7.700874e-05 * utop(itop + 1, jtop + 2, ktop + 2) + 8.250936e-06 * utop(itop + 2, jtop - 2, ktop - 2) - 5.940674e-05 * utop(itop + 2, jtop - 2, ktop - 1) + 4.455505e-04 * utop(itop + 2, jtop - 2, ktop) + 9.901123e-05 * utop(itop + 2, jtop - 2, ktop + 1) - 1.060835e-05 * utop(itop + 2, jtop - 2, ktop + 2) - 7.700874e-05 * utop(itop + 2, jtop - 1, ktop - 2) + 5.544629e-04 * utop(itop + 2, jtop - 1, ktop - 1) - 4.158472e-03 * utop(itop + 2, jtop - 1, ktop) - 9.241048e-04 * utop(itop + 2, jtop - 1, ktop + 1) + 9.901123e-05 * utop(itop + 2, jtop - 1, ktop + 2) - 3.465393e-04 * utop(itop + 2, jtop, ktop - 2) + 2.495083e-03 * utop(itop + 2, jtop, ktop - 1) - 1.871312e-02 * utop(itop + 2, jtop, ktop) - 4.158472e-03 * utop(itop + 2, jtop, ktop + 1) + 4.455505e-04 * utop(itop + 2, jtop, ktop + 2) + 4.620524e-05 * utop(itop + 2, jtop + 1, ktop - 2) - 3.326777e-04 * utop(itop + 2, jtop + 1, ktop - 1) + 2.495083e-03 * utop(itop + 2, jtop + 1, ktop) + 5.544629e-04 * utop(itop + 2, jtop + 1, ktop + 1) - 5.940674e-05 * utop(itop + 2, jtop + 1, ktop + 2) - 
6.417395e-06 * utop(itop + 2, jtop + 2, ktop - 2) + 4.620524e-05 * utop(itop + 2, jtop + 2, ktop - 1) - 3.465393e-04 * utop(itop + 2, jtop + 2, ktop) - 7.700874e-05 * utop(itop + 2, jtop + 2, ktop + 1) + 8.250936e-06 * utop(itop + 2, jtop + 2, ktop + 2));
+	u(i + 1, j + 1, k + 0) = (-6.417395e-06 * utop(itop - 2, jtop - 2, ktop - 2) + 5.989568e-05 * utop(itop - 2, jtop - 2, ktop - 1) + 2.695306e-04 * utop(itop - 2, jtop - 2, ktop) - 3.593741e-05 * utop(itop - 2, jtop - 2, ktop + 1) + 4.991307e-06 * utop(itop - 2, jtop - 2, ktop + 2) + 4.620524e-05 * utop(itop - 2, jtop - 1, ktop - 2) - 4.312489e-04 * utop(itop - 2, jtop - 1, ktop - 1) - 1.940620e-03 * utop(itop - 2, jtop - 1, ktop) + 2.587494e-04 * utop(itop - 2, jtop - 1, ktop + 1) - 3.593741e-05 * utop(itop - 2, jtop - 1, ktop + 2) - 3.465393e-04 * utop(itop - 2, jtop, ktop - 2) + 3.234367e-03 * utop(itop - 2, jtop, ktop - 1) + 1.455465e-02 * utop(itop - 2, jtop, ktop) - 1.940620e-03 * utop(itop - 2, jtop, ktop + 1) + 2.695306e-04 * utop(itop - 2, jtop, ktop + 2) - 7.700874e-05 * utop(itop - 2, jtop + 1, ktop - 2) + 7.187482e-04 * utop(itop - 2, jtop + 1, ktop - 1) + 3.234367e-03 * utop(itop - 2, jtop + 1, ktop) - 4.312489e-04 * utop(itop - 2, jtop + 1, ktop + 1) + 5.989568e-05 * utop(itop - 2, jtop + 1, ktop + 2) + 8.250936e-06 * utop(itop - 2, jtop + 2, ktop - 2) - 7.700874e-05 * utop(itop - 2, jtop + 2, ktop - 1) - 3.465393e-04 * utop(itop - 2, jtop + 2, ktop) + 4.620524e-05 * utop(itop - 2, jtop + 2, ktop + 1) - 6.417395e-06 * utop(itop - 2, jtop + 2, ktop + 2) + 4.620524e-05 * utop(itop - 1, jtop - 2, ktop - 2) - 4.312489e-04 * utop(itop - 1, jtop - 2, ktop - 1) - 1.940620e-03 * utop(itop - 1, jtop - 2, ktop) + 2.587494e-04 * utop(itop - 1, jtop - 2, ktop + 1) - 3.593741e-05 * utop(itop - 1, jtop - 2, ktop + 2) - 3.326777e-04 * utop(itop - 1, jtop - 1, ktop - 2) + 3.104992e-03 * utop(itop - 1, jtop - 1, ktop - 1) + 1.397246e-02 * utop(itop - 1, jtop - 1, ktop) - 1.862995e-03 * utop(itop - 1, jtop - 1, ktop + 1) + 2.587494e-04 * utop(itop - 1, jtop - 1, ktop + 2) + 2.495083e-03 * utop(itop - 1, jtop, ktop - 2) - 2.328744e-02 * utop(itop - 1, jtop, ktop - 1) - 1.047935e-01 * utop(itop - 1, jtop, ktop) + 1.397246e-02 * utop(itop - 1, jtop, ktop + 1) - 
1.940620e-03 * utop(itop - 1, jtop, ktop + 2) + 5.544629e-04 * utop(itop - 1, jtop + 1, ktop - 2) - 5.174987e-03 * utop(itop - 1, jtop + 1, ktop - 1) - 2.328744e-02 * utop(itop - 1, jtop + 1, ktop) + 3.104992e-03 * utop(itop - 1, jtop + 1, ktop + 1) - 4.312489e-04 * utop(itop - 1, jtop + 1, ktop + 2) - 5.940674e-05 * utop(itop - 1, jtop + 2, ktop - 2) + 5.544629e-04 * utop(itop - 1, jtop + 2, ktop - 1) + 2.495083e-03 * utop(itop - 1, jtop + 2, ktop) - 3.326777e-04 * utop(itop - 1, jtop + 2, ktop + 1) + 4.620524e-05 * utop(itop - 1, jtop + 2, ktop + 2) - 3.465393e-04 * utop(itop, jtop - 2, ktop - 2) + 3.234367e-03 * utop(itop, jtop - 2, ktop - 1) + 1.455465e-02 * utop(itop, jtop - 2, ktop) - 1.940620e-03 * utop(itop, jtop - 2, ktop + 1) + 2.695306e-04 * utop(itop, jtop - 2, ktop + 2) + 2.495083e-03 * utop(itop, jtop - 1, ktop - 2) - 2.328744e-02 * utop(itop, jtop - 1, ktop - 1) - 1.047935e-01 * utop(itop, jtop - 1, ktop) + 1.397246e-02 * utop(itop, jtop - 1, ktop + 1) - 1.940620e-03 * utop(itop, jtop - 1, ktop + 2) - 1.871312e-02 * utop(itop, jtop, ktop - 2) + 1.746558e-01 * utop(itop, jtop, ktop - 1) + 7.859512e-01 * utop(itop, jtop, ktop) - 1.047935e-01 * utop(itop, jtop, ktop + 1) + 1.455465e-02 * utop(itop, jtop, ktop + 2) - 4.158472e-03 * utop(itop, jtop + 1, ktop - 2) + 3.881240e-02 * utop(itop, jtop + 1, ktop - 1) + 1.746558e-01 * utop(itop, jtop + 1, ktop) - 2.328744e-02 * utop(itop, jtop + 1, ktop + 1) + 3.234367e-03 * utop(itop, jtop + 1, ktop + 2) + 4.455505e-04 * utop(itop, jtop + 2, ktop - 2) - 4.158472e-03 * utop(itop, jtop + 2, ktop - 1) - 1.871312e-02 * utop(itop, jtop + 2, ktop) + 2.495083e-03 * utop(itop, jtop + 2, ktop + 1) - 3.465393e-04 * utop(itop, jtop + 2, ktop + 2) - 7.700874e-05 * utop(itop + 1, jtop - 2, ktop - 2) + 7.187482e-04 * utop(itop + 1, jtop - 2, ktop - 1) + 3.234367e-03 * utop(itop + 1, jtop - 2, ktop) - 4.312489e-04 * utop(itop + 1, jtop - 2, ktop + 1) + 5.989568e-05 * utop(itop + 1, jtop - 2, ktop + 2) + 5.544629e-04 * 
utop(itop + 1, jtop - 1, ktop - 2) - 5.174987e-03 * utop(itop + 1, jtop - 1, ktop - 1) - 2.328744e-02 * utop(itop + 1, jtop - 1, ktop) + 3.104992e-03 * utop(itop + 1, jtop - 1, ktop + 1) - 4.312489e-04 * utop(itop + 1, jtop - 1, ktop + 2) - 4.158472e-03 * utop(itop + 1, jtop, ktop - 2) + 3.881240e-02 * utop(itop + 1, jtop, ktop - 1) + 1.746558e-01 * utop(itop + 1, jtop, ktop) - 2.328744e-02 * utop(itop + 1, jtop, ktop + 1) + 3.234367e-03 * utop(itop + 1, jtop, ktop + 2) - 9.241048e-04 * utop(itop + 1, jtop + 1, ktop - 2) + 8.624978e-03 * utop(itop + 1, jtop + 1, ktop - 1) + 3.881240e-02 * utop(itop + 1, jtop + 1, ktop) - 5.174987e-03 * utop(itop + 1, jtop + 1, ktop + 1) + 7.187482e-04 * utop(itop + 1, jtop + 1, ktop + 2) + 9.901123e-05 * utop(itop + 1, jtop + 2, ktop - 2) - 9.241048e-04 * utop(itop + 1, jtop + 2, ktop - 1) - 4.158472e-03 * utop(itop + 1, jtop + 2, ktop) + 5.544629e-04 * utop(itop + 1, jtop + 2, ktop + 1) - 7.700874e-05 * utop(itop + 1, jtop + 2, ktop + 2) + 8.250936e-06 * utop(itop + 2, jtop - 2, ktop - 2) - 7.700874e-05 * utop(itop + 2, jtop - 2, ktop - 1) - 3.465393e-04 * utop(itop + 2, jtop - 2, ktop) + 4.620524e-05 * utop(itop + 2, jtop - 2, ktop + 1) - 6.417395e-06 * utop(itop + 2, jtop - 2, ktop + 2) - 5.940674e-05 * utop(itop + 2, jtop - 1, ktop - 2) + 5.544629e-04 * utop(itop + 2, jtop - 1, ktop - 1) + 2.495083e-03 * utop(itop + 2, jtop - 1, ktop) - 3.326777e-04 * utop(itop + 2, jtop - 1, ktop + 1) + 4.620524e-05 * utop(itop + 2, jtop - 1, ktop + 2) + 4.455505e-04 * utop(itop + 2, jtop, ktop - 2) - 4.158472e-03 * utop(itop + 2, jtop, ktop - 1) - 1.871312e-02 * utop(itop + 2, jtop, ktop) + 2.495083e-03 * utop(itop + 2, jtop, ktop + 1) - 3.465393e-04 * utop(itop + 2, jtop, ktop + 2) + 9.901123e-05 * utop(itop + 2, jtop + 1, ktop - 2) - 9.241048e-04 * utop(itop + 2, jtop + 1, ktop - 1) - 4.158472e-03 * utop(itop + 2, jtop + 1, ktop) + 5.544629e-04 * utop(itop + 2, jtop + 1, ktop + 1) - 7.700874e-05 * utop(itop + 2, jtop + 1, ktop + 2) - 
1.060835e-05 * utop(itop + 2, jtop + 2, ktop - 2) + 9.901123e-05 * utop(itop + 2, jtop + 2, ktop - 1) + 4.455505e-04 * utop(itop + 2, jtop + 2, ktop) - 5.940674e-05 * utop(itop + 2, jtop + 2, ktop + 1) + 8.250936e-06 * utop(itop + 2, jtop + 2, ktop + 2));
+	u(i + 1, j + 1, k + 1) = (+4.991307e-06 * utop(itop - 2, jtop - 2, ktop - 2) - 3.593741e-05 * utop(itop - 2, jtop - 2, ktop - 1) + 2.695306e-04 * utop(itop - 2, jtop - 2, ktop) + 5.989568e-05 * utop(itop - 2, jtop - 2, ktop + 1) - 6.417395e-06 * utop(itop - 2, jtop - 2, ktop + 2) - 3.593741e-05 * utop(itop - 2, jtop - 1, ktop - 2) + 2.587494e-04 * utop(itop - 2, jtop - 1, ktop - 1) - 1.940620e-03 * utop(itop - 2, jtop - 1, ktop) - 4.312489e-04 * utop(itop - 2, jtop - 1, ktop + 1) + 4.620524e-05 * utop(itop - 2, jtop - 1, ktop + 2) + 2.695306e-04 * utop(itop - 2, jtop, ktop - 2) - 1.940620e-03 * utop(itop - 2, jtop, ktop - 1) + 1.455465e-02 * utop(itop - 2, jtop, ktop) + 3.234367e-03 * utop(itop - 2, jtop, ktop + 1) - 3.465393e-04 * utop(itop - 2, jtop, ktop + 2) + 5.989568e-05 * utop(itop - 2, jtop + 1, ktop - 2) - 4.312489e-04 * utop(itop - 2, jtop + 1, ktop - 1) + 3.234367e-03 * utop(itop - 2, jtop + 1, ktop) + 7.187482e-04 * utop(itop - 2, jtop + 1, ktop + 1) - 7.700874e-05 * utop(itop - 2, jtop + 1, ktop + 2) - 6.417395e-06 * utop(itop - 2, jtop + 2, ktop - 2) + 4.620524e-05 * utop(itop - 2, jtop + 2, ktop - 1) - 3.465393e-04 * utop(itop - 2, jtop + 2, ktop) - 7.700874e-05 * utop(itop - 2, jtop + 2, ktop + 1) + 8.250936e-06 * utop(itop - 2, jtop + 2, ktop + 2) - 3.593741e-05 * utop(itop - 1, jtop - 2, ktop - 2) + 2.587494e-04 * utop(itop - 1, jtop - 2, ktop - 1) - 1.940620e-03 * utop(itop - 1, jtop - 2, ktop) - 4.312489e-04 * utop(itop - 1, jtop - 2, ktop + 1) + 4.620524e-05 * utop(itop - 1, jtop - 2, ktop + 2) + 2.587494e-04 * utop(itop - 1, jtop - 1, ktop - 2) - 1.862995e-03 * utop(itop - 1, jtop - 1, ktop - 1) + 1.397246e-02 * utop(itop - 1, jtop - 1, ktop) + 3.104992e-03 * utop(itop - 1, jtop - 1, ktop + 1) - 3.326777e-04 * utop(itop - 1, jtop - 1, ktop + 2) - 1.940620e-03 * utop(itop - 1, jtop, ktop - 2) + 1.397246e-02 * utop(itop - 1, jtop, ktop - 1) - 1.047935e-01 * utop(itop - 1, jtop, ktop) - 2.328744e-02 * utop(itop - 1, jtop, ktop + 1) + 
2.495083e-03 * utop(itop - 1, jtop, ktop + 2) - 4.312489e-04 * utop(itop - 1, jtop + 1, ktop - 2) + 3.104992e-03 * utop(itop - 1, jtop + 1, ktop - 1) - 2.328744e-02 * utop(itop - 1, jtop + 1, ktop) - 5.174987e-03 * utop(itop - 1, jtop + 1, ktop + 1) + 5.544629e-04 * utop(itop - 1, jtop + 1, ktop + 2) + 4.620524e-05 * utop(itop - 1, jtop + 2, ktop - 2) - 3.326777e-04 * utop(itop - 1, jtop + 2, ktop - 1) + 2.495083e-03 * utop(itop - 1, jtop + 2, ktop) + 5.544629e-04 * utop(itop - 1, jtop + 2, ktop + 1) - 5.940674e-05 * utop(itop - 1, jtop + 2, ktop + 2) + 2.695306e-04 * utop(itop, jtop - 2, ktop - 2) - 1.940620e-03 * utop(itop, jtop - 2, ktop - 1) + 1.455465e-02 * utop(itop, jtop - 2, ktop) + 3.234367e-03 * utop(itop, jtop - 2, ktop + 1) - 3.465393e-04 * utop(itop, jtop - 2, ktop + 2) - 1.940620e-03 * utop(itop, jtop - 1, ktop - 2) + 1.397246e-02 * utop(itop, jtop - 1, ktop - 1) - 1.047935e-01 * utop(itop, jtop - 1, ktop) - 2.328744e-02 * utop(itop, jtop - 1, ktop + 1) + 2.495083e-03 * utop(itop, jtop - 1, ktop + 2) + 1.455465e-02 * utop(itop, jtop, ktop - 2) - 1.047935e-01 * utop(itop, jtop, ktop - 1) + 7.859512e-01 * utop(itop, jtop, ktop) + 1.746558e-01 * utop(itop, jtop, ktop + 1) - 1.871312e-02 * utop(itop, jtop, ktop + 2) + 3.234367e-03 * utop(itop, jtop + 1, ktop - 2) - 2.328744e-02 * utop(itop, jtop + 1, ktop - 1) + 1.746558e-01 * utop(itop, jtop + 1, ktop) + 3.881240e-02 * utop(itop, jtop + 1, ktop + 1) - 4.158472e-03 * utop(itop, jtop + 1, ktop + 2) - 3.465393e-04 * utop(itop, jtop + 2, ktop - 2) + 2.495083e-03 * utop(itop, jtop + 2, ktop - 1) - 1.871312e-02 * utop(itop, jtop + 2, ktop) - 4.158472e-03 * utop(itop, jtop + 2, ktop + 1) + 4.455505e-04 * utop(itop, jtop + 2, ktop + 2) + 5.989568e-05 * utop(itop + 1, jtop - 2, ktop - 2) - 4.312489e-04 * utop(itop + 1, jtop - 2, ktop - 1) + 3.234367e-03 * utop(itop + 1, jtop - 2, ktop) + 7.187482e-04 * utop(itop + 1, jtop - 2, ktop + 1) - 7.700874e-05 * utop(itop + 1, jtop - 2, ktop + 2) - 4.312489e-04 * 
utop(itop + 1, jtop - 1, ktop - 2) + 3.104992e-03 * utop(itop + 1, jtop - 1, ktop - 1) - 2.328744e-02 * utop(itop + 1, jtop - 1, ktop) - 5.174987e-03 * utop(itop + 1, jtop - 1, ktop + 1) + 5.544629e-04 * utop(itop + 1, jtop - 1, ktop + 2) + 3.234367e-03 * utop(itop + 1, jtop, ktop - 2) - 2.328744e-02 * utop(itop + 1, jtop, ktop - 1) + 1.746558e-01 * utop(itop + 1, jtop, ktop) + 3.881240e-02 * utop(itop + 1, jtop, ktop + 1) - 4.158472e-03 * utop(itop + 1, jtop, ktop + 2) + 7.187482e-04 * utop(itop + 1, jtop + 1, ktop - 2) - 5.174987e-03 * utop(itop + 1, jtop + 1, ktop - 1) + 3.881240e-02 * utop(itop + 1, jtop + 1, ktop) + 8.624978e-03 * utop(itop + 1, jtop + 1, ktop + 1) - 9.241048e-04 * utop(itop + 1, jtop + 1, ktop + 2) - 7.700874e-05 * utop(itop + 1, jtop + 2, ktop - 2) + 5.544629e-04 * utop(itop + 1, jtop + 2, ktop - 1) - 4.158472e-03 * utop(itop + 1, jtop + 2, ktop) - 9.241048e-04 * utop(itop + 1, jtop + 2, ktop + 1) + 9.901123e-05 * utop(itop + 1, jtop + 2, ktop + 2) - 6.417395e-06 * utop(itop + 2, jtop - 2, ktop - 2) + 4.620524e-05 * utop(itop + 2, jtop - 2, ktop - 1) - 3.465393e-04 * utop(itop + 2, jtop - 2, ktop) - 7.700874e-05 * utop(itop + 2, jtop - 2, ktop + 1) + 8.250936e-06 * utop(itop + 2, jtop - 2, ktop + 2) + 4.620524e-05 * utop(itop + 2, jtop - 1, ktop - 2) - 3.326777e-04 * utop(itop + 2, jtop - 1, ktop - 1) + 2.495083e-03 * utop(itop + 2, jtop - 1, ktop) + 5.544629e-04 * utop(itop + 2, jtop - 1, ktop + 1) - 5.940674e-05 * utop(itop + 2, jtop - 1, ktop + 2) - 3.465393e-04 * utop(itop + 2, jtop, ktop - 2) + 2.495083e-03 * utop(itop + 2, jtop, ktop - 1) - 1.871312e-02 * utop(itop + 2, jtop, ktop) - 4.158472e-03 * utop(itop + 2, jtop, ktop + 1) + 4.455505e-04 * utop(itop + 2, jtop, ktop + 2) - 7.700874e-05 * utop(itop + 2, jtop + 1, ktop - 2) + 5.544629e-04 * utop(itop + 2, jtop + 1, ktop - 1) - 4.158472e-03 * utop(itop + 2, jtop + 1, ktop) - 9.241048e-04 * utop(itop + 2, jtop + 1, ktop + 1) + 9.901123e-05 * utop(itop + 2, jtop + 1, ktop + 2) + 
8.250936e-06 * utop(itop + 2, jtop + 2, ktop - 2) - 5.940674e-05 * utop(itop + 2, jtop + 2, ktop - 1) + 4.455505e-04 * utop(itop + 2, jtop + 2, ktop) + 9.901123e-05 * utop(itop + 2, jtop + 2, ktop + 1) - 1.060835e-05 * utop(itop + 2, jtop + 2, ktop + 2));
 }
 
-
-template< class S, class O, typename T >
-void solver<S,O,T>::interp_coarse_fine_cubic( unsigned ilevel, MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine, bool bcf=false )
+template <class S, class O>
+void solver<S, O>::interp_coarse_fine_cubic(unsigned ilevel, MeshvarBnd<real_t> &coarse, MeshvarBnd<real_t> &fine, bool bcf = false)
 {
-	
-	MeshvarBnd<T> *u    = &fine;
-	MeshvarBnd<T> *utop = &coarse;
-	
-	
+
+	MeshvarBnd<real_t> *u = &fine;
+	MeshvarBnd<real_t> *utop = &coarse;
+
 	bcf = true;
-	
+
 	int
-	xoff = u->offset(0),
-	yoff = u->offset(1),
-	zoff = u->offset(2);
-	
+			xoff = u->offset(0),
+			yoff = u->offset(1),
+			zoff = u->offset(2);
+
 	//... don't do anything if we are not an additional refinement region
-	if( ilevel <= m_ilevelmin )
+	if (ilevel <= m_ilevelmin)
 		return;
-	
+
 	int
-	nx = u->size(0), 
-	ny = u->size(1), 
-	nz = u->size(2);
-	
-	for( int j=0; j<ny; ++j )
-		for( int k=0; k<nz; ++k )
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
+
+	for (int j = 0; j < ny; ++j)
+		for (int k = 0; k < nz; ++k)
 		{
-			int jtop = (int)(0.5*(double)(j))+yoff;
-			int ktop = (int)(0.5*(double)(k))+zoff;
-			
-			interp_cubic( coarse, fine, -2, j, k, xoff-1,    jtop, ktop );
-			interp_cubic( coarse, fine, nz, j, k, xoff+nz/2, jtop, ktop );
-			
+			int jtop = (int)(0.5 * (double)(j)) + yoff;
+			int ktop = (int)(0.5 * (double)(k)) + zoff;
+			// x-faces: low side at fine index -2, high side at nx (was nz, which is only right when nx == nz)
+			interp_cubic(coarse, fine, -2, j, k, xoff - 1, jtop, ktop);
+			interp_cubic(coarse, fine, nx, j, k, xoff + nx / 2, jtop, ktop);
 		}
-	
-	for( int i=0; i<nx; ++i )
-		for( int k=0; k<nz; ++k )
+
+	for (int i = 0; i < nx; ++i)
+		for (int k = 0; k < nz; ++k)
 		{
-			int itop = (int)(0.5*(double)(i))+xoff;
-			int ktop = (int)(0.5*(double)(k))+zoff;
-			
-			interp_cubic( coarse, fine, i, -2, k, itop, yoff-1,    ktop );
-			interp_cubic( coarse, fine, i, ny, k, itop, yoff+ny/2, ktop );
-			
+			int itop = (int)(0.5 * (double)(i)) + xoff;
+			int ktop = (int)(0.5 * (double)(k)) + zoff;
+
+			interp_cubic(coarse, fine, i, -2, k, itop, yoff - 1, ktop);
+			interp_cubic(coarse, fine, i, ny, k, itop, yoff + ny / 2, ktop);
 		}
-	
-	for( int i=0; i<nx; ++i )
-		for( int j=0; j<ny; ++j )
+
+	for (int i = 0; i < nx; ++i)
+		for (int j = 0; j < ny; ++j)
 		{
-			int itop = (int)(0.5*(double)(i))+xoff;
-			int jtop = (int)(0.5*(double)(j))+yoff;
-			
-			interp_cubic( coarse, fine, i, j, -2, itop, jtop, zoff-1 );
-			interp_cubic( coarse, fine, i, j, nz, itop, jtop, zoff+nz/2 );
-			
+			int itop = (int)(0.5 * (double)(i)) + xoff;
+			int jtop = (int)(0.5 * (double)(j)) + yoff;
+
+			interp_cubic(coarse, fine, i, j, -2, itop, jtop, zoff - 1);
+			interp_cubic(coarse, fine, i, j, nz, itop, jtop, zoff + nz / 2);
 		}
 }
 
-
-template< class S, class O, typename T >
-void solver<S,O,T>::interp_coarse_fine( unsigned ilevel, MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine, bool bcf )
+template <class S, class O>
+void solver<S, O>::interp_coarse_fine(unsigned ilevel, MeshvarBnd<real_t> &coarse, MeshvarBnd<real_t> &fine, bool bcf)
 {
-	MeshvarBnd<T> *u    = &fine;
-	MeshvarBnd<T> *utop = &coarse;
-	
-	
-	bcf = true;;
-	//bcf = false;
-	
+	MeshvarBnd<real_t> *u = &fine;
+	MeshvarBnd<real_t> *utop = &coarse;
+
+	// The bcf argument is overridden here, so the `if (bcf)` flux-correction branches below always run.
+	bcf = true;
+	// bcf = false;
+
 	int
-		xoff = u->offset(0),
-		yoff = u->offset(1),
-		zoff = u->offset(2);
+			xoff = u->offset(0),
+			yoff = u->offset(1),
+			zoff = u->offset(2);
 
 	//... don't do anything if we are not an additional refinement region
-	if( xoff == 0 && yoff == 0 && zoff == 0 )
+	if (xoff == 0 && yoff == 0 && zoff == 0)
 		return;
-	
+
 	int
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-	
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
+
 	//... set boundary condition for fine grid
-	
-	#pragma omp parallel for schedule(dynamic)
-	for( int ix=-1; ix<=nx; ++ix )
-		for( int iy=-1; iy<=ny; ++iy )
-			for( int iz=-1; iz<=nz; ++iz )
+
+#pragma omp parallel for schedule(dynamic)
+	for (int ix = -1; ix <= nx; ++ix)
+		for (int iy = -1; iy <= ny; ++iy)
+			for (int iz = -1; iz <= nz; ++iz)
 			{
-				bool xbnd=(ix==-1||ix==nx),ybnd=(iy==-1||iy==ny),zbnd=(iz==-1||iz==nz);
-				
-				//if(ix==-1||ix==nx||iy==-1||iy==ny||iz==-1||iz==nz)
-				if( xbnd || ybnd || zbnd )
-				//if( xbnd ^ ybnd ^ zbnd )
+				bool xbnd = (ix == -1 || ix == nx), ybnd = (iy == -1 || iy == ny), zbnd = (iz == -1 || iz == nz);
+
+				// if(ix==-1||ix==nx||iy==-1||iy==ny||iz==-1||iz==nz)
+				if (xbnd || ybnd || zbnd)
+				// if( xbnd ^ ybnd ^ zbnd )
 				{
-					
+
 					//... only deal with proper ghostzones
-					if( (xbnd&&ybnd) || (xbnd&&zbnd) || (ybnd&&zbnd) || (xbnd&&ybnd&&zbnd))
+					if ((xbnd && ybnd) || (xbnd && zbnd) || (ybnd && zbnd) || (xbnd && ybnd && zbnd))
 						continue;
-					
+
 					/*int ixtop = (int)(0.5*(double)(ix+2*xoff)+1e-3);
 					int iytop = (int)(0.5*(double)(iy+2*yoff)+1e-3);
 					int iztop = (int)(0.5*(double)(iz+2*zoff)+1e-3);*/
-					
-					int ixtop = (int)(0.5*(double)(ix))+xoff;
-					int iytop = (int)(0.5*(double)(iy))+yoff;
-					int iztop = (int)(0.5*(double)(iz))+zoff;
-					
-					if( ix==-1 ) ixtop=xoff-1;
-					if( iy==-1 ) iytop=yoff-1;
-					if( iz==-1 ) iztop=zoff-1;
-					
-					double ustar1, ustar2, ustar3, uhat;			
-					double fac = 0.5;//0.25;
-					double flux;;
-				    if( ix == -1 && iy%2==0 && iz%2==0 )
+
+					int ixtop = (int)(0.5 * (double)(ix)) + xoff;
+					int iytop = (int)(0.5 * (double)(iy)) + yoff;
+					int iztop = (int)(0.5 * (double)(iz)) + zoff;
+
+					if (ix == -1)
+						ixtop = xoff - 1;
+					if (iy == -1)
+						iytop = yoff - 1;
+					if (iz == -1)
+						iztop = zoff - 1;
+
+					// ustar1..3: interp2 of coarse values along one in-face axis; uhat: interp2 of those along the other
+					double ustar1, ustar2, ustar3, uhat;
+					double fac = 0.5; // 0.25;
+					double flux;
+					if (ix == -1 && iy % 2 == 0 && iz % 2 == 0)
 					{
 						flux = 0.0;
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-							{		
-								ustar1 = interp2( (*utop)(ixtop,iytop-1,iztop-1),(*utop)(ixtop,iytop,iztop-1),(*utop)(ixtop,iytop+1,iztop-1), fac*((double)j-0.5) );
-								ustar2 = interp2( (*utop)(ixtop,iytop-1,iztop),(*utop)(ixtop,iytop,iztop),(*utop)(ixtop,iytop+1,iztop), fac*((double)j-0.5) );
-								ustar3 = interp2( (*utop)(ixtop,iytop-1,iztop+1),(*utop)(ixtop,iytop,iztop+1),(*utop)(ixtop,iytop+1,iztop+1), fac*((double)j-0.5) );
-									
-								uhat   = interp2( /*-1.0, 0.0, 1.0, */ustar1, ustar2, ustar3, fac*((double)k-0.5) );
-							
+						for (int j = 0; j <= 1; j++)
+							for (int k = 0; k <= 1; k++)
+							{
+								ustar1 = interp2((*utop)(ixtop, iytop - 1, iztop - 1), (*utop)(ixtop, iytop, iztop - 1), (*utop)(ixtop, iytop + 1, iztop - 1), fac * ((double)j - 0.5));
+								ustar2 = interp2((*utop)(ixtop, iytop - 1, iztop), (*utop)(ixtop, iytop, iztop), (*utop)(ixtop, iytop + 1, iztop), fac * ((double)j - 0.5));
+								ustar3 = interp2((*utop)(ixtop, iytop - 1, iztop + 1), (*utop)(ixtop, iytop, iztop + 1), (*utop)(ixtop, iytop + 1, iztop + 1), fac * ((double)j - 0.5));
+
+								uhat = interp2(/*-1.0, 0.0, 1.0, */ ustar1, ustar2, ustar3, fac * ((double)k - 0.5));
+
 								//(*u)(ix,iy+j,iz+k) = 0.0;//(*utop)(ixtop,iytop,iztop);//interp2( -1.5, 0.0, 1.0, uhat, (*u)(ix+1,iy+j,iz+k), (*u)(ix+2,iy+j,iz+k), -1.0 );
-								
-								(*u)(ix,iy+j,iz+k) = interp2left( uhat, (*u)(ix+1,iy+j,iz+k), (*u)(ix+2,iy+j,iz+k) );
-								
-								flux += ((*u)(ix+1,iy+j,iz+k)-(*u)(ix,iy+j,iz+k));
+
+								(*u)(ix, iy + j, iz + k) = interp2left(uhat, (*u)(ix + 1, iy + j, iz + k), (*u)(ix + 2, iy + j, iz + k));
+
+								flux += ((*u)(ix + 1, iy + j, iz + k) - (*u)(ix, iy + j, iz + k));
 							}
-						
-						
-						
+
 						flux /= 4.0;
-						
-						double dflux = ((*utop)(ixtop+1,iytop,iztop)-(*utop)(ixtop,iytop,iztop))/2.0 - flux;
-						
-						//dflux *= 2.0;
-						
-						if( bcf )
-							for( int j=0;j<=1;j++)
-								for( int k=0;k<=1;k++)
-									(*u)(ix,iy+j,iz+k) -= dflux;
+
+						double dflux = ((*utop)(ixtop + 1, iytop, iztop) - (*utop)(ixtop, iytop, iztop)) / 2.0 - flux;
+
+						// dflux *= 2.0;
+
+						if (bcf)
+							for (int j = 0; j <= 1; j++)
+								for (int k = 0; k <= 1; k++)
+									(*u)(ix, iy + j, iz + k) -= dflux;
 						else
-							(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop+1,iytop,iztop) - 2.0*flux;
-						
-						
+							(*utop)(ixtop, iytop, iztop) = (*utop)(ixtop + 1, iytop, iztop) - 2.0 * flux;
 					}
 					// right boundary
-					if( ix == nx && iy%2==0 && iz%2==0 )
+					if (ix == nx && iy % 2 == 0 && iz % 2 == 0)
 					{
 						flux = 0.0;
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-							{		
-								ustar1 = interp2( (*utop)(ixtop,iytop-1,iztop-1),(*utop)(ixtop,iytop,iztop-1),(*utop)(ixtop,iytop+1,iztop-1), fac*((double)j-0.5) );
-								ustar2 = interp2( (*utop)(ixtop,iytop-1,iztop),(*utop)(ixtop,iytop,iztop),(*utop)(ixtop,iytop+1,iztop), fac*((double)j-0.5) );
-								ustar3 = interp2( (*utop)(ixtop,iytop-1,iztop+1),(*utop)(ixtop,iytop,iztop+1),(*utop)(ixtop,iytop+1,iztop+1), fac*((double)j-0.5) );
-								
-								uhat   = interp2( -1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac*((double)k-0.5) );
-								
+						for (int j = 0; j <= 1; j++)
+							for (int k = 0; k <= 1; k++)
+							{
+								ustar1 = interp2((*utop)(ixtop, iytop - 1, iztop - 1), (*utop)(ixtop, iytop, iztop - 1), (*utop)(ixtop, iytop + 1, iztop - 1), fac * ((double)j - 0.5));
+								ustar2 = interp2((*utop)(ixtop, iytop - 1, iztop), (*utop)(ixtop, iytop, iztop), (*utop)(ixtop, iytop + 1, iztop), fac * ((double)j - 0.5));
+								ustar3 = interp2((*utop)(ixtop, iytop - 1, iztop + 1), (*utop)(ixtop, iytop, iztop + 1), (*utop)(ixtop, iytop + 1, iztop + 1), fac * ((double)j - 0.5));
+
+								uhat = interp2(-1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac * ((double)k - 0.5));
+
 								//(*u)(ix,iy+j,iz+k) = 0.0;(*utop)(ixtop,iytop,iztop);//interp2( 1.5, 0.0, -1.0, uhat, (*u)(ix-1,iy+j,iz+k), (*u)(ix-2,iy+j,iz+k), 1.0 );
-								(*u)(ix,iy+j,iz+k) = interp2right( (*u)(ix-2,iy+j,iz+k), (*u)(ix-1,iy+j,iz+k), uhat );
-								flux += ((*u)(ix,iy+j,iz+k)-(*u)(ix-1,iy+j,iz+k));
+								(*u)(ix, iy + j, iz + k) = interp2right((*u)(ix - 2, iy + j, iz + k), (*u)(ix - 1, iy + j, iz + k), uhat);
+								flux += ((*u)(ix, iy + j, iz + k) - (*u)(ix - 1, iy + j, iz + k));
 							}
 						flux /= 4.0;
-						
-						
-						double dflux = ((*utop)(ixtop,iytop,iztop)-(*utop)(ixtop-1,iytop,iztop))/2.0 - flux;
-						//dflux *= 2.0;
-						
-						if( bcf )
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-								(*u)(ix,iy+j,iz+k) += dflux;
+
+						double dflux = ((*utop)(ixtop, iytop, iztop) - (*utop)(ixtop - 1, iytop, iztop)) / 2.0 - flux;
+						// dflux *= 2.0;
+
+						if (bcf)
+							for (int j = 0; j <= 1; j++)
+								for (int k = 0; k <= 1; k++)
+									(*u)(ix, iy + j, iz + k) += dflux;
 						else
-							(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop-1,iytop,iztop) + 2.0*flux;
-						
+							(*utop)(ixtop, iytop, iztop) = (*utop)(ixtop - 1, iytop, iztop) + 2.0 * flux;
 					}
 					// bottom boundary
-					if( iy == -1 && ix%2==0 && iz%2==0 )
+					if (iy == -1 && ix % 2 == 0 && iz % 2 == 0)
 					{
 						flux = 0.0;
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
+						for (int j = 0; j <= 1; j++)
+							for (int k = 0; k <= 1; k++)
 							{
-								ustar1 = interp2( (*utop)(ixtop-1,iytop,iztop-1),(*utop)(ixtop,iytop,iztop-1),(*utop)(ixtop+1,iytop,iztop-1), fac*(j-0.5) );
-								ustar2 = interp2( (*utop)(ixtop-1,iytop,iztop),(*utop)(ixtop,iytop,iztop),(*utop)(ixtop+1,iytop,iztop), fac*(j-0.5) );
-								ustar3 = interp2( (*utop)(ixtop-1,iytop,iztop+1),(*utop)(ixtop,iytop,iztop+1),(*utop)(ixtop+1,iytop,iztop+1), fac*(j-0.5) );
-								
-								uhat   = interp2( -1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac*((double)k-0.5) );
-								
+								ustar1 = interp2((*utop)(ixtop - 1, iytop, iztop - 1), (*utop)(ixtop, iytop, iztop - 1), (*utop)(ixtop + 1, iytop, iztop - 1), fac * (j - 0.5));
+								ustar2 = interp2((*utop)(ixtop - 1, iytop, iztop), (*utop)(ixtop, iytop, iztop), (*utop)(ixtop + 1, iytop, iztop), fac * (j - 0.5));
+								ustar3 = interp2((*utop)(ixtop - 1, iytop, iztop + 1), (*utop)(ixtop, iytop, iztop + 1), (*utop)(ixtop + 1, iytop, iztop + 1), fac * (j - 0.5));
+
+								uhat = interp2(-1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac * ((double)k - 0.5));
+
 								//(*u)(ix+j,iy,iz+k) = 0.0;(*utop)(ixtop,iytop,iztop);//interp2( -1.5, 0.0, 1.0, uhat, (*u)(ix+j,iy+1,iz+k), (*u)(ix+j,iy+2,iz+k), -1.0 );
-								(*u)(ix+j,iy,iz+k) = interp2left( uhat, (*u)(ix+j,iy+1,iz+k), (*u)(ix+j,iy+2,iz+k) );
-								
-								flux += ((*u)(ix+j,iy+1,iz+k)-(*u)(ix+j,iy,iz+k));
+								(*u)(ix + j, iy, iz + k) = interp2left(uhat, (*u)(ix + j, iy + 1, iz + k), (*u)(ix + j, iy + 2, iz + k));
+
+								flux += ((*u)(ix + j, iy + 1, iz + k) - (*u)(ix + j, iy, iz + k));
 							}
 						flux /= 4.0;
 						//(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop,iytop+1,iztop) - flux;
-						double dflux = ((*utop)(ixtop,iytop+1,iztop)-(*utop)(ixtop,iytop,iztop))/2.0 - flux;
-						//dflux *= 2.0;
-						if( bcf )
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-								(*u)(ix+j,iy,iz+k) -= dflux;
+						double dflux = ((*utop)(ixtop, iytop + 1, iztop) - (*utop)(ixtop, iytop, iztop)) / 2.0 - flux;
+						// dflux *= 2.0;
+						if (bcf)
+							for (int j = 0; j <= 1; j++)
+								for (int k = 0; k <= 1; k++)
+									(*u)(ix + j, iy, iz + k) -= dflux;
 						else
-							(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop,iytop+1,iztop) - 2.0*flux;
-						
+							(*utop)(ixtop, iytop, iztop) = (*utop)(ixtop, iytop + 1, iztop) - 2.0 * flux;
 					}
 					// top boundary
-					if( iy == ny && ix%2==0 && iz%2==0 )
+					if (iy == ny && ix % 2 == 0 && iz % 2 == 0)
 					{
 						flux = 0.0;
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-							{		
-								ustar1 = interp2( (*utop)(ixtop-1,iytop,iztop-1),(*utop)(ixtop,iytop,iztop-1),(*utop)(ixtop+1,iytop,iztop-1), fac*(j-0.5) );
-								ustar2 = interp2( (*utop)(ixtop-1,iytop,iztop),(*utop)(ixtop,iytop,iztop),(*utop)(ixtop+1,iytop,iztop), fac*(j-0.5) );
-								ustar3 = interp2( (*utop)(ixtop-1,iytop,iztop+1),(*utop)(ixtop,iytop,iztop+1),(*utop)(ixtop+1,iytop,iztop+1), fac*(j-0.5) );
-								
-								uhat   = interp2( -1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac*((double)k-0.5) );
-								
+						for (int j = 0; j <= 1; j++)
+							for (int k = 0; k <= 1; k++)
+							{
+								ustar1 = interp2((*utop)(ixtop - 1, iytop, iztop - 1), (*utop)(ixtop, iytop, iztop - 1), (*utop)(ixtop + 1, iytop, iztop - 1), fac * (j - 0.5));
+								ustar2 = interp2((*utop)(ixtop - 1, iytop, iztop), (*utop)(ixtop, iytop, iztop), (*utop)(ixtop + 1, iytop, iztop), fac * (j - 0.5));
+								ustar3 = interp2((*utop)(ixtop - 1, iytop, iztop + 1), (*utop)(ixtop, iytop, iztop + 1), (*utop)(ixtop + 1, iytop, iztop + 1), fac * (j - 0.5));
+
+								uhat = interp2(-1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac * ((double)k - 0.5));
+
 								//(*u)(ix+j,iy,iz+k) = 0.0;(*utop)(ixtop,iytop,iztop);//interp2( 1.5, 0.0, -1.0, uhat, (*u)(ix+j,iy-1,iz+k), (*u)(ix+j,iy-2,iz+k), 1.0 );
-								(*u)(ix+j,iy,iz+k) = interp2right( (*u)(ix+j,iy-2,iz+k), (*u)(ix+j,iy-1,iz+k), uhat  );
-								
-								flux += ((*u)(ix+j,iy,iz+k)-(*u)(ix+j,iy-1,iz+k));
+								(*u)(ix + j, iy, iz + k) = interp2right((*u)(ix + j, iy - 2, iz + k), (*u)(ix + j, iy - 1, iz + k), uhat);
+
+								flux += ((*u)(ix + j, iy, iz + k) - (*u)(ix + j, iy - 1, iz + k));
 							}
 						flux /= 4.0;
 						//(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop,iytop-1,iztop) + flux;
-						double dflux = ((*utop)(ixtop,iytop,iztop)-(*utop)(ixtop,iytop-1,iztop))/2.0 - flux;
-						//dflux *= 2.0;
-						if( bcf )
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-								(*u)(ix+j,iy,iz+k) += dflux;
+						double dflux = ((*utop)(ixtop, iytop, iztop) - (*utop)(ixtop, iytop - 1, iztop)) / 2.0 - flux;
+						// dflux *= 2.0;
+						if (bcf)
+							for (int j = 0; j <= 1; j++)
+								for (int k = 0; k <= 1; k++)
+									(*u)(ix + j, iy, iz + k) += dflux;
 						else
-							(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop,iytop-1,iztop) + 2.0*flux;
-						
+							(*utop)(ixtop, iytop, iztop) = (*utop)(ixtop, iytop - 1, iztop) + 2.0 * flux;
 					}
 					// front boundary
-					if( iz == -1 && ix%2==0 && iy%2==0 )
+					if (iz == -1 && ix % 2 == 0 && iy % 2 == 0)
 					{
 						flux = 0.0;
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-							{		
-								ustar1 = interp2( (*utop)(ixtop-1,iytop-1,iztop),(*utop)(ixtop,iytop-1,iztop),(*utop)(ixtop+1,iytop-1,iztop), fac*(j-0.5) );
-								ustar2 = interp2( (*utop)(ixtop-1,iytop,iztop),(*utop)(ixtop,iytop,iztop),(*utop)(ixtop+1,iytop,iztop), fac*(j-0.5) );
-								ustar3 = interp2( (*utop)(ixtop-1,iytop+1,iztop),(*utop)(ixtop,iytop+1,iztop),(*utop)(ixtop+1,iytop+1,iztop), fac*(j-0.5) );
-								
-								uhat   = interp2( -1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac*((double)k-0.5) );
-								
+						for (int j = 0; j <= 1; j++)
+							for (int k = 0; k <= 1; k++)
+							{
+								ustar1 = interp2((*utop)(ixtop - 1, iytop - 1, iztop), (*utop)(ixtop, iytop - 1, iztop), (*utop)(ixtop + 1, iytop - 1, iztop), fac * (j - 0.5));
+								ustar2 = interp2((*utop)(ixtop - 1, iytop, iztop), (*utop)(ixtop, iytop, iztop), (*utop)(ixtop + 1, iytop, iztop), fac * (j - 0.5));
+								ustar3 = interp2((*utop)(ixtop - 1, iytop + 1, iztop), (*utop)(ixtop, iytop + 1, iztop), (*utop)(ixtop + 1, iytop + 1, iztop), fac * (j - 0.5));
+
+								uhat = interp2(-1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac * ((double)k - 0.5));
+
 								//(*u)(ix+j,iy+k,iz) = 0.0;(*utop)(ixtop,iytop,iztop);//interp2( -1.5, 0.0, 1.0, uhat, (*u)(ix+j,iy+k,iz+1), (*u)(ix+j,iy+k,iz+2), -1.0 );
-								(*u)(ix+j,iy+k,iz) = interp2left( uhat, (*u)(ix+j,iy+k,iz+1), (*u)(ix+j,iy+k,iz+2) );
-								
-								flux += ((*u)(ix+j,iy+k,iz+1)-(*u)(ix+j,iy+k,iz));
+								(*u)(ix + j, iy + k, iz) = interp2left(uhat, (*u)(ix + j, iy + k, iz + 1), (*u)(ix + j, iy + k, iz + 2));
+
+								flux += ((*u)(ix + j, iy + k, iz + 1) - (*u)(ix + j, iy + k, iz));
 							}
 						flux /= 4.0;
 						//(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop,iytop,iztop+1) - flux;
-						double dflux = ((*utop)(ixtop,iytop,iztop+1)-(*utop)(ixtop,iytop,iztop))/2.0 - flux;
-						//dflux *= 2.0;
-						if( bcf )
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-								(*u)(ix+j,iy+k,iz) -= dflux;
+						double dflux = ((*utop)(ixtop, iytop, iztop + 1) - (*utop)(ixtop, iytop, iztop)) / 2.0 - flux;
+						// dflux *= 2.0;
+						if (bcf)
+							for (int j = 0; j <= 1; j++)
+								for (int k = 0; k <= 1; k++)
+									(*u)(ix + j, iy + k, iz) -= dflux;
 						else
-							(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop,iytop,iztop+1) - 2.0*flux;
-						
+							(*utop)(ixtop, iytop, iztop) = (*utop)(ixtop, iytop, iztop + 1) - 2.0 * flux;
 					}
 					// back boundary
-					if( iz == nz && ix%2==0 && iy%2==0 )
+					if (iz == nz && ix % 2 == 0 && iy % 2 == 0)
 					{
 						flux = 0.0;
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-							{		
-								ustar1 = interp2( (*utop)(ixtop-1,iytop-1,iztop),(*utop)(ixtop,iytop-1,iztop),(*utop)(ixtop+1,iytop-1,iztop), fac*(j-0.5) );
-								ustar2 = interp2( (*utop)(ixtop-1,iytop,iztop),(*utop)(ixtop,iytop,iztop),(*utop)(ixtop+1,iytop,iztop), fac*(j-0.5) );
-								ustar3 = interp2( (*utop)(ixtop-1,iytop+1,iztop),(*utop)(ixtop,iytop+1,iztop),(*utop)(ixtop+1,iytop+1,iztop), fac*(j-0.5) );
-								
-								uhat   = interp2( -1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac*((double)k-0.5) );
-								
+						for (int j = 0; j <= 1; j++)
+							for (int k = 0; k <= 1; k++)
+							{
+								ustar1 = interp2((*utop)(ixtop - 1, iytop - 1, iztop), (*utop)(ixtop, iytop - 1, iztop), (*utop)(ixtop + 1, iytop - 1, iztop), fac * (j - 0.5));
+								ustar2 = interp2((*utop)(ixtop - 1, iytop, iztop), (*utop)(ixtop, iytop, iztop), (*utop)(ixtop + 1, iytop, iztop), fac * (j - 0.5));
+								ustar3 = interp2((*utop)(ixtop - 1, iytop + 1, iztop), (*utop)(ixtop, iytop + 1, iztop), (*utop)(ixtop + 1, iytop + 1, iztop), fac * (j - 0.5));
+
+								uhat = interp2(-1.0, 0.0, 1.0, ustar1, ustar2, ustar3, fac * ((double)k - 0.5));
+
 								//(*u)(ix+j,iy+k,iz) = 0.0;(*utop)(ixtop,iytop,iztop);//interp2( 1.5, 0.0, -1.0, uhat, (*u)(ix+j,iy+k,iz-1), (*u)(ix+j,iy+k,iz-2), 1.0 );
-								(*u)(ix+j,iy+k,iz) = interp2right( (*u)(ix+j,iy+k,iz-2), (*u)(ix+j,iy+k,iz-1), uhat );
-								
-								flux += ((*u)(ix+j,iy+k,iz)-(*u)(ix+j,iy+k,iz-1));
+								(*u)(ix + j, iy + k, iz) = interp2right((*u)(ix + j, iy + k, iz - 2), (*u)(ix + j, iy + k, iz - 1), uhat);
+
+								flux += ((*u)(ix + j, iy + k, iz) - (*u)(ix + j, iy + k, iz - 1));
 							}
 						flux /= 4.0;
 						//(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop,iytop,iztop-1) + flux;
-						double dflux = ((*utop)(ixtop,iytop,iztop)-(*utop)(ixtop,iytop,iztop-1))/2.0 - flux;
-						//dflux *= 2.0;
-						if( bcf )
-						for( int j=0;j<=1;j++)
-							for( int k=0;k<=1;k++)
-								(*u)(ix+j,iy+k,iz) += dflux;
+						double dflux = ((*utop)(ixtop, iytop, iztop) - (*utop)(ixtop, iytop, iztop - 1)) / 2.0 - flux;
+						// dflux *= 2.0;
+						if (bcf)
+							for (int j = 0; j <= 1; j++)
+								for (int k = 0; k <= 1; k++)
+									(*u)(ix + j, iy + k, iz) += dflux;
 						else
-							(*utop)(ixtop,iytop,iztop) = (*utop)(ixtop,iytop,iztop-1) + 2.0*flux;
+							(*utop)(ixtop, iytop, iztop) = (*utop)(ixtop, iytop, iztop - 1) + 2.0 * flux;
 					}
-					
 				}
 			}
-	
 }
 
-
 #if 1
-template< class S, class O, typename T >
-void solver<S,O,T>::setBC( unsigned ilevel )
+template <class S, class O>
+void solver<S, O>::setBC(unsigned ilevel)
 {
 	//... set only on level before additional refinement starts
-	//if( ilevel == m_ilevelmin )
-	if( ilevel == m_ilevelmin )
+	// if( ilevel == m_ilevelmin )
+	if (ilevel == m_ilevelmin)
 	{
-		MeshvarBnd<T> *u = m_pu->get_grid(ilevel);
-		//int nbnd = u->m_nbnd,
+		MeshvarBnd<real_t> *u = m_pu->get_grid(ilevel);
+		// int nbnd = u->m_nbnd,
 		int
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-		
+				nx = u->size(0),
+				ny = u->size(1),
+				nz = u->size(2);
+
 		/*for( int ix=-nbnd; ix<nx+nbnd; ++ix )
 			for( int iy=-nbnd; iy<ny+nbnd; ++iy )
 				for( int iz=-nbnd; iz<nz+nbnd; ++iz )
 					if( ix<0||ix>=nx||iy<0||iy>=ny||iz<0||iz>=nz )
 						(*u)(ix,iy,iz) = (*m_pubnd)(ix,iy,iz);*/
-		
-		
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
-			{
-				(*u)(-1,iy,iz) = 2.0*(*m_pubnd)(-1,iy,iz) - (*u)(0,iy,iz);
-				(*u)(nx,iy,iz) = 2.0*(*m_pubnd)(nx,iy,iz) - (*u)(nx-1,iy,iz);;
-			}
-		
-		for( int ix=0; ix<nx; ++ix )
-			for( int iz=0; iz<nz; ++iz )
-			{
-				(*u)(ix,-1,iz) = 2.0*(*m_pubnd)(ix,-1,iz) - (*u)(ix,0,iz);
-				(*u)(ix,ny,iz) = 2.0*(*m_pubnd)(ix,ny,iz) - (*u)(ix,ny-1,iz);
-			}
-		
-		for( int ix=0; ix<nx; ++ix )
-			for( int iy=0; iy<ny; ++iy )
-			{
-				(*u)(ix,iy,-1) = 2.0*(*m_pubnd)(ix,iy,-1) - (*u)(ix,iy,0);
-				(*u)(ix,iy,nz) = 2.0*(*m_pubnd)(ix,iy,nz) - (*u)(ix,iy,nz-1);
-			}		
-		
-		
-		
-	}/*else if( ilevel < m_ilevelmin ) {
-		MeshvarBnd<T> *u = m_pu->get_grid(ilevel);
-		int nbnd = u->m_nbnd,
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-		
-		for( int ix=-nbnd; ix<nx+nbnd; ++ix )
-			for( int iy=-nbnd; iy<ny+nbnd; ++iy )
-				for( int iz=-nbnd; iz<nz+nbnd; ++iz )
-					if( ix<0||ix>=nx||iy<0||iy>=ny||iz<0||iz>=nz )
-						(*u)(ix,iy,iz) = 0.0;
-	}*/
 
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
+			{
+				(*u)(-1, iy, iz) = 2.0 * (*m_pubnd)(-1, iy, iz) - (*u)(0, iy, iz);
+				(*u)(nx, iy, iz) = 2.0 * (*m_pubnd)(nx, iy, iz) - (*u)(nx - 1, iy, iz);
+				;
+			}
 
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iz = 0; iz < nz; ++iz)
+			{
+				(*u)(ix, -1, iz) = 2.0 * (*m_pubnd)(ix, -1, iz) - (*u)(ix, 0, iz);
+				(*u)(ix, ny, iz) = 2.0 * (*m_pubnd)(ix, ny, iz) - (*u)(ix, ny - 1, iz);
+			}
+
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
+			{
+				(*u)(ix, iy, -1) = 2.0 * (*m_pubnd)(ix, iy, -1) - (*u)(ix, iy, 0);
+				(*u)(ix, iy, nz) = 2.0 * (*m_pubnd)(ix, iy, nz) - (*u)(ix, iy, nz - 1);
+			}
+
+	} /*else if( ilevel < m_ilevelmin ) {
+		 MeshvarBnd<real_t> *u = m_pu->get_grid(ilevel);
+		 int nbnd = u->m_nbnd,
+		 nx = u->size(0),
+		 ny = u->size(1),
+		 nz = u->size(2);
+
+		 for( int ix=-nbnd; ix<nx+nbnd; ++ix )
+			 for( int iy=-nbnd; iy<ny+nbnd; ++iy )
+				 for( int iz=-nbnd; iz<nz+nbnd; ++iz )
+					 if( ix<0||ix>=nx||iy<0||iy>=ny||iz<0||iz>=nz )
+						 (*u)(ix,iy,iz) = 0.0;
+	 }*/
 }
 
 #else
 
 //... enforce periodic boundary conditions
-template< class S, class O, typename T >
-void solver<S,O,T>::setBC( unsigned ilevel )
+template <class S, class O>
+void solver<S, O>::setBC(unsigned ilevel)
 {
-	MeshvarBnd<T> *u = m_pu->get_grid(ilevel);
-	
+	MeshvarBnd<real_t> *u = m_pu->get_grid(ilevel);
+
 	//... set only on level before additional refinement starts
-	if( ilevel <= m_ilevelmin )
+	if (ilevel <= m_ilevelmin)
 	{
-		
+
 		int nbnd = u->m_nbnd,
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-		
+				nx = u->size(0),
+				ny = u->size(1),
+				nz = u->size(2);
+
 		//(*u)(0,0,0) = 0.0;
-		
-		
+
 		double sum = 0.0;
-		for( int ix=0; ix<nx; ++ix )
-			for( int iy=0; iy<ny; ++iy )
-				for( int iz=0; iz<nz; ++iz )
-					sum += (*u)(ix,iy,iz);
-		sum /= (nx*ny*nz);
-		for( int ix=0; ix<nx; ++ix )
-			for( int iy=0; iy<ny; ++iy )
-				for( int iz=0; iz<nz; ++iz )
-					(*u)(ix,iy,iz) -= sum;
-		
-		
-		
-		
-		for( int iy=0; iy<ny; ++iy )
-			for( int iz=0; iz<nz; ++iz )
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
+				for (int iz = 0; iz < nz; ++iz)
+					sum += (*u)(ix, iy, iz);
+		sum /= (nx * ny * nz);
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
+				for (int iz = 0; iz < nz; ++iz)
+					(*u)(ix, iy, iz) -= sum;
+
+		for (int iy = 0; iy < ny; ++iy)
+			for (int iz = 0; iz < nz; ++iz)
 			{
-				(*u)(-1,iy,iz) = (*u)(nx-1,iy,iz);
-				(*u)(nx,iy,iz) = (*u)(0,iy,iz);
+				(*u)(-1, iy, iz) = (*u)(nx - 1, iy, iz);
+				(*u)(nx, iy, iz) = (*u)(0, iy, iz);
 			}
-		
-		for( int ix=0; ix<nx; ++ix )
-			for( int iz=0; iz<nz; ++iz )
+
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iz = 0; iz < nz; ++iz)
 			{
-				(*u)(ix,-1,iz) = (*u)(ix,ny-1,iz);
-				(*u)(ix,ny,iz) = (*u)(ix,0,iz);
+				(*u)(ix, -1, iz) = (*u)(ix, ny - 1, iz);
+				(*u)(ix, ny, iz) = (*u)(ix, 0, iz);
 			}
-					
-		for( int ix=0; ix<nx; ++ix )
-			for( int iy=0; iy<ny; ++iy )
+
+		for (int ix = 0; ix < nx; ++ix)
+			for (int iy = 0; iy < ny; ++iy)
 			{
-				(*u)(ix,iy,-1) = (*u)(ix,iy,nz-1);
-				(*u)(ix,iy,nz) = (*u)(ix,iy,0);
-			}	
-		
-		
-		
+				(*u)(ix, iy, -1) = (*u)(ix, iy, nz - 1);
+				(*u)(ix, iy, nz) = (*u)(ix, iy, 0);
+			}
 	}
-	
 }
 #endif
 
-
 //... enforce periodic boundary conditions
-template< class S, class O, typename T >
-void solver<S,O,T>::make_periodic( MeshvarBnd<T> *u )
+template <class S, class O>
+void solver<S, O>::make_periodic(MeshvarBnd<real_t> *u)
 {
 
 	int
-		nx = u->size(0), 
-		ny = u->size(1), 
-		nz = u->size(2);
-		
+			nx = u->size(0),
+			ny = u->size(1),
+			nz = u->size(2);
 
-	#pragma omp parallel
+#pragma omp parallel
 	{
-		
-		if( u->offset(0) == 0 )
-			for( int iy=0; iy<ny; ++iy )
-				for( int iz=0; iz<nz; ++iz )
+
+		if (u->offset(0) == 0)
+			for (int iy = 0; iy < ny; ++iy)
+				for (int iz = 0; iz < nz; ++iz)
 				{
-					(*u)(-1,iy,iz) = (*u)(nx-1,iy,iz);
-					(*u)(nx,iy,iz) = (*u)(0,iy,iz);
+					(*u)(-1, iy, iz) = (*u)(nx - 1, iy, iz);
+					(*u)(nx, iy, iz) = (*u)(0, iy, iz);
 				}
-		
-		if( u->offset(1) == 0 )
-			for( int ix=0; ix<nx; ++ix )
-				for( int iz=0; iz<nz; ++iz )
+
+		if (u->offset(1) == 0)
+			for (int ix = 0; ix < nx; ++ix)
+				for (int iz = 0; iz < nz; ++iz)
 				{
-					(*u)(ix,-1,iz) = (*u)(ix,ny-1,iz);
-					(*u)(ix,ny,iz) = (*u)(ix,0,iz);
+					(*u)(ix, -1, iz) = (*u)(ix, ny - 1, iz);
+					(*u)(ix, ny, iz) = (*u)(ix, 0, iz);
 				}
-		
-		if( u->offset(2) == 0 )
-			for( int ix=0; ix<nx; ++ix )
-				for( int iy=0; iy<ny; ++iy )
+
+		if (u->offset(2) == 0)
+			for (int ix = 0; ix < nx; ++ix)
+				for (int iy = 0; iy < ny; ++iy)
 				{
-					(*u)(ix,iy,-1) = (*u)(ix,iy,nz-1);
-					(*u)(ix,iy,nz) = (*u)(ix,iy,0);
-				}									  
-			
+					(*u)(ix, iy, -1) = (*u)(ix, iy, nz - 1);
+					(*u)(ix, iy, nz) = (*u)(ix, iy, 0);
+				}
 	}
 }
 
 END_MULTIGRID_NAMESPACE
- 
-#endif
diff --git a/src/system_stat.hh b/src/system_stat.hh
new file mode 100644
index 0000000..1a30566
--- /dev/null
+++ b/src/system_stat.hh
@@ -0,0 +1,194 @@
+// This file is part of MUSIC2
+// A software package to generate ICs for cosmological simulations
+// Copyright (C) 2020-23 by Oliver Hahn
+// 
+// MUSIC2 is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// 
+// MUSIC2 is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#pragma once
+
+#ifdef __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <mach/mach.h>
+#include <mach/vm_statistics.h>
+#include <mach/mach_types.h>
+#include <mach/mach_init.h>
+#include <mach/mach_host.h>
+#include <unistd.h>
+#elif __linux__
+#include <cstring>
+#include <cstdio>
+#include <strings.h>
+#endif
+
+#include <sys/utsname.h>
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <string>
+
+namespace SystemStat
+{
+/// Queries a human-readable description of the host CPU.
+class Cpu
+{
+public:
+    Cpu() {}
+
+    /// Returns the CPU model name, or "" if it cannot be determined: macOS
+    /// asks sysctl for "machdep.cpu.brand_string", Linux takes the first
+    /// "model name" entry of /proc/cpuinfo; other platforms always yield "".
+    std::string get_CPUstring() const
+    {
+        std::string str;
+#ifdef __APPLE__
+        char buffer[1024] = {0};
+        size_t size = sizeof(buffer) - 1; // keep the final byte as terminator
+        if (sysctlbyname("machdep.cpu.brand_string", buffer, &size, NULL, 0) == 0)
+            str = buffer;
+#elif __linux__
+        FILE *cpuinfo = fopen("/proc/cpuinfo", "rb");
+        if (cpuinfo == NULL) return str; // e.g. /proc is not mounted
+        char *arg = 0;
+        size_t size = 0;
+        while (str.empty() && getdelim(&arg, &size, '\n', cpuinfo) != -1)
+        {
+            const char *sep = strncmp(arg, "model name", 10) == 0 ? strchr(arg, ':') : NULL;
+            if (sep != NULL) str = sep + 1; // value follows the colon, at no fixed column
+        }
+        free(arg);
+        fclose(cpuinfo);
+        // strip the padding after the colon and the trailing newline
+        str.erase(0, str.find_first_not_of(" \t"));
+        str.erase(str.find_last_not_of(" \t\r\n") + 1);
+#endif
+        return str;
+    }
+};
+
+/// Snapshot of system memory in bytes: total physical memory, memory still
+/// available and memory in use. The snapshot is taken on construction and
+/// refreshed only by an explicit update().
+class Memory
+{
+private:
+    size_t total; ///< physical memory [bytes]
+    size_t avail; ///< memory counted as available [bytes]
+    size_t used;  ///< memory counted as in use [bytes]
+
+public:
+    Memory()
+        : total(0), avail(0), used(0)
+    {
+        this->get_statistics();
+    }
+
+    // accessors, all in bytes
+    size_t get_TotalMem() const { return this->total; }
+    size_t get_AvailMem() const { return this->avail; }
+    size_t get_UsedMem() const { return this->used; }
+    void update() { this->get_statistics(); }
+
+protected:
+    /// Re-reads the counters from the OS: Mach host statistics on macOS,
+    /// /proc/meminfo on Linux; nothing is read on other platforms.
+    /// Always returns 0, also when a source could not be read.
+    int get_statistics(void)
+    {
+#ifdef __APPLE__
+        int64_t pagesize = int64_t(getpagesize());
+        int mib[2] = {CTL_HW, HW_MEMSIZE};
+        size_t length = sizeof(size_t);
+        sysctl(mib, 2, &this->total, &length, nullptr, 0);
+
+        vm_statistics64 vmstat;
+        natural_t mcount = HOST_VM_INFO64_COUNT;
+        if (host_statistics64(mach_host_self(), HOST_VM_INFO64, reinterpret_cast<host_info64_t>(&vmstat), &mcount) == KERN_SUCCESS)
+        {
+#if 1 // count inactive as available
+            this->avail = (int64_t(vmstat.free_count) +
+                           int64_t(vmstat.inactive_count)) *
+                          pagesize;
+            this->used = (int64_t(vmstat.active_count) +
+                          int64_t(vmstat.wire_count)) *
+                         pagesize;
+#else // count inactive as unavailable
+            this->avail = int64_t(vmstat.free_count) * pagesize;
+            this->used = (int64_t(vmstat.active_count) +
+                          int64_t(vmstat.inactive_count) +
+                          int64_t(vmstat.wire_count)) *
+                         pagesize;
+#endif
+        }
+
+#elif __linux__
+        FILE *fd;
+        char buf[1024];
+        if ((fd = fopen("/proc/meminfo", "r")))
+        {
+            while (fgets(buf, sizeof(buf), fd) == buf)
+            {
+                // /proc/meminfo lists kB; the members hold bytes
+                if (strncmp(buf, "MemTotal:", 9) == 0)
+                    this->total = size_t(atoll(buf + 9)) * 1024;
+                else if (strncmp(buf, "Committed_AS:", 13) == 0)
+                    this->used = size_t(atoll(buf + 13)) * 1024;
+            }
+            fclose(fd);
+        }
+
+        // NOTE(review): Committed_AS is memory promised to processes, whether
+        // touched or not; MemAvailable may be the better basis for `avail` --
+        // confirm what callers expect.
+        // Committed_AS can exceed MemTotal under overcommit, so clamp rather
+        // than let the unsigned subtraction wrap around to a huge value.
+        this->avail = (this->total > this->used) ? this->total - this->used : 0;
+
+#endif
+        return 0;
+    }
+};
+
+/// Reports name and version of the running operating-system kernel.
+class Kernel
+{
+public:
+    struct info_t
+    {
+        std::string kernel;         ///< utsname::sysname
+        std::uint32_t major;        ///< 1st number of utsname::release
+        std::uint32_t minor;        ///< 2nd number
+        std::uint32_t patch;        ///< 3rd number
+        std::uint32_t build_number; ///< 4th number
+    };
+
+    Kernel() {}
+
+    /// Parses the release string from uname(): "5.15.0-91-generic" yields
+    /// 5, 15, 0, 91; absent fields are 0. On uname() failure all is 0 / empty.
+    info_t get_kernel_info()
+    {
+        utsname uts;
+        if (uname(&uts) < 0) return {std::string(), 0, 0, 0, 0};
+        std::uint32_t v[4] = {0, 0, 0, 0};
+        char *marker = uts.release;
+        for (int i = 0; i < 4; ++i, ++marker) // ++marker skips one separator
+        {
+            v[i] = std::strtoul(marker, &marker, 10);
+            if (*marker == '\0') break; // never step over the terminator
+        }
+        return {uts.sysname, v[0], v[1], v[2], v[3]};
+    }
+};
+
+} /* namespace SystemStat */
diff --git a/src/transfer_function.hh b/src/transfer_function.hh
index 35fa92b..cf368b3 100644
--- a/src/transfer_function.hh
+++ b/src/transfer_function.hh
@@ -294,10 +294,10 @@ protected:
 		
 		double fftnorm = 1.0/N;
 		
-		fftw_complex *in, *out;
+		complex_t *in, *out;
 		
-		in = new fftw_complex[N];
-		out = new fftw_complex[N];
+		in = new complex_t[N];
+		out = new complex_t[N];
 		
 		//... perform anti-ringing correction from Hamilton (2000)
 		k0r0 = krgood( mu, q, dlnr, k0r0 );
@@ -341,24 +341,10 @@ protected:
 		}
 		ofsk.close();
 		
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-		fftwf_plan p,ip;
-		p = fftwf_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
-		ip = fftwf_plan_dft_1d(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
-		fftwf_execute(p);
-	#else
-		fftw_plan p,ip;
-		p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
-		ip = fftw_plan_dft_1d(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
-		fftw_execute(p);
-	#endif
-#else
-		fftw_plan p,ip;
-		p = fftw_create_plan(N, FFTW_FORWARD, FFTW_ESTIMATE);
-		ip = fftw_create_plan(N, FFTW_BACKWARD, FFTW_ESTIMATE);
-		fftw_one(p, in, out);
-#endif
+		fftw_plan_t p,ip;
+		p = FFTW_API(plan_dft_1d)(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+		ip = FFTW_API(plan_dft_1d)(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
+		FFTW_API(execute)(p);
 		
 		//... compute the Hankel transform by convolution with the Bessel function
 		for( unsigned i=0; i<N; ++i )
@@ -429,16 +415,8 @@ protected:
 #endif
 
 		}
-		
-#ifdef FFTW3
-	#ifdef SINGLE_PRECISION
-		fftwf_execute(ip);
-	#else
-		fftw_execute(ip);
-	#endif
-#else
-		fftw_one(ip, out, in);
-#endif
+
+		FFTW_API(execute)(ip);
 		
 		rr.assign(N,0.0);
 		TT.assign(N,0.0);
@@ -481,14 +459,9 @@ protected:
 		
 		delete[] in;
 		delete[] out;
-		
-#if defined(FFTW3) && defined(SINGLE_PRECISION)
-		fftwf_destroy_plan(p);
-		fftwf_destroy_plan(ip);
-#else
-		fftw_destroy_plan(p);
-		fftw_destroy_plan(ip);
-#endif
+
+		FFTW_API(destroy_plan)(p);
+		FFTW_API(destroy_plan)(ip);
 	}
 	std::vector<real_t> m_xtable,m_ytable,m_dytable;
 	double m_xmin, m_xmax, m_dx, m_rdx;