Merged in develop (pull request #24)

Merge Panphasia development into master branch
2024-09-19 17:03:45 +02:00 · 2021-11-11 20:16:13 +00:00 · 2021-11-11 20:16:13 +00:00 · 114b81051e
commit 114b81051e
parent 2fad483587 3607afb244
35 changed files with 10250 additions and 529 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -149,8 +149,11 @@ elseif("${CMAKE_Fortran_COMPILER_ID}" MATCHES "GNU")
 endif()
 endif(ENABLE_PANPHASIA)
 ########################################################################################################################
+# PANPHASIA HO (High-Order, new version)
+# option(ENABLE_PANPHASIA_HO "Enable PANPHASIA-HO random number generator" ON)
+########################################################################################################################
 # INCLUDES
-include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/external)

 # SOURCES
 # get all the *.cc files in the subfolders
@ -170,7 +173,12 @@ if(ENABLE_PANPHASIA)
 list (APPEND SOURCES 
  ${PROJECT_SOURCE_DIR}/external/panphasia/panphasia_routines.f
  ${PROJECT_SOURCE_DIR}/external/panphasia/generic_lecuyer.f90
+  #${PROJECT_SOURCE_DIR}/external/panphasia_ho/main.c
+  ${PROJECT_SOURCE_DIR}/external/panphasia_ho/high_order_panphasia_routines.c
+  ${PROJECT_SOURCE_DIR}/external/panphasia_ho/pan_mpi_routines.c
+  ${PROJECT_SOURCE_DIR}/external/panphasia_ho/uniform_rand_threefry4x64.c
 )
+# target_include_directories(${PRGNAME} PRIVATE ${PROJECT_SOURCE_DIR}/external/panphasia_ho)
 endif()

 # project configuration header
@ -246,6 +254,10 @@ if(ENABLE_PANPHASIA)
  target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA")
 endif(ENABLE_PANPHASIA)

+# if(ENABLE_PANPHASIA_HO)
+#   target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA_HO")
+# endif(ENABLE_PANPHASIA_HO)
+
 if(ENABLE_PLT)
  target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_PLT")
 endif(ENABLE_PLT)
@ -272,4 +284,4 @@ if(ENABLE_GENERICIO)
  include(${CMAKE_CURRENT_SOURCE_DIR}/external/genericio.cmake)
  target_link_libraries(${PRGNAME} PRIVATE genericio::genericio_mpi)
  target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_GENERICIO")
-endif()
+endif()
--- a/example.conf
+++ b/example.conf
@ -16,7 +16,7 @@ LPTorder        = 3        # order of the LPT to be used (1,2 or 3)
 DoBaryons       = no       # also do baryon ICs?
 DoBaryonVrel    = no       # if doing baryons, incl. also relative velocity to linear order?

-DoFixing        = yes      # do mode fixing à la Angulo&Pontzen (https://arxiv.org/abs/1603.05253)
+DoFixing        = no      # do mode fixing à la Angulo&Pontzen (https://arxiv.org/abs/1603.05253)
 DoInversion     = no       # invert phases (for paired simulations)

 ParticleLoad    = sc       # particle load, can be 'sc' (1x), 'bcc' (2x) or 'fcc' (4x) 
@ -89,15 +89,16 @@ ztarget         = 2.5             # target redshift for CLASS module, output at
 ##> NGenIC compatible random number generator module compatible with V. Springel's original code
 ## (https://www.h-its.org/2014/11/05/ngenic-code/) as well as the 2LPT code by Pueblas&Scoccmiarro
 ## (https://cosmo.nyu.edu/roman/2LPT/)
-generator      = NGENIC
-seed           = 12345
+# generator      = NGENIC
+# seed           = 12345

 ##> The PANPHASIA generator uses a plugin based on original code by A. Jenkins
 ## Warning: Before using this module, please make sure you read and agree to the distinct license
 ## requirements by registering on the website http://icc.dur.ac.uk/Panphasia.php

-# generator      = PANPHASIA
-# descriptor     = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL]
+generator      = PANPHASIA_HO
+#descriptor     = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL]
+descriptor     = [Panph6,L20,(424060,82570,148256),S1,CH-999,Auriga_100_vol2]
 # PanphasiaMinRootResolution = 512 # requires the white noise reallisation to be made at least at that resolution (default is 512)

 ##> The MUSIC1 multi-scale random number generator is provided for convenience
--- a/external/panphasia_ho/LICENSE
+++ b/external/panphasia_ho/LICENSE
@ -0,0 +1,35 @@
+The code in this subdirectory is part of Adrian Jenkins' PANPHASIA packet,
+obtained from here http://icc.dur.ac.uk/Panphasia.php
+
+PANPHASIA is not published under the GPL but has its own proprietary license,
+make sure to visit the website before using the PANPHASIA functionality of
+MUSIC2 and register your name.
+
+We reproduce the licensing requirements for PANPHASIA from the above website 
+as retrieved on 2020/08/23:
+
+We make our software available for free but with a licence that includes the 
+condition that users make sure the phases of any new simulation volumes set up 
+using Panphasia are published.
+
+We are happy to collaborate with others on improving the software and providing 
+support for languages other than fortran. Contact: A.R.Jenkins@durham.ac.uk
+
+LICENCE:
+
+You are licensed to use this software free of charge on condition that:
+
+- you will publish the phase descriptors and reference Jenkins (13) for any new 
+  simulations that use Panphasia phases. You will pass on this condition to others 
+  for any software or data you make available publically or privately that makes 
+  use of Panphasia.
+- that you will ensure any publications using results derived from Panphasia will 
+  be submitted as a final version to arXiv prior to or coincident with publication 
+  in a journal.
+- that you report any bugs in this software as soon as confirmed to 
+  A.R.Jenkins@durham.ac.uk
+- that you understand that the software comes with no warranty and that is your 
+  responsibility to ensure that it is suitable for the purpose that you intend.
+- that you agree to having your name and email address stored for an indefinite 
+  period in the future electronically in a database as a record that you agreed 
+  the licence conditions.
--- a/external/panphasia_ho/PAN_FFTW3.h
+++ b/external/panphasia_ho/PAN_FFTW3.h
@ -0,0 +1,61 @@
+// Define macros for FFTW3 to allow swapping 
+// between single/double precision FTs
+
+// include CMake controlled configuration settings
+#pragma once
+#include "cmake_config.hh"
+
+#if defined(USE_PRECISION_DOUBLE)
+#define FOURIER_DOUBLE
+#endif
+
+#if defined(USE_PRECISION_LONGDOUBLE)
+#error "PANPHASIA-high-order does not currently support long double precision"
+#endif
+
+
+#ifdef FOURIER_DOUBLE
+  #define FFTW_REAL double
+  #define FFTW_PLAN                  fftw_plan
+  #define FFTW_DESTROY_PLAN          fftw_destroy_plan
+  #define FFTW_COMPLEX               fftw_complex
+  #define FFTW_MALLOC                fftw_malloc
+  #define FFTW_PLAN_DFT_1D           fftw_plan_dft_1d
+  #define FFTW_PLAN_dft_3D           fftw_plan_dft_3d
+  #define FFTW_EXECUTE               fftw_execute
+  #define FFTW_DESTROY_PLAN          fftw_destroy_plan
+  #define FFTW_FREE                  fftw_free
+  #define FFTW_ALLOC_COMPLEX         fftw_alloc_complex
+  #define FFTW_MPI_LOCAL_SIZE_MANY   fftw_mpi_local_size_many
+  #define FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED fftw_mpi_local_size_many_transposed
+  #define FFTW_MPI_PLAN_MANY_TRANSPOSE  fftw_mpi_plan_many_transpose
+  #define FFTW_MPI_EXECUTE_R2R       fftw_mpi_execute_r2r
+  #define FFTW_PLAN_MANY_DFT         fftw_plan_many_dft
+  #define FFTW_MPI_LOCAL_SIZE_3D     fftw_mpi_local_size_3d
+  #define FFTW_MPI_PLAN_MANY_DTF     fftw_mpi_plan_many_dft
+  #define FFTW_MPI_PLAN_MANY_DTF_R2C fftw_mpi_plan_many_dft_r2c
+  #define FFTW_MPI_EXECUTE_DFT       fftw_mpi_execute_dft
+  #define FFTW_MPI_EXECUTE_DFT_R2C   fftw_mpi_execute_dft_r2c
+#else
+  #define FFTW_REAL float
+  #define FFTW_PLAN                  fftwf_plan
+  #define FFTW_DESTROY_PLAN          fftwf_destroy_plan
+  #define FFTW_COMPLEX               fftwf_complex
+  #define FFTW_MALLOC                fftwf_malloc
+  #define FFTW_PLAN_DFT_1D           fftwf_plan_dft_1d
+  #define FFTW_PLAN_dft_3D           fftwf_plan_dft_3d
+  #define FFTW_EXECUTE               fftwf_execute
+  #define FFTW_DESTROY_PLAN          fftwf_destroy_plan
+  #define FFTW_FREE                  fftwf_free
+  #define FFTW_ALLOC_COMPLEX         fftwf_alloc_complex
+  #define FFTW_MPI_LOCAL_SIZE_MANY   fftwf_mpi_local_size_many
+  #define FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED fftwf_mpi_local_size_many_transposed
+  #define FFTW_MPI_PLAN_MANY_TRANSPOSE  fftwf_mpi_plan_many_transpose
+  #define FFTW_MPI_EXECUTE_R2R       fftwf_mpi_execute_r2r
+  #define FFTW_PLAN_MANY_DFT         fftwf_plan_many_dft
+  #define FFTW_MPI_LOCAL_SIZE_3D     fftwf_mpi_local_size_3d
+  #define FFTW_MPI_PLAN_MANY_DTF     fftwf_mpi_plan_many_dft
+  #define FFTW_MPI_PLAN_MANY_DTF_R2C fftwf_mpi_plan_many_dft_r2c
+  #define FFTW_MPI_EXECUTE_DFT       fftwf_mpi_execute_dft
+  #define FFTW_MPI_EXECUTE_DFT_R2C   fftwf_mpi_execute_dft_r2c
+#endif
--- a/external/panphasia_ho/README
+++ b/external/panphasia_ho/README
@ -0,0 +1,143 @@
+
+
+modules on COSMA7 
+
+intel_comp/2018    fftw/3.3.9cosma7               
+intel_mpi/2018     gsl/2.5            
+
+
+
+
+
+
+
+
+
+ The code calls a function to generate the k-space modes for
+a portion of the Panphasia field given an input descriptor.
+
+
+
+
+ Should be called early before significant memory is allocated.  It
+uses quite a bit of memory itself, but tidies up afterwards.
+
+Has OpenMP -     -DUSE_OPENMP     in the makefile
+
+
+
+ The routines support both single and double precision
+calculations in two senses.  
+
+The Fourier computations can be single or double precision
+
+MACROs    FFTW_REAL/FFTW_COMPLEX  used to define 'Fourier' precision types
+                                                    float or double.
+
+The Panphasia coefficients can be single or double precision
+
+MACROs   PAN_REAL/PAN_COMPLEX define the Panphasia precision - either
+                                                    float or double.
+
+ 
+To change the Fourier precision edit PAN_FFTW3.h - by default
+single precision unless 'FOURIER_DOUBLE' is defined.
+
+To change the Panphasia precision edit panphasia_functions.h and
+single precision unless 'PAN_DOUBLE_PRECISION' is defined.
+
+
+
+
+
+
+
+
+ Code description
+-------------------
+
+makefile
+
+CODE
+----
+
+
+main.c    - demo program only
+
+pan_mpi_routines.c   - contains MPI calls
+
+
+
+high_order_panphasia_routines.c    -  serial - contains some OpenMP
+uniform_rand_threefry4x64.c        -  serial - random generator and tests
+
+Include files
+--------------
+
+panphasia_functions.h    
+
+PAN_FFTW3.h           - MACROS for single/double precision FTs
+
+pan_matrices_order6.h   - matrix coefficients for 6th order scheme.
+
+
+threefry.h        -  Random generator 
+array.h  + features array .h files 
+
+
+
+
+Development notes:
+
+----------------------------------------------------
+
+14th April 2021
+
+
+
+   Found a bug in the OpenMP version.  Different numbers
+of threads led to a subset of Fourier modes having
+different values. The precise differences changed
+each time the code was run.
+
+ Debugged by turning of OpenMP section by section.
+The section which uses the spherical bessel functions
+turned out to be responsible.
+
+ The faulty version collapsed for 4 loops over
+multipole,x,y,z.   Changing this to a loop
+over multipoles, and collapsing 3 coordinate
+loops solved the problem.
+
+The variable index1 of the return field
+does not depend on the multipole, while
+index2 does.  Both index1 and index2 are
+private.  This means the return array
+(index 1) is updated several times.
+Presumably as these updated occur
+in parallel with the 4 loop collapsed
+version the return array was being
+corrupted sometimes.
+
+
+15th April
+------------
+
+  This version supercedes version given to 
+Oliver to add to MonofonIC clone.
+
+ Main difference is additional OpenMP
+statements and the ability to specify
+in the descriptor that modes less
+than of equal to some dimensionless
+integer wavenumber squared are set
+to the mean power.
+
+ Tested output on 1 core - with/without
+OpenMP.  Not tested with more than
+1 MPI rank.
+
+
+
+
+
--- a/external/panphasia_ho/array.h
+++ b/external/panphasia_ho/array.h
@ -0,0 +1,326 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _r123array_dot_h__
+#define _r123array_dot_h__
+#include "features/compilerfeatures.h"
+#include "features/sse.h"
+
+#ifndef __cplusplus
+#define CXXMETHODS(_N, W, T)
+#define CXXOVERLOADS(_N, W, T)
+#else
+
+#include <stddef.h>
+#include <algorithm>
+#include <stdexcept>
+#include <iterator>
+#include <limits>
+#include <iostream>
+
+/** @defgroup arrayNxW The r123arrayNxW classes 
+
+    Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers.
+    It is functionally equivalent to the C++0x std::array<N, uintW_t>,
+    but does not require C++0x features or libraries.
+
+    In addition to meeting most of the requirements of a Container,
+    it also has a member function, incr(), which increments the zero-th
+    element and carrys overflows into higher indexed elements.  Thus,
+    by using incr(), sequences of up to 2^(N*W) distinct values
+    can be produced. 
+
+    If SSE is supported by the compiler, then the class
+    r123array1xm128i is also defined, in which the data member is an
+    array of one r123128i object.
+
+    @cond HIDDEN_FROM_DOXYGEN
+*/
+
+template <typename value_type>
+inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
+    value_type v=0;
+    for(size_t i=0; i<(3+sizeof(value_type))/4; ++i)
+        v |= ((value_type)(*p32++)) << (32*i);
+    return v;
+}
+
+// Work-alike methods and typedefs modeled on std::array:
+#define CXXMETHODS(_N, W, T)                                            \
+    typedef T value_type;                                               \
+    typedef T* iterator;                                                \
+    typedef const T* const_iterator;                                    \
+    typedef value_type& reference;                                      \
+    typedef const value_type& const_reference;                          \
+    typedef size_t size_type;                                           \
+    typedef ptrdiff_t difference_type;                                  \
+    typedef T* pointer;                                                 \
+    typedef const T* const_pointer;                                     \
+    typedef std::reverse_iterator<iterator> reverse_iterator;           \
+    typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \
+    /* Boost.array has static_size.  C++11 specializes tuple_size */    \
+    enum {static_size = _N};                                            \
+    R123_CUDA_DEVICE reference operator[](size_type i){return v[i];}                     \
+    R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];}        \
+    R123_CUDA_DEVICE reference at(size_type i){ if(i >=  _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
+    R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >=  _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
+    R123_CUDA_DEVICE size_type size() const { return  _N; }                              \
+    R123_CUDA_DEVICE size_type max_size() const { return _N; }                           \
+    R123_CUDA_DEVICE bool empty() const { return _N==0; };                               \
+    R123_CUDA_DEVICE iterator begin() { return &v[0]; }                                  \
+    R123_CUDA_DEVICE iterator end() { return &v[_N]; }                                   \
+    R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; }                      \
+    R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; }                       \
+    R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; }                     \
+    R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; }                      \
+    R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); }        \
+    R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \
+    R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); }        \
+    R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \
+    R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \
+    R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \
+    R123_CUDA_DEVICE pointer data(){ return &v[0]; }                                     \
+    R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; }                         \
+    R123_CUDA_DEVICE reference front(){ return v[0]; }                                   \
+    R123_CUDA_DEVICE const_reference front() const{ return v[0]; }                       \
+    R123_CUDA_DEVICE reference back(){ return v[_N-1]; }                                 \
+    R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; }                     \
+    R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \
+	/* CUDA3 does not have std::equal */ \
+	for (size_t i = 0; i < _N; ++i) \
+	    if (v[i] != rhs.v[i]) return false; \
+	return true; \
+    } \
+    R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \
+    /* CUDA3 does not have std::fill_n */ \
+    R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \
+    R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \
+	/* CUDA3 does not have std::swap_ranges */ \
+	for (size_t i = 0; i < _N; ++i) { \
+	    T tmp = v[i]; \
+	    v[i] = rhs.v[i]; \
+	    rhs.v[i] = tmp; \
+	} \
+    } \
+    R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){                         \
+        /* This test is tricky because we're trying to avoid spurious   \
+           complaints about illegal shifts, yet still be compile-time   \
+           evaulated. */                                                \
+        if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \
+            return incr_carefully(n);                                   \
+        if(n==1){                                                       \
+            ++v[0];                                                     \
+            if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this;   \
+        }else{                                                          \
+            v[0] += n;                                                  \
+            if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this;  \
+        }                                                               \
+        /* We expect that the N==?? tests will be                       \
+           constant-folded/optimized away by the compiler, so only the  \
+           overflow tests (!!v[i]) remain to be done at runtime.  For  \
+           small values of N, it would be better to do this as an       \
+           uncondtional sequence of adc.  An experiment/optimization    \
+           for another day...                                           \
+           N.B.  The weird subscripting: v[_N>3?3:0] is to silence      \
+           a spurious error from icpc                                   \
+           */                                                           \
+        ++v[_N>1?1:0];                                                  \
+        if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \
+        ++v[_N>2?2:0];                                                  \
+        if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this;  \
+        ++v[_N>3?3:0];                                                  \
+        for(size_t i=4; i<_N; ++i){                                     \
+            if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this;        \
+            ++v[i];                                                     \
+        }                                                               \
+        return *this;                                                   \
+    }                                                                   \
+    /* seed(SeedSeq) would be a constructor if having a constructor */  \
+    /* didn't cause headaches with defaults */                          \
+    template <typename SeedSeq>                                         \
+    R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){      \
+        r123array##_N##x##W ret;                                        \
+        const size_t Ngen = _N*((3+sizeof(value_type))/4);              \
+        uint32_t u32[Ngen];                                             \
+        uint32_t *p32 = &u32[0];                                        \
+        ss.generate(&u32[0], &u32[Ngen]);                               \
+        for(size_t i=0; i<_N; ++i){                                     \
+            ret.v[i] = assemble_from_u32<value_type>(p32);              \
+            p32 += (3+sizeof(value_type))/4;                            \
+        }                                                               \
+        return ret;                                                     \
+    }                                                                   \
+protected:                                                              \
+    R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \
+        /* n may be greater than the maximum value of a single value_type */ \
+        value_type vtn;                                                 \
+        vtn = n;                                                        \
+        v[0] += n;                                                      \
+        const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \
+        for(size_t i=1; i<_N; ++i){                                     \
+            if(rshift){                                                 \
+                n >>= rshift;                                           \
+            }else{                                                      \
+                n=0;                                                    \
+            }                                                           \
+            if( v[i-1] < vtn )                                          \
+                ++n;                                                    \
+            if( n==0 ) break;                                           \
+            vtn = n;                                                    \
+            v[i] += n;                                                  \
+        }                                                               \
+        return *this;                                                   \
+    }                                                                   \
+    
+                                                                        
+// There are several tricky considerations for the insertion and extraction
+// operators:
+// - we would like to be able to print r123array16x8 as a sequence of 16 integers,
+//   not as 16 bytes.
+// - we would like to be able to print r123array1xm128i.
+// - we do not want an int conversion operator in r123m128i because it causes
+//   lots of ambiguity problems with automatic promotions.
+// Solution: r123arrayinsertable and r123arrayextractable
+
+template<typename T>
+struct r123arrayinsertable{
+    const T& v;
+    r123arrayinsertable(const T& t_) : v(t_) {} 
+    friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<T>& t){
+        return os << t.v;
+    }
+};
+
+template<>
+struct r123arrayinsertable<uint8_t>{
+    const uint8_t& v;
+    r123arrayinsertable(const uint8_t& t_) : v(t_) {} 
+    friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<uint8_t>& t){
+        return os << (int)t.v;
+    }
+};
+
+template<typename T>
+struct r123arrayextractable{
+    T& v;
+    r123arrayextractable(T& t_) : v(t_) {}
+    friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){
+        return is >> t.v;
+    }
+};
+
+template<>
+struct r123arrayextractable<uint8_t>{
+    uint8_t& v;
+    r123arrayextractable(uint8_t& t_) : v(t_) {} 
+    friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){
+        int i;
+        is >>  i;
+        t.v = i;
+        return is;
+    }
+};
+
+#define CXXOVERLOADS(_N, W, T)                                          \
+                                                                        \
+inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){   \
+    os << r123arrayinsertable<T>(a.v[0]);                                  \
+    for(size_t i=1; i<_N; ++i)                                          \
+        os << " " << r123arrayinsertable<T>(a.v[i]);                       \
+    return os;                                                          \
+}                                                                       \
+                                                                        \
+inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){         \
+    for(size_t i=0; i<_N; ++i){                                         \
+        r123arrayextractable<T> x(a.v[i]);                                 \
+        is >> x;                                                        \
+    }                                                                   \
+    return is;                                                          \
+}                                                                       \
+                                                                        \
+namespace r123{                                                        \
+ typedef r123array##_N##x##W Array##_N##x##W;                          \
+}
+                                                                        
+#endif /* __cplusplus */
+
+/* _r123array_tpl expands to a declaration of struct r123arrayNxW.  
+
+   In C, it's nothing more than a struct containing an array of N
+   objects of type T.
+
+   In C++ it's the same, but endowed with an assortment of member
+   functions, typedefs and friends.  In C++, r123arrayNxW looks a lot
+   like std::array<T,N>, has most of the capabilities of a container,
+   and satisfies the requirements outlined in compat/Engine.hpp for
+   counter and key types.  ArrayNxW, in the r123 namespace is
+   a typedef equivalent to r123arrayNxW.
+*/
+
+#define _r123array_tpl(_N, W, T)                   \
+    /** @ingroup arrayNxW */                        \
+    /** @see arrayNxW */                            \
+struct r123array##_N##x##W{                         \
+ T v[_N];                                       \
+ CXXMETHODS(_N, W, T)                           \
+};                                              \
+                                                \
+CXXOVERLOADS(_N, W, T)
+
+/** @endcond */
+
+_r123array_tpl(1, 32, uint32_t)  /* r123array1x32 */
+_r123array_tpl(2, 32, uint32_t)  /* r123array2x32 */
+_r123array_tpl(4, 32, uint32_t)  /* r123array4x32 */
+_r123array_tpl(8, 32, uint32_t)  /* r123array8x32 */
+
+_r123array_tpl(1, 64, uint64_t)  /* r123array1x64 */
+_r123array_tpl(2, 64, uint64_t)  /* r123array2x64 */
+_r123array_tpl(4, 64, uint64_t)  /* r123array4x64 */
+
+_r123array_tpl(16, 8, uint8_t)  /* r123array16x8 for ARSsw, AESsw */
+
+#if R123_USE_SSE
+_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */
+#endif
+
+/* In C++, it's natural to use sizeof(a::value_type), but in C it's
+   pretty convoluted to figure out the width of the value_type of an
+   r123arrayNxW:
+*/
+#define R123_W(a)   (8*sizeof(((a *)0)->v[0]))
+
+/** @namespace r123
+  Most of the Random123 C++ API is contained in the r123 namespace. 
+*/
+
+#endif
+
--- a/external/panphasia_ho/c7_script_threaded
+++ b/external/panphasia_ho/c7_script_threaded
@ -0,0 +1,33 @@
+#!/bin/bash -l
+
+#SBATCH --ntasks 5
+#SBATCH -J Test_MPI_FFTW
+#SBATCH -o standard_output_file.%J.out
+#SBATCH -e standard_error_file.%J.err
+#SBATCH -p cosma7
+#SBATCH -A dp004
+#SBATCH --exclusive
+#SBATCH -t 00:05:00
+#SBATCH --mail-type=END    # notifications for job
+#SBATCH --mail-user=a.r.jenkins@durham.ac.uk
+
+module purge
+module load intel_comp/2018 intel_mpi/2018 fftw/3.3.9cosma7 gsl/2.5 hdf5/1.8.20
+
+
+# Run the program
+
+
+
+mpirun -l -env I_MPI_PIN=1 -env I_MPI_PIN_PROCESSOR_LIST=allcores -n $SLURM_NTASKS ./pan_fftw3_test_code.x  
+
+
+
+
+
+
+
+
+
+
+
--- a/external/panphasia_ho/features/clangfeatures.h
+++ b/external/panphasia_ho/features/clangfeatures.h
@ -0,0 +1,93 @@
+/*
+Copyright 2010-2016, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __clangfeatures_dot_hpp
+#define __clangfeatures_dot_hpp
+
+#ifndef R123_USE_X86INTRIN_H
+#if (defined(__x86_64__)||defined(__i386__))
+#define R123_USE_X86INTRIN_H 1
+#else
+#define R123_USE_X86INTRIN_H 0
+#endif
+#endif
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions)
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert)
+#endif
+
+// With clang-3.6, -Wall warns about unused-local-typedefs.
+// The "obvious" thing to do is to ignore -Wunused-local-typedefs,
+// but that doesn't work because earlier versions of clang blow
+// up on an 'unknown warning group'.  So we briefly ignore -Wall...
+// It's tempting to just give up on static assertions in pre-c++11 code.
+#if !R123_USE_CXX11_STATIC_ASSERT && !defined(R123_STATIC_ASSERT)
+#define R123_STATIC_ASSERT(expr, msg) \
+_Pragma("clang diagnostic push")                      \
+_Pragma("clang diagnostic ignored \"-Wall\"")     \
+typedef char static_assertion[(!!(expr))*2-1] \
+_Pragma("clang diagnostic pop")
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr)
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions)
+#endif
+
+// With clang-3.0, the apparently simpler:
+//  #define R123_USE_CXX11_RANDOM __has_include(<random>)
+// dumps core.
+#ifndef R123_USE_CXX11_RANDOM
+#if __cplusplus>=201103L && __has_include(<random>)
+#define R123_USE_CXX11_RANDOM 1
+#else
+#define R123_USE_CXX11_RANDOM 0
+#endif
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#if __cplusplus>=201103L && __has_include(<type_traits>)
+#define R123_USE_CXX11_TYPE_TRAITS 1
+#else
+#define R123_USE_CXX11_TYPE_TRAITS 0
+#endif
+#endif
+
+#include "gccfeatures.h"
+
+#endif
--- a/external/panphasia_ho/features/compilerfeatures.h
+++ b/external/panphasia_ho/features/compilerfeatures.h
@ -0,0 +1,343 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/**
+
+@page porting Preprocessor symbols for porting Random123 to different platforms.
+
+The Random123 library is portable across C, C++, CUDA, OpenCL environments,
+and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris).
+This level of portability requires the abstraction of some features
+and idioms that are either not standardized (e.g., asm statments), or for which 
+different vendors have their own standards (e.g., SSE intrinsics) or for
+which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>).
+
+Random123/features/compilerfeatures.h
+conditionally includes a compiler-or-OS-specific Random123/featires/XXXfeatures.h file which
+defines appropriate values for the preprocessor symbols which can be used with
+a specific compiler or OS.  Those symbols will then
+be used by other header files and source files in the Random123
+library (and may be used by applications) to control what actually
+gets presented to the compiler.
+
+Most of the symbols are boolean valued.  In general, they will
+\b always be defined with value either 1 or 0, so do
+\b NOT use \#ifdef.  Use \#if R123_USE_SOMETHING instead.
+
+Library users can override any value by defining the pp-symbol with a compiler option,
+e.g.,
+
+    cc -DR123_USE_MULHILO64_C99 
+
+will use a strictly c99 version of the full-width 64x64->128-bit multiplication
+function, even if it would be disabled by default.
+
+All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_
+@verbatim
+         AES_NI
+         AES_OPENSSL
+         SSE4_2
+         SSE4_1
+         SSE
+
+         STD_RANDOM
+
+         GNU_UINT128
+         ASM_GNU
+         ASM_MSASM
+
+         CPUID_MSVC
+
+         CXX11_RANDOM
+         CXX11_TYPE_TRAITS
+         CXX11_STATIC_ASSERT
+         CXX11_CONSTEXPR
+         CXX11_UNRESTRICTED_UNIONS
+         CXX11_EXPLICIT_CONVERSIONS
+         CXX11_LONG_LONG
+         CXX11_STD_ARRAY
+         CXX11 
+   
+         X86INTRIN_H
+         IA32INTRIN_H
+         XMMINTRIN_H
+         EMMINTRIN_H
+         SMMINTRIN_H
+         WMMINTRIN_H
+         INTRIN_H
+
+         MULHILO32_ASM
+         MULHILO64_ASM
+         MULHILO64_MSVC_INTRIN
+         MULHILO64_CUDA_INTRIN
+         MULHILO64_OPENCL_INTRIN
+         MULHILO64_C99
+
+         U01_DOUBLE
+	 
+@endverbatim
+Most have obvious meanings.  Some non-obvious ones:
+
+AES_NI and AES_OPENSSL are not mutually exclusive.  You can have one,
+both or neither.
+
+GNU_UINT128 says that it's safe to use __uint128_t, but it
+does not require its use.  In particular, it should be
+used in mulhilo<uint64_t> only if MULHILO64_ASM is unset.
+
+If the XXXINTRIN_H macros are true, then one should
+@code
+#include <xxxintrin.h>
+@endcode
+to gain accesss to compiler intrinsics.
+
+The CXX11_SOME_FEATURE macros allow the code to use specific
+features of the C++11 language and library.  The catchall
+In the absence of a specific CXX11_SOME_FEATURE, the feature
+is controlled by the catch-all R123_USE_CXX11 macro.
+
+U01_DOUBLE defaults on, and can be turned off (set to 0)
+if one does not want the utility functions that convert to double
+(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension.
+
+There are a number of invariants that are always true.  Application code may
+choose to rely on these:
+
+<ul>
+<li>ASM_GNU and ASM_MASM are mutually exclusive
+<li>The "higher" SSE values imply the lower ones.
+</ul>
+
+There are also non-boolean valued symbols:
+
+<ul>
+<li>R123_STATIC_INLINE -
+  According to both C99 and GNU99, the 'static inline' declaration allows
+  the compiler to not emit code if the function is not used.  
+  Note that the semantics of 'inline', 'static' and 'extern' in
+  gcc have changed over time and are subject to modification by
+  command line options, e.g., -std=gnu89, -fgnu-inline.
+  Nevertheless, it appears that the meaning of 'static inline' 
+  has not changed over time and (with a little luck) the use of 'static inline'
+  here will be portable between versions of gcc and to other C99
+  compilers.
+  See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html
+       http://www.greenend.org.uk/rjk/2003/03/inline.html
+
+<li>R123_FORCE_INLINE(decl) -
+  which expands to 'decl', adorned with the compiler-specific
+  embellishments to strongly encourage that the declared function be
+  inlined.  If there is no such compiler-specific magic, it should
+  expand to decl, unadorned.
+   
+<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with
+  sufficiently similar semantics) when CUDA is in use, and expands
+  to nothing in other cases.
+
+<li>R123_METAL_THREAD_ADDRESS_SPACE - which expands to 'thread' (or
+  something else with sufficiently similar semantics) when compiling a
+  Metal kernel, and expands to nothing in other cases.
+
+<li>R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at
+  all if we're in an environment so feature-poor that you can't even
+  call assert (I'm looking at you, CUDA and OpenCL), or even include
+  assert.h safely (OpenCL).
+
+<li>R123_STATIC_ASSERT(expr,msg) - which expands to
+  static_assert(expr,msg), or to an expression that
+  will raise a compile-time exception if expr is not true.
+
+<li>R123_ULONG_LONG - which expands to a declaration of the longest available
+  unsigned integer.
+
+<li>R123_64BIT(x) - expands to something equivalent to
+  UINT64_C(x) from <stdint.h>, even in environments where <stdint.h>
+  is not available, e.g., MSVC and OpenCL.
+
+<li>R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with
+  the semantics of gcc's __builtin_expect(expr,likely_value).  If
+  the environment has nothing like __builtin_expect, it should expand
+  to just expr.
+</ul>
+
+
+\cond HIDDEN_FROM_DOXYGEN
+*/
+
+/* 
+N.B.  When something is added to the list of features, it should be
+added to each of the *features.h files, AND to examples/ut_features.cpp.
+*/
+
+/* N.B.  most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */
+#if defined(__METAL_MACOS__)
+#include "metalfeatures.h"
+#elif defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
+#include "openclfeatures.h"
+#elif defined(__CUDACC__)
+#include "nvccfeatures.h"
+#elif defined(__ICC)
+#include "iccfeatures.h"
+#elif defined(__xlC__) || defined(__ibmxl__)
+#include "xlcfeatures.h"
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#include "sunprofeatures.h"
+#elif defined(__OPEN64__)
+#include "open64features.h"
+#elif defined(__clang__)
+#include "clangfeatures.h"
+#elif defined(__GNUC__)
+#include "gccfeatures.h"
+#elif defined(__PGI)
+#include "pgccfeatures.h"
+#elif defined(_MSC_FULL_VER)
+#include "msvcfeatures.h"
+#else
+#error "Can't identify compiler.  You'll need to add a new xxfeatures.hpp"
+{ /* maybe an unbalanced brace will terminate the compilation */
+#endif
+
+#ifndef R123_USE_CXX11
+#define R123_USE_CXX11 (__cplusplus >= 201103L)
+#endif
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_RANDOM
+#define R123_USE_CXX11_RANDOM R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_LONG_LONG
+#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_STD_ARRAY
+#define R123_USE_CXX11_STD_ARRAY R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_MULHILO64_C99
+#define R123_USE_MULHILO64_C99 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_STATIC_ASSERT
+#if R123_USE_CXX11_STATIC_ASSERT
+#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg)
+#else
+    /* if msg always_looked_like_this, we could paste it into the name.  Worth it? */
+#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1]
+#endif
+#endif
+
+#ifndef R123_CONSTEXPR
+#if R123_USE_CXX11_CONSTEXPR
+#define R123_CONSTEXPR constexpr
+#else
+#define R123_CONSTEXPR
+#endif
+#endif
+
+#ifndef R123_USE_64BIT
+#define R123_USE_64BIT 1
+#endif    
+
+#ifndef R123_USE_PHILOX_64BIT
+#define R123_USE_PHILOX_64BIT (R123_USE_64BIT && (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN))
+#endif
+
+#ifndef R123_ULONG_LONG
+#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG
+/* C++98 doesn't have long long.  It doesn't have uint64_t either, but
+   we will have typedef'ed uint64_t to something in the xxxfeatures.h.
+   With luck, it won't elicit complaints from -pedantic.  Cross your
+   fingers... */
+#define R123_ULONG_LONG uint64_t
+#else
+#define R123_ULONG_LONG unsigned long long
+#endif
+#endif
+
+/* UINT64_C should have been #defined by XXXfeatures.h, either by
+   #include <stdint.h> or through compiler-dependent hacks */
+#ifndef R123_64BIT
+#define R123_64BIT(x) UINT64_C(x)
+#endif
+
+#ifndef R123_THROW
+#define R123_THROW(x)    throw (x)
+#endif
+
+#ifndef R123_METAL_THREAD_ADDRESS_SPACE
+#define R123_METAL_THREAD_ADDRESS_SPACE
+#endif
+
+#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
+#define R123_METAL_CONSTANT_ADDRESS_SPACE
+#endif
+    
+/*
+ * Windows.h (and perhaps other "well-meaning" code define min and
+ * max, so there's a high chance that our definition of min, max
+ * methods or use of std::numeric_limits min and max will cause
+ * complaints in any program that happened to include Windows.h or
+ * suchlike first.  We use the null macro below in our own header
+ * files definition or use of min, max to defensively preclude
+ * this problem.  It may not be enough; one might need to #define
+ * NOMINMAX before including Windows.h or compile with -DNOMINMAX.
+ */
+#define R123_NO_MACRO_SUBST
+
+/** \endcond */
--- a/external/panphasia_ho/features/gccfeatures.h
+++ b/external/panphasia_ho/features/gccfeatures.h
@ -0,0 +1,263 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __gccfeatures_dot_hpp
+#define __gccfeatures_dot_hpp
+
+#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+
+#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390x__)
+#  error "This code has only been tested on x86, powerpc and a few arm platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifdef __powerpc__
+#include <ppu_intrinsics.h>
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static __inline__
+#endif
+
+#ifndef R123_FORCE_INLINE
+#if R123_GNUC_VERSION >= 40000
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#else
+#define R123_FORCE_INLINE(decl) decl
+#endif
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+/* According to the C++0x standard, we should be able to test the numeric
+   value of __cplusplus == 199701L for C++98, __cplusplus == 201103L for C++11
+   But gcc has had an open bug  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773
+   since early 2001, which was finally fixed in 4.7 (early 2012).  For
+   earlier versions, the only way  to detect whether --std=c++0x was requested
+   on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.
+*/
+#if defined(__GCC_EXPERIMENTAL_CXX0X__)
+#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 1/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
+#else
+#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 0/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
+#endif
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_RANDOM
+#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_AES_NI
+#ifdef __AES__
+#define R123_USE_AES_NI 1
+#else
+#define R123_USE_AES_NI 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_2
+#ifdef __SSE4_2__
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#ifdef __SSE4_1__
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+/* There's no point in trying to compile SSE code in Random123
+   unless SSE2 is available. */
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+   openssl is available.  Without a pre-compilation configure-like
+   tool, it's less error-prone to guess that it isn't available.  Add
+   -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+   play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#if defined(__x86_64__) || defined(__aarch64__)
+#define R123_USE_GNU_UINT128 1
+#else
+#define R123_USE_GNU_UINT128 0
+#endif
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#if (defined(__x86_64__)||defined(__i386__))
+#define R123_USE_ASM_GNU 1
+#else
+#define R123_USE_ASM_GNU 1
+#endif    
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#if (defined(__x86_64__)||defined(__i386__))
+#define R123_USE_X86INTRIN_H (1/* (defined(__x86_64__)||defined(__i386__)) */  && R123_GNUC_VERSION >= 40402)
+#else
+#define R123_USE_X86INTRIN_H (0/* (defined(__x86_64__)||defined(__i386__)) */  && R123_GNUC_VERSION >= 40402)
+#endif
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have 
+   emmintrin.h in the include search path.  This is
+   so broken that I refuse to try to work around it.  If this
+   affects you, figure out where your emmintrin.h lives and
+   add an appropriate -I to your CPPFLAGS.  Or add -DR123_USE_SSE=0. */
+#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402))
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402))
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#if (defined(__powerpc64__))
+#define R123_USE_MULHILO64_MULHI_INTRIN 1
+#else
+#define R123_USE_MULHILO64_MULHI_INTRIN 0
+#endif
+#endif
+
+#ifndef R123_MULHILO64_MULHI_INTRIN
+#define R123_MULHILO64_MULHI_INTRIN __mulhdu
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_MULHILO32_MULHI_INTRIN
+#define R123_MULHILO32_MULHI_INTRIN __mulhwu
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+   and in ../ut_features.cpp */
+#endif
--- a/external/panphasia_ho/features/iccfeatures.h
+++ b/external/panphasia_ho/features/iccfeatures.h
@ -0,0 +1,212 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __icpcfeatures_dot_hpp
+#define __icpcfeatures_dot_hpp
+
+// icc relies on gcc libraries and other toolchain components.
+#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+
+#if !defined(__x86_64__) && !defined(__i386__)
+#  error "This code has only been tested on x86 platforms."
+{ // maybe an unbalanced brace will terminate the compilation
+// You are invited to try Easy123 on other architectures, by changing
+// the conditions that reach this error, but you should consider it a
+// porting exercise and expect to encounter bugs and deficiencies.
+// Please let the authors know of any successes (or failures).
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_SSE4_2
+#ifdef __SSE4_2__
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#ifdef __SSE4_1__
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_NI
+// Unlike gcc, icc (version 12) does not pre-define an __AES__
+// pp-symbol when -maes or -xHost is on the command line.  This feels
+// like a defect in icc (it defines __SSE4_2__ in analogous
+// circumstances), but until Intel fixes it, we're better off erring
+// on the side of caution and not generating instructions that are
+// going to raise SIGILL when executed.  To get the AES-NI
+// instructions with icc, the caller must puts something like
+// -DR123_USE_AES_NI=1 or -D__AES__ on the command line.  FWIW, the
+// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from
+// 11.1 onwards.
+//
+#if defined(__AES__)
+#define R123_USE_AES_NI ((__ICC>=1101) && 1/*defined(__AES__)*/)
+#else
+#define R123_USE_AES_NI ((__ICC>=1101) && 0/*defined(__AES__)*/)
+#endif
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+   openssl is available.  Without a pre-compilation configure-like
+   tool, it's less error-prone to guess that it isn't available.  Add
+   -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+   play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 1
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
--- a/external/panphasia_ho/features/metalfeatures.h
+++ b/external/panphasia_ho/features/metalfeatures.h
@ -0,0 +1,111 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * Written by Tom Schoonjans <Tom.Schoonjans@me.com>
+ */
+
+#ifndef __metalfeatures_dot_hpp
+#define __metalfeatures_dot_hpp
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_METAL_THREAD_ADDRESS_SPACE
+#define R123_METAL_THREAD_ADDRESS_SPACE thread
+#endif
+
+#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
+#define R123_METAL_CONSTANT_ADDRESS_SPACE constant
+#endif
+
+#ifndef R123_ASSERT
+#define R123_ASSERT(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 1
+#endif
+
+#if R123_USE_MULHILO32_MULHI_INTRIN
+#include <metal_integer>
+#define R123_MULHILO32_MULHI_INTRIN metal::mulhi
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_64BIT
+#define R123_USE_64BIT 0 /* Metal currently (Feb 2019, Specification-2) does not support 64-bit variable types */
+#endif
+
+#ifndef R123_ULONG_LONG
+/* the longest integer type in Metal (Feb 2019, Specification-2) is a
+ * 32-bit unsigned int.  Let's hope for the best... */
+#define R123_ULONG_LONG unsigned int 
+#endif
+
+#endif
--- a/external/panphasia_ho/features/msvcfeatures.h
+++ b/external/panphasia_ho/features/msvcfeatures.h
@ -0,0 +1,200 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __msvcfeatures_dot_hpp
+#define __msvcfeatures_dot_hpp
+
+//#if _MSVC_FULL_VER <= 15
+//#error "We've only tested MSVC_FULL_VER==15."
+//#endif
+
+#if !defined(_M_IX86) && !defined(_M_X64)
+#  error "This code has only been tested on x86 platforms."
+{ // maybe an unbalanced brace will terminate the compilation
+// You are invited to try Random123 on other architectures, by changing
+// the conditions that reach this error, but you should consider it a
+// porting exercise and expect to encounter bugs and deficiencies.
+// Please let the authors know of any successes (or failures).
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static __inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) _forceinline decl
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_AES_NI
+#if defined(_M_X64)
+#define R123_USE_AES_NI 1
+#else
+#define R123_USE_AES_NI 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_2
+#if defined(_M_X64)
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#if defined(_M_X64)
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 1
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 0
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 1
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 1
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#if defined(_M_X64)
+#define R123_USE_MULHILO64_MSVC_INTRIN 1
+#else
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+#pragma warning(disable:4244)
+#pragma warning(disable:4996)
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
--- a/external/panphasia_ho/features/nvccfeatures.h
+++ b/external/panphasia_ho/features/nvccfeatures.h
@ -0,0 +1,125 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __r123_nvcc_features_dot_h__
+#define __r123_nvcc_features_dot_h__
+
+#if !defined(CUDART_VERSION)
+#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined"
+#endif
+
+#if CUDART_VERSION < 4010
+#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces.  Random123 isunsupported.  See comments in nvccfeatures.h"
+// This test was added in Random123-1.08 (August, 2013) because we
+// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
+// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
+// earlier.  We can't be sure this bug doesn't also affect invocations
+// of other templated functions, e.g., essentially all of Random123.
+// Thus, we no longer trust CUDA versions earlier than 4.1 even though
+// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
+// If you feel lucky or desperate, you can change #error to #warning, but
+// please take extra care to be sure that you are getting correct
+// results.
+#endif
+
+// nvcc falls through to gcc or msvc.  So first define
+// a couple of things and then include either gccfeatures.h
+// or msvcfeatures.h
+
+//#ifdef  __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled
+//for both device and host functions in CUDA by setting compiler flags
+//for the device function
+#ifdef  __CUDA_ARCH__
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE __device__
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 1
+#endif
+
+#ifndef R123_THROW
+// No exceptions in CUDA, at least upto 4.0
+#define R123_THROW(x)    R123_ASSERT(0)
+#endif
+
+#ifndef R123_ASSERT
+#define R123_ASSERT(x) if((x)) ; else asm("trap;")
+#endif
+
+#else // ! __CUDA_ARCH__
+// If we're using nvcc not compiling for the CUDA architecture,
+// then we must be compiling for the host.  In that case,
+// tell the philox code to use the mulhilo64 asm because
+// nvcc doesn't grok uint128_t.
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#endif // __CUDA_ARCH__
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_ULONG_LONG
+// uint64_t, which is what we'd get without this, is
+// not the same as unsigned long long
+#define R123_ULONG_LONG unsigned long long
+#endif
+
+#if defined(__GNUC__)
+#include "gccfeatures.h"
+#elif defined(_MSC_FULL_VER)
+#include "msvcfeatures.h"
+#endif
+
+#endif
--- a/external/panphasia_ho/features/open64features.h
+++ b/external/panphasia_ho/features/open64features.h
@ -0,0 +1,50 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __open64features_dot_hpp
+#define __open64features_dot_hpp
+
+/* The gcc features are mostly right.  We just override a few and then include gccfeatures.h */
+
+/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint
+   but produce incorrect code for 64-bit philox.  The MULHILO64_ASM
+   seems to work fine */
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#include "gccfeatures.h"
+
+#endif
--- a/external/panphasia_ho/features/openclfeatures.h
+++ b/external/panphasia_ho/features/openclfeatures.h
@ -0,0 +1,89 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __openclfeatures_dot_hpp
+#define __openclfeatures_dot_hpp
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#define R123_ASSERT(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 1
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of
+// ulong to mul_hi.  And gets lots of complaints from stdint.h
+// on some machines.
+// But these typedefs mean we cannot include stdint.h with
+// these headers?  Do we need R123_64T, R123_32T, R123_8T?
+typedef ulong uint64_t;
+typedef uint  uint32_t;
+typedef uchar uint8_t;
+#define UINT64_C(x) ((ulong)(x##UL))
+
+#endif
--- a/external/panphasia_ho/features/pgccfeatures.h
+++ b/external/panphasia_ho/features/pgccfeatures.h
@ -0,0 +1,194 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Copyright (c) 2013, Los Alamos National Security, LLC
+All rights reserved.
+
+Copyright 2013. Los Alamos National Security, LLC. This software was produced
+under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
+the U.S. Department of Energy. The U.S. Government has rights to use,
+reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS
+ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+to produce derivative works, such modified software should be clearly marked,
+so as not to confuse it with the version available from LANL.
+*/
+#ifndef __pgccfeatures_dot_hpp
+#define __pgccfeatures_dot_hpp
+
+#if !defined(__x86_64__) && !defined(__i386__)
+#  error "This code has only been tested on x86 platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+/* Found this example in PGI's emmintrin.h. */
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) (expr)
+#endif
+
+/* PGI through 13.2 doesn't appear to support AES-NI. */
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+/* PGI through 13.2 appears to support MMX, SSE, SSE3, SSE3, SSSE3, SSE4a, and
+   ABM, but not SSE4.1 or SSE4.2. */
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+/* There's no point in trying to compile SSE code in Random123
+   unless SSE2 is available. */
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+   openssl is available.  Without a pre-compilation configure-like
+   tool, it's less error-prone to guess that it isn't available.  Add
+   -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+   play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time
+   about undefined references to _mm_castsi128_ps(__m128i).  Why? */
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#ifdef __ABM__
+#define R123_USE_INTRIN_H 1
+#else
+#define R123_USE_INTRIN_H 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+   and in ../ut_features.cpp */
+#endif
--- a/external/panphasia_ho/features/sse.h
+++ b/external/panphasia_ho/features/sse.h
@ -0,0 +1,280 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _Random123_sse_dot_h__
+#define _Random123_sse_dot_h__
+
+#if R123_USE_SSE
+
+#if R123_USE_X86INTRIN_H
+#include <x86intrin.h>
+#endif
+#if R123_USE_IA32INTRIN_H
+#include <ia32intrin.h>
+#endif
+#if R123_USE_XMMINTRIN_H
+#include <xmmintrin.h>
+#endif
+#if R123_USE_EMMINTRIN_H
+#include <emmintrin.h>
+#endif
+#if R123_USE_SMMINTRIN_H
+#include <smmintrin.h>
+#endif
+#if R123_USE_WMMINTRIN_H
+#include <wmmintrin.h>
+#endif
+#if R123_USE_INTRIN_H
+#include <intrin.h>
+#endif
+#ifdef __cplusplus
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+#endif
+
+#if R123_USE_ASM_GNU
+
+/* bit25 of CX tells us whether AES is enabled. */
+R123_STATIC_INLINE int haveAESNI(){
+    unsigned int eax, ebx, ecx, edx;
+    __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
+                      "a" (1));
+    return (ecx>>25) & 1;
+}
+#elif R123_USE_CPUID_MSVC
+R123_STATIC_INLINE int haveAESNI(){
+    int CPUInfo[4];
+    __cpuid(CPUInfo, 1);
+    return (CPUInfo[2]>>25)&1;
+}
+#else /* R123_USE_CPUID_??? */
+#warning "No R123_USE_CPUID_XXX method chosen.  haveAESNI will always return false"
+R123_STATIC_INLINE int haveAESNI(){
+    return 0;
+}
+#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
+
+// There is a lot of annoying and inexplicable variation in the
+// SSE intrinsics available in different compilation environments.
+// The details seem to depend on the compiler, the version and
+// the target architecture.  Rather than insisting on
+// R123_USE_feature tests for each of these in each of the
+// compilerfeatures.h files we just keep the complexity localized
+// to here...
+#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
+/* Is there an intrinsic to assemble an __m128i from two 64-bit words? 
+   If not, use the 4x32-bit intrisic instead.  N.B.  It looks like Intel
+   added _mm_set_epi64x to icc version 12.1 in Jan 2012.
+*/
+R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
+    union{
+        uint64_t u64;
+        uint32_t u32[2];
+    } u1, u0;
+    u1.u64 = v1;
+    u0.u64 = v0;
+    return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
+}
+#endif
+/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
+   word from an __m128i.  The _mm_cvtsi128_si64 intrinsic does the job
+   on 64-bit platforms.  Unfortunately, both MSVC and Open64 fail
+   assertions in ut_M128.cpp and ut_carray.cpp when we use the
+   _mm_cvtsi128_si64 intrinsic.  (See
+   https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
+   On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
+   Finally, even if the intrinsic exists, it may be spelled with or
+   without the 'x'.
+*/
+#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+    union{
+        uint64_t u64[2];
+        __m128i m;
+    }u;
+    _mm_store_si128(&u.m, si);
+    return u.u64[0];
+}
+#elif defined(__llvm__) || defined(__ICC)
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+    return (uint64_t)_mm_cvtsi128_si64(si);
+}
+#else /* GNUC, others */
+/* FWIW, gcc's emmintrin.h has had the 'x' spelling
+   since at least gcc-3.4.4.  The no-'x' spelling showed up
+   around 4.2. */
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+    return (uint64_t)_mm_cvtsi128_si64x(si);
+}
+#endif
+#if defined(__GNUC__) && __GNUC__ < 4
+/* the cast builtins showed up in gcc4. */
+R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
+    return (__m128)si;
+}
+#endif
+
+#ifdef __cplusplus
+
+struct r123m128i{
+    __m128i m;
+#if R123_USE_CXX11_UNRESTRICTED_UNIONS
+    // C++98 forbids a union member from having *any* constructors.
+    // C++11 relaxes this, and allows union members to have constructors
+    // as long as there is a "trivial" default construtor.  So in C++11
+    // we can provide a r123m128i constructor with an __m128i argument, and still
+    // have the default (and hence trivial) default constructor.
+    r123m128i() = default;
+    r123m128i(__m128i _m): m(_m){}
+#endif
+    r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
+    r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
+#if R123_USE_CXX11_EXPLICIT_CONVERSIONS
+    // With C++11 we can attach explicit to the bool conversion operator
+    // to disambiguate undesired promotions.  For g++, this works
+    // only in 4.5 and above.
+    explicit operator bool() const {return _bool();}
+#else
+    // Pre-C++11, we have to do something else.  Google for the "safe bool"
+    // idiom for other ideas...
+    operator const void*() const{return _bool()?this:0;}
+#endif
+    operator __m128i() const {return m;}
+
+private:
+#if R123_USE_SSE4_1
+    bool _bool() const{ return !_mm_testz_si128(m,m); }
+#else
+    bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
+#endif
+};
+
+R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
+    __m128i& c = v.m;
+    __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
+    c = _mm_add_epi64(c, zeroone);
+    //return c;
+#if R123_USE_SSE4_1
+    __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
+    if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
+        __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
+        c = _mm_add_epi64(c, onezero);
+    }
+#else
+    unsigned mask  = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
+    // The low two bits of mask are 11 iff the low 64 bits of
+    // c are zero.
+    if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
+        __m128i onezero = _mm_set_epi64x(1,0);
+        c = _mm_add_epi64(c, onezero);
+    }
+#endif
+    return v;
+}
+
+R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){ 
+    __m128i c = lhs.m;
+    __m128i incr128 = _mm_set_epi64x(0, n);
+    c = _mm_add_epi64(c, incr128);
+    // return c;     // NO CARRY!  
+
+    int64_t lo64 = _mm_extract_lo64(c);
+    if((uint64_t)lo64 < n)
+        c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
+    lhs.m = c;
+    return lhs; 
+}
+
+// We need this one because it's present, but never used in r123array1xm128i::incr
+R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
+    throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
+
+// The comparisons aren't implemented, but if we leave them out, and 
+// somebody writes, e.g., M1 < M2, the compiler will do an implicit
+// conversion through void*.  Sigh...
+R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
+    throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
+    throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
+    throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
+    throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
+
+R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){ 
+    return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
+R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){ 
+    return !(lhs==rhs);}
+R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
+    r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
+R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
+    return !(lhs==rhs);}
+R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
+    union{
+        uint64_t u64[2];
+        __m128i m;
+    }u;
+    _mm_storeu_si128(&u.m, m.m);
+    return os << u.u64[0] << " " << u.u64[1];
+}
+
+R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
+    uint64_t u64[2];
+    is >> u64[0] >> u64[1];
+    m.m = _mm_set_epi64x(u64[1], u64[0]);
+    return is;
+}
+
+template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
+
+template <>
+inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
+    r123m128i ret;
+    ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
+    return ret;
+}
+
+#else
+
+typedef struct {
+    __m128i m;
+} r123m128i;
+
+#endif /* __cplusplus */
+
+#else /* !R123_USE_SSE */
+R123_STATIC_INLINE int haveAESNI(){
+    return 0;
+}
+#endif /* R123_USE_SSE */
+
+#endif /* _Random123_sse_dot_h__ */
--- a/external/panphasia_ho/features/sunprofeatures.h
+++ b/external/panphasia_ho/features/sunprofeatures.h
@ -0,0 +1,172 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __sunprofeatures_dot_hpp
+#define __sunprofeatures_dot_hpp
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 0
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef R123_USE_PHILOX_64BIT
+#define R123_USE_PHILOX_64BIT 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
--- a/external/panphasia_ho/features/xlcfeatures.h
+++ b/external/panphasia_ho/features/xlcfeatures.h
@ -0,0 +1,210 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Copyright (c) 2013, Los Alamos National Security, LLC
+All rights reserved.
+
+Copyright 2013. Los Alamos National Security, LLC. This software was produced
+under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
+the U.S. Department of Energy. The U.S. Government has rights to use,
+reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS
+ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+to produce derivative works, such modified software should be clearly marked,
+so as not to confuse it with the version available from LANL.
+*/
+#ifndef __xlcfeatures_dot_hpp
+#define __xlcfeatures_dot_hpp
+
+#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
+#  error "This code has only been tested on x86 and PowerPC platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifdef __cplusplus
+/* builtins are automatically available to xlc.  To use them with xlc++,
+   one must include builtins.h.   c.f
+   http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html
+*/
+#include <builtins.h>
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+   openssl is available.  Without a pre-compilation configure-like
+   tool, it's less error-prone to guess that it isn't available.  Add
+   -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+   play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#ifdef __ABM__
+#define R123_USE_INTRIN_H 1
+#else
+#define R123_USE_INTRIN_H 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#if (defined(__powerpc64__))
+#define R123_USE_MULHILO64_MULHI_INTRIN 1
+#else
+#define R123_USE_MULHILO64_MULHI_INTRIN 0
+#endif
+#endif
+
+#ifndef R123_MULHILO64_MULHI_INTRIN
+#define R123_MULHILO64_MULHI_INTRIN __mulhdu
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_MULHILO32_MULHI_INTRIN
+#define R123_MULHILO32_MULHI_INTRIN __mulhwu
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#if defined(__powerpc64__)
+#define R123_USE_MULHILO64_ASM (1 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN))
+#else
+#define R123_USE_MULHILO64_ASM (0 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN))
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+   and in ../ut_features.cpp */
+#endif
--- a/external/panphasia_ho/high_order_panphasia_routines.c
+++ b/external/panphasia_ho/high_order_panphasia_routines.c
--- a/external/panphasia_ho/main.c
+++ b/external/panphasia_ho/main.c
@ -0,0 +1,147 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <time.h>
+#include <complex.h>
+#include <fftw3-mpi.h>
+
+#include "PAN_FFTW3.h"
+#include "panphasia_functions.h"
+
+extern size_t descriptor_base_size;
+
+#ifdef USE_OPENMP
+#include <omp.h>
+int threads_ok;
+int number_omp_threads = 1;
+#endif
+
+
+#if 0 // this is now unused since all this has been migrated to plugin!
+// does the same as the main below, but does not initialise MPI or FFTW (this should be done in MONOFONIC)
+int PANPHASIA_HO_main(const char *descriptor, size_t *ngrid_load)
+{
+   int verbose = 0;
+   int error;
+   size_t x0 = 0, y0 = 0, z0 = 0;
+   size_t rel_level;
+   int fdim=1;   //Option to scale Fourier grid dimension relative to Panphasia coefficient grid
+
+   //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
+
+   PANPHASIA_init_descriptor_(descriptor, &verbose);
+
+   printf("Descriptor %s\n ngrid_load %llu\n",descriptor,*ngrid_load);
+
+   // Choose smallest value of level to equal of exceed *ngrid_load)
+
+   for (rel_level=0; fdim*(descriptor_base_size<<(rel_level+1))<=*ngrid_load; rel_level++);
+
+   printf("Setting relative level = %llu\n",rel_level);
+
+
+   if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))
+   {
+      printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
+      abort();
+   };
+
+   //======================= FFTW ==============================
+
+   ptrdiff_t alloc_local, local_n0, local_0_start;
+
+   ptrdiff_t N0 = fdim*(descriptor_base_size << rel_level);
+
+   alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 +2 , MPI_COMM_WORLD, &local_n0, &local_0_start);
+
+   FFTW_COMPLEX *Panphasia_White_Noise_Field;
+
+   Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
+
+   if (error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))
+   {
+      printf("Error code from PANPHASIA_compute ...  %d\n", error);
+   };
+
+   fftw_free(Panphasia_White_Noise_Field);
+
+   return(0);
+}
+#endif
+
+#ifdef STANDALONE_PANPHASIA_HO
+int main(int argc, char **argv)
+{
+
+   int verbose = 0;
+   int error;
+   size_t x0 = 0, y0 = 0, z0 = 0;
+   size_t rel_level;
+   char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
+
+#ifdef USE_OPENMP
+   omp_set_num_threads(number_omp_threads);
+   int provided;
+   MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+   threads_ok = provided >= MPI_THREAD_FUNNELED;
+   if (threads_ok)
+      threads_ok = fftw_init_threads();
+   fftw_mpi_init();
+   int num_threads = number_omp_threads;
+   if (threads_ok)
+   {
+      fftw_plan_with_nthreads(num_threads);
+   }
+   else
+   {
+      printf("Failure to initialise threads ...\n");
+      MPI_Finalize();
+   };
+
+   printf("OpenMP threads enabled with FFTW. Number of threads %d\n", fftw_planner_nthreads());
+#else
+   MPI_Init(&argc, &argv);
+#endif
+
+   PANPHASIA_init_descriptor_(descriptor, &verbose);
+
+   rel_level = 6; //Set size of test dataset
+
+   if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))
+   {
+      printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
+      abort();
+   };
+
+   //======================= FFTW ==============================
+
+   fftw_mpi_init();
+
+   ptrdiff_t alloc_local, local_n0, local_0_start;
+
+   ptrdiff_t N0 = descriptor_base_size << rel_level;
+
+   alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0+2, MPI_COMM_WORLD, &local_n0, &local_0_start);
+
+   FFTW_COMPLEX *Panphasia_White_Noise_Field;
+
+   Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
+
+   if (error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))
+   {
+      printf("Error code from PANPHASIA_compute ...  %d\n", error);
+   };
+
+   fftw_free(Panphasia_White_Noise_Field);
+
+   fftw_mpi_cleanup();
+   //==================== End FFTW  ===========================
+
+   MPI_Finalize();
+   return(0);
+}
+
+
+
+#endif // STANDALONE_PANPHASIA_HO
--- a/external/panphasia_ho/makefile
+++ b/external/panphasia_ho/makefile
@ -0,0 +1,22 @@
+CC = mpicc -qopenmp 
+G99 = gcc -fopenmp
+
+GSL_LIBS =  -lgsl -lgslcblas 
+
+#CCFLAGS = $(CFLAGS) $(GSL_LIBS) -O3  -qopt-zmm-usage=high -vec-threshold0 -lfftw3  -lfftw3f   -lfftw3_omp -lfftw3_mpi -lfftw3f_mpi
+CCFLAGS = $(CFLAGS) $(GSL_LIBS) -O3  -qopt-zmm-usage=high -vec-threshold0 -lfftw3  -lfftw3f   -lfftw3_omp -lfftw3_mpi -lfftw3f_mpi -DUSE_OPENMP 
+
+
+
+pan_fftw3_test_code.x: main.o high_order_panphasia_routines.o  pan_mpi_routines.o uniform_rand_threefry4x64.o 
+	$(CC) $(CCFLAGS) *.o  -o pan_fftw3_test_code.x
+main.o:  main.c
+	$(CC) $(CCFLAGS) -c main.c
+high_order_panphasia_routines.o:  high_order_panphasia_routines.c
+	$(CC) $(CCFLAGS) -c high_order_panphasia_routines.c
+pan_mpi_routines.o:  pan_mpi_routines.c
+	$(CC) $(CCFLAGS) -c pan_mpi_routines.c
+uniform_rand_threefry4x64.o: uniform_rand_threefry4x64.c
+	$(CC) $(CCFLAGS) -c uniform_rand_threefry4x64.c
+clean:
+	rm *.o
--- a/external/panphasia_ho/pan_matrices_order6.h
+++ b/external/panphasia_ho/pan_matrices_order6.h
--- a/external/panphasia_ho/pan_mpi_routines.c
+++ b/external/panphasia_ho/pan_mpi_routines.c
@ -0,0 +1,533 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <time.h>
+#include <complex.h>
+#include <fftw3-mpi.h>
+
+#include "PAN_FFTW3.h"
+#include "panphasia_functions.h"
+
+#ifdef USE_OPENMP
+#include <omp.h>
+#endif
+
+extern const int Nbasis;
+extern const int irank_p[3][84];
+
+extern size_t descriptor_order;
+extern size_t descriptor_kk_limit;
+extern size_t descriptor_base_size;
+
+int PANPHASIA_compute_kspace_field_(size_t relative_level, ptrdiff_t N0_fourier_grid,
+                                    ptrdiff_t local_n0_fourier_return, ptrdiff_t local_0_start_fourier_return,
+                                    FFTW_COMPLEX *return_field)
+{
+
+  size_t copy_list[Nbasis];
+
+  int pmax = 6;
+
+  int nsubdivide = 21; //(pmax%2==0)?pmax+1:pmax+2;
+
+  size_t ncopy = (pmax + 1) * (pmax + 2) * (pmax + 3) / 6;
+
+  if (ncopy % nsubdivide != 0)
+    return (100010);
+  int nchunk = ncopy / nsubdivide;
+
+  int verbose = 1;
+  int flag_output_mode = 2;
+  int error;
+  ptrdiff_t size_to_alloc_fourier;
+  ptrdiff_t size_to_alloc_pan;
+  ptrdiff_t local_fourier_x_start, local_fourier_x_end;
+  FFTW_PLAN output_coeff_forward_plan;
+
+  ptrdiff_t N0_pan_grid = descriptor_base_size << relative_level;
+
+  if (N0_fourier_grid % N0_pan_grid != 0)
+    return (100015);
+
+  int fdim = N0_fourier_grid / N0_pan_grid;
+  size_t nfft_dim = N0_fourier_grid;
+  size_t npan_dim = N0_pan_grid;
+
+  int SHARED_FOUR_PAN_SPACE = (nsubdivide == 1) && (fdim == 1) && (sizeof(PAN_REAL) == sizeof(FFTW_REAL));
+
+  ////////////////////////////////////////////////////////////////////////////////////
+
+  if (pmax > descriptor_order)
+    return (100020);
+
+  for (size_t i = 0; i < Nbasis; i++)
+    copy_list[i] = i;
+
+  //printf("Dimensions of FT   (%td,%td,%td)\n",N0_fourier_grid,N0_fourier_grid,N0_fourier_grid);
+  //printf("Dimensions of PG   (%td,%td,%td)\n",N0_pan_grid,N0_pan_grid,N0_pan_grid);
+  //printf("local_no %td local_0_start_fourier %td\n",local_n0_fourier_return, local_0_start_fourier_return);
+
+  //  Compute 1-D Spherical Bessel coefficients for each order //////////////////
+  //  These are needed for the convolutions below              //////////////////
+  size_t n4dimen;
+
+  n4dimen = (nfft_dim % 4 == 0) ? 4 * (nfft_dim / 4) + 4 : 4 * (nfft_dim / 4) + 5;
+
+  double complex *sph_bessel_coeff = FFTW_MALLOC(sizeof(double complex) * n4dimen * (pmax + 1));
+
+  if (sph_bessel_coeff == NULL)
+    return (100030);
+
+  compute_sph_bessel_coeffs(nfft_dim, pmax, n4dimen, fdim, sph_bessel_coeff);
+
+  //printf("Reached here! ndimen4 %ld\n",n4dimen);
+  ///////////////////////////////////////////////////////////////////////////////
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Determine sizes of Fourier and Panphasia coefficient 3-D arrays //
+
+  ptrdiff_t local_n0_pan;
+  ptrdiff_t local_0_start_pan;
+  {
+
+    int rank = 3;
+
+    ptrdiff_t local_n0_fourier;
+    ptrdiff_t local_0_start_fourier;
+    const ptrdiff_t ndimens_alloc_fourier[] = {N0_fourier_grid, N0_fourier_grid, N0_fourier_grid + 2}; // Allocated for r2c
+    ptrdiff_t howmany = nchunk;
+    size_to_alloc_fourier = FFTW_MPI_LOCAL_SIZE_MANY(rank, ndimens_alloc_fourier, howmany,
+                                                     FFTW_MPI_DEFAULT_BLOCK, MPI_COMM_WORLD,
+                                                     &local_n0_fourier, &local_0_start_fourier);
+
+    if (local_0_start_fourier != local_0_start_fourier_return)
+    {
+      printf("Error local_0_start_fourier!=local_0_start_fourier_return\n");
+      return (100032);
+    };
+
+    if (local_n0_fourier != local_n0_fourier_return)
+    {
+      printf("Error local_n0_fourier!=local_n0_fourier_return\n");
+      return (100033);
+    };
+
+    ptrdiff_t abs_fourier_x_start, abs_fourier_x_end;
+
+    if (local_0_start_fourier_return % fdim == 0)
+    {
+      abs_fourier_x_start = local_0_start_fourier_return;
+    }
+    else
+    {
+      abs_fourier_x_start = local_0_start_fourier_return + fdim - (local_0_start_fourier_return % fdim);
+    };
+
+    if ((local_0_start_fourier_return + local_n0_fourier_return - 1) % fdim == 0)
+    {
+      abs_fourier_x_end = local_0_start_fourier_return + local_n0_fourier_return - 1;
+    }
+    else
+    {
+      abs_fourier_x_end = (local_0_start_fourier_return + local_n0_fourier_return - 1) -
+                          ((local_0_start_fourier_return + local_n0_fourier_return - 1) % fdim);
+    };
+
+    if ((abs_fourier_x_end - abs_fourier_x_start) % fdim != 0)
+      return (100036);
+
+    local_0_start_pan = abs_fourier_x_start / fdim;
+    local_n0_pan = 1 + (abs_fourier_x_end - abs_fourier_x_start) / fdim;
+
+    local_fourier_x_start = abs_fourier_x_start - local_0_start_fourier_return;
+    local_fourier_x_end = abs_fourier_x_end - local_0_start_fourier_return;
+
+    // const ptrdiff_t ndimens_alloc_pan[] = {N0_pan_grid, N0_pan_grid, N0_pan_grid+2};
+    howmany = ncopy;
+
+    size_to_alloc_pan = howmany * local_n0_pan * N0_pan_grid * (N0_pan_grid + 2);
+
+    //printf("size_to_alloc_fourier = %td\n",size_to_alloc_fourier);
+    //printf("size_to_alloc_pan     = %td\n",size_to_alloc_pan);
+  };
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  // Allocate arrays to store Panphasia coefficients and Fourier information
+  // If nsubdivide==1 then use the same structure to store both.
+
+  void *panphasia_coefficients = FFTW_MALLOC(sizeof(PAN_REAL) * size_to_alloc_pan);
+  void *fourier_grids;
+
+  if (panphasia_coefficients == NULL)
+    return (100040);
+
+  void *mode_weightings = FFTW_MALLOC(sizeof(FFTW_REAL) * size_to_alloc_fourier / nchunk);
+
+  FFTW_REAL *ptr_mode_weightings;
+  ptr_mode_weightings = mode_weightings;
+  memset(ptr_mode_weightings, 0, sizeof(FFTW_REAL) * size_to_alloc_fourier / nchunk);
+
+  FFTW_REAL *ptr_real_fourier_grid;
+  FFTW_REAL *ptr_panphasia_coefficients = panphasia_coefficients;
+
+  FFTW_COMPLEX *ptr_cmplx_fourier_grid;
+
+  if (SHARED_FOUR_PAN_SPACE)
+  {
+    ptr_real_fourier_grid = panphasia_coefficients;
+    ptr_cmplx_fourier_grid = panphasia_coefficients;
+  }
+  else
+  {
+
+    fourier_grids = FFTW_MALLOC(sizeof(FFTW_REAL) * size_to_alloc_fourier);
+
+    if (fourier_grids == NULL)
+      return (100041);
+
+    ptr_real_fourier_grid = fourier_grids;
+    ptr_cmplx_fourier_grid = fourier_grids;
+  };
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////////////////////////////
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  //////////// Compute the Panphasia coefficients   ////////////////////////////////
+
+  {
+
+    size_t xorigin = local_0_start_pan, yorigin = 0, zorigin = 0;
+
+    size_t xextent = local_n0_pan, yextent = N0_pan_grid, zextent = N0_pan_grid;
+
+    if ((error = PANPHASIA_compute_coefficients_(&xorigin, &yorigin, &zorigin, &xextent, &yextent,
+                                                 &zextent, copy_list, &ncopy, panphasia_coefficients,
+                                                 &flag_output_mode, &verbose)))
+      return (100100 + error);
+  };
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////////////////////////////
+
+  //////////  Output diagnostics for small grids only
+
+  {
+
+    if (N0_pan_grid < -1)
+    {
+
+      int rank;
+      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+      char filename[100];
+      sprintf(filename, "output_real_space_field.%d", rank);
+
+      FILE *fp;
+      printf("local_n0_pan %ld npan_dim %ld N0_pan_grid %ld\n", local_n0_pan,
+             npan_dim, N0_pan_grid);
+
+      fp = fopen(filename, "w");
+
+      for (int ix = 0; ix < local_n0_pan; ix++)
+        for (int iy = 0; iy < npan_dim; iy++)
+          for (int iz = 0; iz < npan_dim; iz++)
+          {
+            int index = ix * N0_pan_grid * (N0_pan_grid + 2) + iy * (N0_pan_grid + 2) + iz;
+            fprintf(fp, "%6lu%6d%6d %14.8lf %d\n", ix + local_0_start_pan, iy, iz,
+                    ptr_panphasia_coefficients[index], index);
+          };
+
+      fclose(fp);
+    };
+  };
+
+  //----------------------------------------------------------------------------------
+  ////////////// Set up FTTW plan //////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////////
+
+  // printf("Making the plan ... \n");
+
+  //////////////////// Make plan for ncopy interleaved FTs ///////////////////////////
+  //////////////////////////////////////////////////////////////////////////
+
+  {
+    int rank = 3;
+    const ptrdiff_t ndimens[3] = {N0_fourier_grid, N0_fourier_grid, N0_fourier_grid};
+    ptrdiff_t howmany = nchunk;
+    ptrdiff_t block = FFTW_MPI_DEFAULT_BLOCK;
+    ptrdiff_t tblock = FFTW_MPI_DEFAULT_BLOCK;
+    unsigned flags = FFTW_ESTIMATE;
+
+    output_coeff_forward_plan = FFTW_MPI_PLAN_MANY_DTF_R2C(rank, ndimens,
+                                                           howmany, block, tblock,
+                                                           ptr_real_fourier_grid, ptr_cmplx_fourier_grid,
+                                                           MPI_COMM_WORLD, flags);
+    if (output_coeff_forward_plan == NULL)
+    {
+      printf("Null plan\n");
+      return (100051);
+    };
+  };
+
+  //printf("Plan completed ... \n");
+
+  //////////////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////////
+  //----------------------------------------------------------------------------------
+
+  memset(return_field, 0, local_n0_fourier_return * N0_fourier_grid * (N0_fourier_grid + 2) * sizeof(FFTW_REAL));
+
+  for (int iter = 0; iter < nsubdivide; iter++)
+  {
+
+    int moffset = iter * nchunk;
+
+    if (!SHARED_FOUR_PAN_SPACE)
+    {
+
+      memset(ptr_real_fourier_grid, 0, sizeof(FFTW_REAL) * size_to_alloc_fourier);
+
+      // Copy Panphasia coefficients to Fourier grid with appropriate stride - fdim
+
+      size_t index_p, index_f;
+      int m;
+
+      for (int ix_f = local_fourier_x_start, ix_p = 0; ix_f <= local_fourier_x_end; ix_f += fdim, ix_p++)
+#ifdef USE_OPENMP
+#pragma omp parallel for collapse(2) private(index_p, index_f, m)
+#endif
+
+        for (int iy_f = 0, iy_p = 0; iy_f < nfft_dim; iy_f += fdim, iy_p++)
+          for (int iz_f = 0, iz_p = 0; iz_f < nfft_dim; iz_f += fdim, iz_p++)
+          {
+            index_p = ix_p * N0_pan_grid * (N0_pan_grid + 2) + iy_p * (N0_pan_grid + 2) + iz_p;
+            index_f = ix_f * N0_fourier_grid * (N0_fourier_grid + 2) + iy_f * (N0_fourier_grid + 2) + iz_f;
+
+            for (m = 0; m < nchunk; m++)
+            {
+              ptr_real_fourier_grid[nchunk * index_f + m] =
+                  ptr_panphasia_coefficients[ncopy * index_p + moffset + m];
+            };
+          };
+    };
+
+    FFTW_MPI_EXECUTE_DFT_R2C(output_coeff_forward_plan,
+                             ptr_real_fourier_grid, ptr_cmplx_fourier_grid);
+
+    {
+      // Convolve and combine the FT of the Panphasia coefficient field
+
+      size_t index1, index2;
+      complex weight;
+      //int m;
+
+#ifdef USE_OPENMP
+#pragma omp parallel for collapse(3) private(index1, index2, weight, m)
+#endif
+      for (int ix = 0; ix < local_n0_fourier_return; ix++)
+        for (int iy = 0; iy < nfft_dim; iy++)
+          for (int iz = 0; iz <= nfft_dim / 2; iz++)
+          {
+            index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
+            for (int m = 0; m < nchunk; m++)
+            {
+              index2 = nchunk * index1 + m;
+
+              weight = sph_bessel_coeff[n4dimen * irank_p[0][m + moffset] + ix + local_0_start_fourier_return] *
+                       sph_bessel_coeff[n4dimen * irank_p[1][m + moffset] + iy] *
+                       sph_bessel_coeff[n4dimen * irank_p[2][m + moffset] + iz];
+              return_field[index1] += weight * ptr_cmplx_fourier_grid[index2];
+              ptr_mode_weightings[index1] += cabs(weight) * cabs(weight);
+            };
+          };
+    };
+
+  }; // End loop over iter
+
+  // printf("Reached here 10!\n");
+
+  //Add phase shift and normalise field
+  {
+
+    double complex phase_shift_and_scale;
+    int kx, ky, kz;
+    const double pi = 4.0 * atan(1.0);
+    size_t index1;
+
+    double min_weight = 100.0;
+
+#ifdef USE_OPENMP
+#pragma omp parallel for collapse(3) private(index1, kx, ky, kz, phase_shift_and_scale)
+#endif
+    for (int ix = 0; ix < local_n0_fourier_return; ix++)
+      for (int iy = 0; iy < nfft_dim; iy++)
+        for (int iz = 0; iz <= nfft_dim / 2; iz++)
+        {
+          index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
+          kx = (ix + local_0_start_fourier_return > nfft_dim / 2) ? ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return;
+          ky = (iy > nfft_dim / 2) ? iy - nfft_dim : iy;
+          kz = (iz > nfft_dim / 2) ? iz - nfft_dim : iz;
+
+          if ((kx == nfft_dim / 2) || (ky == nfft_dim / 2) || (kz == nfft_dim / 2))
+          {
+            // Set Nyquist modes to zero - not used by IC_Gen anyway.
+            phase_shift_and_scale = 0.0;       //1.0/pow((double)nfft_dim,1.5);  // No phase shift
+            ptr_mode_weightings[index1] = 0.0; // Set squared weight to zero
+          }
+          else
+          {
+            phase_shift_and_scale = sqrt((double)(fdim * fdim * fdim)) *
+                                    cexp((double)fdim * (-I) * pi * (double)(kx + ky + kz) / sqrt(ptr_mode_weightings[index1]) /
+                                         (double)nfft_dim) /
+                                    pow((double)nfft_dim, 1.5);
+          };
+
+          return_field[index1] *= phase_shift_and_scale;
+          if (ptr_mode_weightings[index1] < min_weight)
+            min_weight = ptr_mode_weightings[index1];
+        };
+
+    //  printf("Minimum weight %lf\n",sqrt(min_weight));
+  };
+
+  // printf("Reached here 11!\n");
+
+  // Rescale selected Fourier modes to unit amplitude.
+  // By default this part is not executed.
+
+  if (descriptor_kk_limit > 0)
+  {
+    size_t index1;
+    complex weight;
+    size_t ksquared;
+    int kx, ky, kz;
+#ifdef USE_OPENMP
+#pragma omp parallel for collapse(3) private(index1, kx, ky, kz, ksquared, weight)
+#endif
+    for (int ix = 0; ix < local_n0_fourier_return; ix++)
+      for (int iy = 0; iy < nfft_dim; iy++)
+        for (int iz = 0; iz <= nfft_dim / 2; iz++)
+        {
+          kx = (ix + local_0_start_fourier_return > nfft_dim / 2) ? ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return;
+          ky = (iy > nfft_dim / 2) ? iy - nfft_dim : iy;
+          kz = (iz > nfft_dim / 2) ? iz - nfft_dim : iz;
+          ksquared = kx * kx + ky * ky + kz * kz;
+          if ((kx != nfft_dim / 2) && (ky != nfft_dim / 2) && (kz != nfft_dim / 2))
+          { //Omit Nyquist modes
+
+            if ((ksquared <= descriptor_kk_limit) && (ksquared != 0))
+            {
+              index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
+              weight = cabs(return_field[index1]);
+              return_field[index1] /= weight;
+            };
+          };
+        };
+  };
+
+  //printf("Reached here 12!\n");
+
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  char filename[100];
+  sprintf(filename, "output_k_space_alt.%d", rank);
+  FILE *fp;
+
+  if (nfft_dim < -1)
+  {
+
+    FILE *fp;
+
+    fp = fopen(filename, "w");
+
+    for (int ix = 0; ix < local_n0_fourier_return; ix++)
+      for (int iy = 0; iy < nfft_dim; iy++)
+        for (int iz = 0; iz <= nfft_dim / 2; iz++)
+        {
+
+          int index = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
+          fprintf(fp, "%6lu%6d%6d %14.8lf %14.8lf %14.8lf \n", ix + local_0_start_fourier_return, iy, iz,
+                  creal(return_field[index]), cimag(return_field[index]), sqrt(ptr_mode_weightings[index]));
+          //		ptr_mode_weightings[index]);
+        };
+    fclose(fp);
+  }
+  else
+  {
+
+    fp = fopen(filename, "w");
+
+    for (int ix = 0; ix < local_n0_fourier_return; ix++)
+      for (int iy = 0; iy < nfft_dim; iy++)
+        for (int iz = 0; iz <= nfft_dim / 2; iz++)
+        {
+          if (ix + iy + iz + local_0_start_fourier_return < 100)
+          {
+            int index = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
+            fprintf(fp, "%6lu%6d%6d %14.8lf %14.8lf %14.8lf \n", ix + local_0_start_fourier_return, iy, iz,
+                    creal(return_field[index]), cimag(return_field[index]), sqrt(ptr_mode_weightings[index]));
+            //   ptr_mode_weightings[index]);
+          };
+        };
+    fclose(fp);
+  };
+
+  // Transpose output field
+
+  {
+
+    FFTW_PLAN transpose_output_plan;
+    unsigned flags = FFTW_ESTIMATE;
+
+    void *ptr_inter = return_field;
+
+    FFTW_REAL *ptr_return_as_real_field;
+    ptr_return_as_real_field = ptr_inter;
+
+    // int rank = 2;
+
+    //  ptrdiff_t size_to_transpose;
+    ptrdiff_t howmany = N0_fourier_grid + 2;
+    // ptrdiff_t local_n0, local_0_start;
+    // ptrdiff_t local_n1, local_1_start;
+
+    // const ptrdiff_t ndimens[] = {N0_fourier_grid, N0_fourier_grid};
+
+    //  size_to_transpose = FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED(rank, ndimens, howmany,
+    //                  FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+    //                      MPI_COMM_WORLD,&local_n0, &local_0_start,&local_n1, &local_1_start);
+
+    // printf("size_to_transpose = %td\n",size_to_transpose);
+
+    transpose_output_plan = FFTW_MPI_PLAN_MANY_TRANSPOSE(N0_fourier_grid, N0_fourier_grid,
+                                                         howmany, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                                                         ptr_return_as_real_field, ptr_return_as_real_field,
+                                                         MPI_COMM_WORLD, flags);
+
+    //printf("Transpose plan completed.\n");
+
+    FFTW_MPI_EXECUTE_R2R(transpose_output_plan, ptr_return_as_real_field, ptr_return_as_real_field);
+
+    //printf("Transpose completed.\n");
+  };
+
+  // Free all memory assigned by FFTW_MALLOC
+  FFTW_FREE(mode_weightings);
+  FFTW_FREE(sph_bessel_coeff);
+  FFTW_FREE(panphasia_coefficients);
+
+  if (!SHARED_FOUR_PAN_SPACE)
+    FFTW_FREE(fourier_grids);
+
+  FFTW_DESTROY_PLAN(output_coeff_forward_plan);
+
+  //printf("Reached end of PANPHASIA_compute_kspace_field_\n");
+  return (0);
+}
+
+//==========================================================================================
+//==========================================================================================
--- a/external/panphasia_ho/panphasia_functions.h
+++ b/external/panphasia_ho/panphasia_functions.h
@ -0,0 +1,99 @@
+
+/////////////////////////////////////////////////
+//  By default Panphasia is computed at single
+//  precision.  To override this define PAN_DOUBLE
+
+#pragma once
+
+#ifndef USE_PRECISION_FLOAT
+#define PAN_DOUBLE_PRECISION  8
+#endif
+
+#ifndef PAN_DOUBLE_PRECISION
+
+#define PAN_REAL float
+#define PAN_COMPLEX float complex
+
+#else 
+
+#define PAN_REAL double
+#define PAN_COMPLEX  double complex
+
+#endif
+
+
+#include "PAN_FFTW3.h"
+
+/////////////////////////////////////////////////////////////////////
+
+
+
+void return_uniform_pseudo_rands_threefry4x64_(size_t l,size_t j1,size_t j2,size_t j3,
+					       PAN_REAL *panphasia_randoms, size_t seed_value,
+					       size_t allow_non_zero_seed_safety_catch);
+
+void box_muller_(PAN_REAL *unif_real,PAN_REAL *gvar);
+
+void solve_panphasia_cell_(PAN_REAL *input_vec_parent, PAN_REAL *input_vec_children, PAN_REAL *output_cell_vec, int control_flag);
+
+void threefry4x64_test_(int verbose);
+void inverse_threefry4x64_test_(int verbose);
+void set_panphasia_key_(int verbose);
+void check_panphasia_key_(int verbose);
+
+void PANPHASIA_init_descriptor_checks();
+
+void speed_test_();
+void speed_test2_();
+void check_randoms_();
+void test_random_dist_(size_t shift);
+void compute_all_properties_of_a_panphasia_cell_(size_t *level, size_t *j1, size_t *j2, size_t *j3,
+					   PAN_REAL *gauss_rand_parent, PAN_REAL *legendre_rand);
+void return_root_legendre_coefficients_(PAN_REAL *root);
+
+
+int parse_and_validate_descriptor_(const char *, int *);
+int demo_descriptor_();
+long long int compute_check_digit_();
+int PANPHASIA_init_descriptor_(const char *descriptor, int *verbose);
+int PANPHASIA_init_level_(size_t *oct_level, size_t *rel_orig_x, size_t *rel_orig_y,size_t *rel_orig_z,int *verbose);
+
+
+int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t*zstart,
+			  size_t *xextent, size_t *yextent, size_t *zextend,
+			  size_t *copy_list, 
+				    size_t *ncopy, void *output_values, int *flag_output_mode, int *verbose);
+
+void test_moments_();
+void test_propogation_of_moments_(int iterations);
+void test_cell_moments(const char*,size_t, size_t, size_t, size_t, size_t, double *);
+
+void spherical_bessel_(int *, double *, double *);
+
+
+
+
+
+
+void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel,size_t *xabs, size_t *yabs,size_t *zabs);
+
+int cell_information(size_t cell_id, size_t *cumulative_cell_index, size_t *cuboid_x_dimen,
+		      size_t *cuboid_y_dimen,size_t *cuboid_z_dimen, size_t *cell_lev,
+                      size_t *cell_x, size_t *cell_y, size_t *cell_z, size_t number_children,
+                      size_t *child_cell_indices);
+
+int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinates, 
+				   size_t extent, size_t *return_tree_list_coordinates, 
+				  size_t nreturn,
+				  long long int *child_pointer, size_t *level_count, size_t *level_begin, size_t *index_perm);
+
+
+
+
+#ifdef __cplusplus
+void compute_sph_bessel_coeffs(int, int, int, int, std::complex<double>* *);
+#else
+void compute_sph_bessel_coeffs(int, int, int, int, double complex *);
+#endif
+
+int PANPHASIA_compute_kspace_field_(size_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, FFTW_COMPLEX *);
--- a/external/panphasia_ho/threefry.h
+++ b/external/panphasia_ho/threefry.h
@ -0,0 +1,874 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _threefry_dot_h_
+#define _threefry_dot_h_
+#include "features/compilerfeatures.h"
+#include "array.h"
+
+/** \cond HIDDEN_FROM_DOXYGEN */
+/* Significant parts of this file were copied from
+   from:
+      Skein_FinalRnd/ReferenceImplementation/skein.h
+      Skein_FinalRnd/ReferenceImplementation/skein_block.c
+
+   in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
+
+   This file has been modified so that it may no longer perform its originally
+   intended function.  If you're looking for a Skein or Threefish source code,
+   please consult the original file.
+
+   The original file had the following header:
+**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+
+*/
+
+/* See comment at the top of philox.h for the macro pre-process
+   strategy. */
+
+/* Rotation constants: */
+enum r123_enum_threefry64x4 {
+    /* These are the R_256 constants from the Threefish reference sources
+       with names changed to R_64x4... */
+    R_64x4_0_0=14, R_64x4_0_1=16,
+    R_64x4_1_0=52, R_64x4_1_1=57,
+    R_64x4_2_0=23, R_64x4_2_1=40,
+    R_64x4_3_0= 5, R_64x4_3_1=37,
+    R_64x4_4_0=25, R_64x4_4_1=33,
+    R_64x4_5_0=46, R_64x4_5_1=12,
+    R_64x4_6_0=58, R_64x4_6_1=22,
+    R_64x4_7_0=32, R_64x4_7_1=32
+};
+
+enum r123_enum_threefry64x2 {
+    /*
+    // Output from skein_rot_search: (srs64_B64-X1000)
+    // Random seed = 1. BlockSize = 128 bits. sampleCnt =  1024. rounds =  8, minHW_or=57
+    // Start: Tue Mar  1 10:07:48 2011
+    // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format   
+    */
+    R_64x2_0_0=16,
+    R_64x2_1_0=42,
+    R_64x2_2_0=12,
+    R_64x2_3_0=31,
+    R_64x2_4_0=16,
+    R_64x2_5_0=32,
+    R_64x2_6_0=24,
+    R_64x2_7_0=21
+    /* 4 rounds: minHW =  4  [  4  4  4  4 ]
+    // 5 rounds: minHW =  8  [  8  8  8  8 ]
+    // 6 rounds: minHW = 16  [ 16 16 16 16 ]
+    // 7 rounds: minHW = 32  [ 32 32 32 32 ]
+    // 8 rounds: minHW = 64  [ 64 64 64 64 ]
+    // 9 rounds: minHW = 64  [ 64 64 64 64 ]
+    //10 rounds: minHW = 64  [ 64 64 64 64 ]
+    //11 rounds: minHW = 64  [ 64 64 64 64 ] */
+};
+
+enum r123_enum_threefry32x4 {
+    /* Output from skein_rot_search: (srs-B128-X5000.out)
+    // Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
+    // Start: Mon Aug 24 22:41:36 2009
+    // ...
+    // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format    */
+    R_32x4_0_0=10, R_32x4_0_1=26,
+    R_32x4_1_0=11, R_32x4_1_1=21,
+    R_32x4_2_0=13, R_32x4_2_1=27,
+    R_32x4_3_0=23, R_32x4_3_1= 5,
+    R_32x4_4_0= 6, R_32x4_4_1=20,
+    R_32x4_5_0=17, R_32x4_5_1=11,
+    R_32x4_6_0=25, R_32x4_6_1=10,
+    R_32x4_7_0=18, R_32x4_7_1=20
+
+    /* 4 rounds: minHW =  3  [  3  3  3  3 ]
+    // 5 rounds: minHW =  7  [  7  7  7  7 ]
+    // 6 rounds: minHW = 12  [ 13 12 13 12 ]
+    // 7 rounds: minHW = 22  [ 22 23 22 23 ]
+    // 8 rounds: minHW = 31  [ 31 31 31 31 ]
+    // 9 rounds: minHW = 32  [ 32 32 32 32 ]
+    //10 rounds: minHW = 32  [ 32 32 32 32 ]
+    //11 rounds: minHW = 32  [ 32 32 32 32 ] */
+
+};
+
+enum r123_enum_threefry32x2 {
+    /* Output from skein_rot_search (srs32x2-X5000.out)
+    // Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
+    // Start: Tue Jul 12 11:11:33 2011
+    // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize=  64].format   */
+    R_32x2_0_0=13,
+    R_32x2_1_0=15,
+    R_32x2_2_0=26,
+    R_32x2_3_0= 6,
+    R_32x2_4_0=17,
+    R_32x2_5_0=29,
+    R_32x2_6_0=16,
+    R_32x2_7_0=24
+
+    /* 4 rounds: minHW =  4  [  4  4  4  4 ]
+    // 5 rounds: minHW =  6  [  6  8  6  8 ]
+    // 6 rounds: minHW =  9  [  9 12  9 12 ]
+    // 7 rounds: minHW = 16  [ 16 24 16 24 ]
+    // 8 rounds: minHW = 32  [ 32 32 32 32 ]
+    // 9 rounds: minHW = 32  [ 32 32 32 32 ]
+    //10 rounds: minHW = 32  [ 32 32 32 32 ]
+    //11 rounds: minHW = 32  [ 32 32 32 32 ] */
+    };
+
+enum r123_enum_threefry_wcnt {
+    WCNT2=2,
+    WCNT4=4
+};
+
+#if R123_USE_64BIT
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
+R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
+{
+    return (x << (N & 63)) | (x >> ((64-N) & 63));
+}
+#endif
+
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
+R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
+{
+    return (x << (N & 31)) | (x >> ((32-N) & 31));
+}
+
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((uint64_t) (hi32)) << 32))
+#define SKEIN_KS_PARITY64         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+#define SKEIN_KS_PARITY32         0x1BD11BDA
+
+/** \endcond */
+
+#ifndef THREEFRY2x32_DEFAULT_ROUNDS
+#define THREEFRY2x32_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY2x64_DEFAULT_ROUNDS
+#define THREEFRY2x64_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY4x32_DEFAULT_ROUNDS
+#define THREEFRY4x32_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY4x64_DEFAULT_ROUNDS
+#define THREEFRY4x64_DEFAULT_ROUNDS 20
+#endif
+
+#define _threefry2x_tpl(W)                                              \
+typedef struct r123array2x##W threefry2x##W##_ctr_t;                          \
+typedef struct r123array2x##W threefry2x##W##_key_t;                          \
+typedef struct r123array2x##W threefry2x##W##_ukey_t;                          \
+R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE                                          \
+threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
+    uint##W##_t X0,X1; \
+    uint##W##_t ks0, ks1, ks2; \
+    R123_ASSERT(Nrounds<=32);                                           \
+    ks2 =  SKEIN_KS_PARITY##W;                                   \
+    ks0 = k.v[0];                                                       \
+    X0  = in.v[0] + ks0;                                                  \
+    ks2 ^= ks0; \
+\
+    ks1 = k.v[1];                                                       \
+    X1  = in.v[1] + ks1;                                                  \
+    ks2 ^= ks1; \
+                                                                        \
+    if(Nrounds>0){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
+    if(Nrounds>1){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
+    if(Nrounds>2){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
+    if(Nrounds>3){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
+    if(Nrounds>3){                                                      \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks1; X1 += ks2;                               \
+        X1 += 1;     /* X.v[2-1] += r  */                   \
+    }                                                                   \
+    if(Nrounds>4){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
+    if(Nrounds>5){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
+    if(Nrounds>6){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
+    if(Nrounds>7){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
+    if(Nrounds>7){                                                      \
+        /* InjectKey(r=2) */                                            \
+        X0 += ks2; X1 += ks0;                               \
+        X1 += 2;                                                    \
+    }                                                                   \
+    if(Nrounds>8){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
+    if(Nrounds>9){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
+    if(Nrounds>10){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
+    if(Nrounds>11){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
+    if(Nrounds>11){                                                     \
+        /* InjectKey(r=3) */                                            \
+        X0 += ks0; X1 += ks1;                               \
+        X1 += 3;                                                    \
+    }                                                                   \
+    if(Nrounds>12){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
+    if(Nrounds>13){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
+    if(Nrounds>14){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
+    if(Nrounds>15){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
+    if(Nrounds>15){                                                     \
+        /* InjectKey(r=4) */                                            \
+        X0 += ks1; X1 += ks2;                               \
+        X1 += 4;                                                    \
+    }                                                                   \
+    if(Nrounds>16){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
+    if(Nrounds>17){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
+    if(Nrounds>18){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
+    if(Nrounds>19){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
+    if(Nrounds>19){                                                     \
+        /* InjectKey(r=5) */                                            \
+        X0 += ks2; X1 += ks0;                               \
+        X1 += 5;                                                    \
+    }                                                                   \
+    if(Nrounds>20){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
+    if(Nrounds>21){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
+    if(Nrounds>22){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
+    if(Nrounds>23){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
+    if(Nrounds>23){                                                     \
+        /* InjectKey(r=6) */                                            \
+        X0 += ks0; X1 += ks1;                               \
+        X1 += 6;                                                    \
+    }                                                                   \
+    if(Nrounds>24){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
+    if(Nrounds>25){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
+    if(Nrounds>26){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
+    if(Nrounds>27){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
+    if(Nrounds>27){                                                     \
+        /* InjectKey(r=7) */                                            \
+        X0 += ks1; X1 += ks2;                               \
+        X1 += 7;                                                    \
+    }                                                                   \
+    if(Nrounds>28){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
+    if(Nrounds>29){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
+    if(Nrounds>30){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
+    if(Nrounds>31){  X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
+    if(Nrounds>31){                                                     \
+        /* InjectKey(r=8) */                                            \
+        X0 += ks2; X1 += ks0;                               \
+        X1 += 8;                                                    \
+    }                                                                   \
+    threefry2x##W##_ctr_t ret={{X0, X1}};                                   \
+    return ret;                                                             \
+}                                                                       \
+ /** @ingroup ThreefryNxW */                                            \
+enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS };       \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE                                     \
+threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
+    return threefry2x##W##_R(threefry2x##W##_rounds, in, k);            \
+}
+
+
+#define _threefry4x_tpl(W)                                              \
+typedef struct r123array4x##W threefry4x##W##_ctr_t;                        \
+typedef struct r123array4x##W threefry4x##W##_key_t;                        \
+typedef struct r123array4x##W threefry4x##W##_ukey_t;                        \
+R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE                                          \
+threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
+    uint##W##_t X0, X1, X2, X3; \
+    uint##W##_t ks0, ks1, ks2, ks3, ks4;                          \
+    R123_ASSERT(Nrounds<=72);                                           \
+    ks4 =  SKEIN_KS_PARITY##W;                                    \
+    ks0 = k.v[0]; \
+    X0 = in.v[0] + ks0;  \
+    ks4 ^= ks0;                            \
+    \
+    ks1 = k.v[1]; \
+    X1 = in.v[1] + ks1;  \
+    ks4 ^= ks1;                            \
+    \
+    ks2 = k.v[2]; \
+    X2 = in.v[2] + ks2;  \
+    ks4 ^= ks2;                            \
+    \
+    ks3 = k.v[3]; \
+    X3 = in.v[3] + ks3;  \
+    ks4 ^= ks3;                                                    \
+                                                                        \
+    if(Nrounds>0){                                                      \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>1){                                                      \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>2){                                                      \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>3){                                                      \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>3){                                                      \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
+        X3 += 1;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>4){                                                      \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>5){                                                      \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>6){                                                      \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>7){                                                      \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>7){                                                      \
+        /* InjectKey(r=2) */                                            \
+        X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
+        X3 += 2;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>8){                                                      \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>9){                                                      \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>10){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>11){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>11){                                                     \
+        /* InjectKey(r=3) */                                            \
+        X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
+        X3 += 3;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>12){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>13){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>14){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>15){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>15){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
+        X3 += 4;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>16){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>17){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>18){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>19){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>19){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
+        X3 += 5;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>20){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>21){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>22){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>23){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>23){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
+        X3 += 6;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>24){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>25){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>26){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>27){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>27){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
+        X3 += 7;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>28){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>29){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>30){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>31){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>31){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
+        X3 += 8;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>32){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>33){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>34){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>35){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>35){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
+        X3 += 9;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>36){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>37){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>38){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>39){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>39){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
+        X3 += 10;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>40){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>41){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>42){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>43){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>43){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
+        X3 += 11;     /* XWCNT4-1 += r  */                \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>44){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>45){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>46){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>47){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>47){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
+        X3 += 12;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>48){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>49){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>50){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>51){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>51){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
+        X3 += 13;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>52){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>53){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>54){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>55){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>55){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
+        X3 += 14;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>56){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>57){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>58){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>59){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>59){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
+        X3 += 15;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>60){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>61){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>62){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>63){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>63){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
+        X3 += 16;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>64){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>65){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>66){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>67){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>67){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
+        X3 += 17;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>68){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>69){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>70){                                                     \
+        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
+        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
+    }                                                                   \
+    if(Nrounds>71){                                                     \
+        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
+        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
+    }                                                                   \
+    if(Nrounds>71){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
+        X3 += 18;     /* XWCNT4-1 += r  */                 \
+    }                                                                   \
+                                                                        \
+    threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
+    return ret;                                                           \
+}                                                                       \
+                                                                        \
+ /** @ingroup ThreefryNxW */                                            \
+enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS };       \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE                                     \
+threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
+    return threefry4x##W##_R(threefry4x##W##_rounds, in, k);            \
+}
+
+#if R123_USE_64BIT
+_threefry2x_tpl(64)
+_threefry4x_tpl(64)
+#endif
+_threefry2x_tpl(32)
+_threefry4x_tpl(32)
+
+/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
+   than a static inline function.  Why?  */
+#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
+#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
+#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
+#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
+
+#if defined(__cplusplus)
+#define _threefryNxWclass_tpl(NxW)                                      \
+namespace r123{                                                     \
+template<unsigned int ROUNDS>                                                  \
+ struct Threefry##NxW##_R{                                              \
+    typedef threefry##NxW##_ctr_t ctr_type;                             \
+    typedef threefry##NxW##_key_t key_type;                             \
+    typedef threefry##NxW##_key_t ukey_type;                            \
+    static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS;                            \
+   inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
+        R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \
+        return threefry##NxW##_R(ROUNDS, ctr, key);                              \
+    }                                                                   \
+};                                                                      \
+ typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW;       \
+} // namespace r123
+
+_threefryNxWclass_tpl(2x32)
+_threefryNxWclass_tpl(4x32)
+#if R123_USE_64BIT
+_threefryNxWclass_tpl(2x64)
+_threefryNxWclass_tpl(4x64)
+#endif
+
+/* The _tpl macros don't quite work to do string-pasting inside comments.
+   so we just write out the boilerplate documentation four times... */
+
+/** 
+@defgroup ThreefryNxW Threefry Classes and Typedefs
+
+The ThreefryNxW classes export the member functions, typedefs and
+operator overloads required by a @ref CBRNG "CBRNG" class.
+
+As described in  
+<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers:  As Easy as 1, 2, 3</i> </a>, 
+the Threefry family is closely related to the Threefish block cipher from
+<a href="http://www.skein-hash.info/"> Skein Hash Function</a>.  
+Threefry is \b not suitable for cryptographic use.
+
+Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
+
+@class r123::Threefry2x32_R 
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=13 or more for Threefry2x32.
+
+@typedef r123::Threefry2x32
+@ingroup ThreefryNxW
+  Threefry2x32 is equivalent to Threefry2x32_R<20>.    With 20 rounds,
+  Threefry2x32 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+   performance. 
+
+@class r123::Threefry2x64_R 
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+In November 2011, the authors discovered that 13 rounds of
+Threefry2x64 sequenced by strided, interleaved key and counter
+increments failed a very long (longer than the default BigCrush
+length) WeightDistrub test.  At the same time, it was confirmed that
+14 rounds passes much longer tests (up to 5x10^12 samples) of a
+similar nature.  The authors know of no statistical flaws with
+ROUNDS=14 or more for Threefry2x64.
+
+@typedef r123::Threefry2x64
+@ingroup ThreefryNxW
+  Threefry2x64 is equivalent to Threefry2x64_R<20>.    With 20 rounds,
+  Threefry2x64 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+   performance. 
+
+
+
+@class r123::Threefry4x32_R 
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=12 or more for Threefry4x32.
+
+@typedef r123::Threefry4x32
+@ingroup ThreefryNxW
+  Threefry4x32 is equivalent to Threefry4x32_R<20>.    With 20 rounds,
+  Threefry4x32 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+   performance. 
+
+
+
+@class r123::Threefry4x64_R 
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=12 or more for Threefry4x64.
+
+@typedef r123::Threefry4x64
+@ingroup ThreefryNxW
+  Threefry4x64 is equivalent to Threefry4x64_R<20>.    With 20 rounds,
+  Threefry4x64 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+   performance. 
+*/
+
+#endif
+
+#endif
--- a/external/panphasia_ho/uniform_rand_threefry4x64.c
+++ b/external/panphasia_ho/uniform_rand_threefry4x64.c
--- a/include/HDF_IO.hh
+++ b/include/HDF_IO.hh
@ -136,15 +136,13 @@ inline void HDFGetDatasetExtent( const std::string Filename, const std::string O

  int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
  
-  hsize_t *dimsize = new hsize_t[ndims];
+  std::vector<hsize_t> dimsize(ndims,0);

-  H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
+  H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );

  Extent.clear();
  for(int i=0; i<ndims; ++i )
    Extent.push_back( dimsize[i] );
-
-  delete[] dimsize;
 	
  H5Sclose( HDF_DataspaceID );
  H5Dclose( HDF_DatasetID );
@ -323,8 +321,8 @@ inline void HDFReadVectorSelect( const std::string Filename, const std::string O
  //... get space associated with dataset and its extensions
  HDF_DataspaceID = H5Dget_space( HDF_DatasetID );
  int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
-  hsize_t dimsize[ndims];
-  H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
+  std::vector<hsize_t> dimsize(ndims,0);
+  H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );

  hsize_t block[2];
  block[0]  = ii.size();
@ -601,9 +599,9 @@ inline void HDFReadGroupAttribute( const std::string Filename, const std::string

  int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
  
-  hsize_t dimsize[ndims];
+  std::vector<hsize_t> dimsize(ndims,0);

-  H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
+  H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );

  HDF_StorageSize = 1;
  for(int i=0; i<ndims; ++i )
--- a/include/cmake_config.hh.in
+++ b/include/cmake_config.hh.in
@ -1,8 +1,10 @@
 #pragma once

-constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";

 #define USE_PRECISION_${CODE_PRECISION}
+
+#ifdef __cplusplus
+constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
 #if defined(USE_PRECISION_FLOAT)
  constexpr char CMAKE_PRECISION_STR[] = "single";
 #elif defined(USE_PRECISION_DOUBLE)
@ -31,4 +33,6 @@ extern "C"
  extern const char *GIT_TAG;
  extern const char *GIT_REV;
  extern const char *GIT_BRANCH;
-}
+
+}
+#endif // __cplusplus
--- a/include/particle_generator.hh
+++ b/include/particle_generator.hh
@ -216,6 +216,10 @@ namespace particle
                const size_t num_p_in_load = field.local_size();
                const real_t pmeanmass = munit / real_t(field.global_size()* overload);

+                bool bmass_negative = false;
+                auto mean_pm = field.mean() * pmeanmass;
+                auto std_pm  = field.std()  * pmeanmass;
+
                for (int ishift = 0; ishift < (1 << lattice_type); ++ishift)
                {
                    // if we are dealing with the secondary lattice, apply a global shift
@ -237,18 +241,25 @@ namespace particle
                        {
                            for (size_t k = 0; k < field.size(2); ++k)
                            {
-                                if (b64reals)
-                                {
-                                    particles_.set_mass64(ipcount++, pmeanmass * field.relem(i, j, k));
-                                }
-                                else
-                                {
-                                    particles_.set_mass32(ipcount++, pmeanmass * field.relem(i, j, k));
-                                }
+                                // get
+                                const auto pmass = pmeanmass * field.relem(i, j, k);
+
+                                // check for negative mass
+                                bmass_negative |= pmass<0.0;
+
+                                // set
+                                if (b64reals) particles_.set_mass64(ipcount++, pmass);
+                                else particles_.set_mass32(ipcount++, pmass);
                            }
                        }
                    }
                }
+                
+                // diagnostics
+                music::ilog << "Particle Mass :  mean/munit = " << mean_pm/munit  << " ; fractional RMS = " << std_pm / mean_pm * 100.0 << "%" << std::endl;
+                if(std_pm / mean_pm > 0.1 ) music::wlog << "Particle mass perturbation larger than 10%, consider decreasing \n\t  the starting redshift or disabling baryon decaying modes." << std::endl;
+                if(bmass_negative) music::elog << "Negative particle mass produced! Decrease the starting \n\t  redshift or disable baryon decaying modes!" << std::endl;
+
            }else{
                // should not happen
                music::elog << "Cannot have individual particle masses for glasses!" << std::endl;
--- a/src/ic_generator.cc
+++ b/src/ic_generator.cc
@ -591,6 +591,10 @@ int run( config_file& the_config )
                        }
                    }
                }
+                #if defined(USE_MPI)
+                    real_t local_maxdphi = maxdphi;
+                    MPI_Allreduce( &local_maxdphi, &maxdphi, 1, MPI::get_datatype<real_t>(), MPI_MAX, MPI_COMM_WORLD );
+                #endif
                const real_t hbar_safefac = 1.01;
                const real_t hbar = maxdphi / M_PI / Dplus0 * hbar_safefac;
                music::ilog << "Semiclassical PT : hbar = " << hbar << " (limited by initial potential, safety=" << hbar_safefac << ")." << std::endl;
@ -832,6 +836,9 @@ int run( config_file& the_config )
            }

        }
+        
+        music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+        
    }
    return 0;
 }
--- a/src/plugins/PANPHASIA.hh
+++ b/src/plugins/PANPHASIA.hh
@ -0,0 +1,498 @@
+#pragma once
+
+namespace PANPHASIA1
+{
+  const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim;
+  typedef int rand_offset_[5];
+  typedef struct
+  {
+    int state[133]; // Nstore = Nstate (=5) + Nbatch (=128)
+    int need_fill;
+    int pos;
+  } rand_state_;
+
+  typedef struct
+  {
+    int base_state[5], base_lev_start[5][maxdim + 1];
+    rand_offset_ poweroffset[maxpow + 1], superjump;
+    rand_state_ current_state[maxpow + 2];
+
+    int layer_min, layer_max, indep_field;
+
+    long long xorigin_store[2][2][2], yorigin_store[2][2][2], zorigin_store[2][2][2];
+    int lev_common, layer_min_store, layer_max_store;
+    long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store,
+        iy_rel_store, iz_rel_store;
+    double exp_coeffs[8][8][maxdim + 2];
+    long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1];
+    int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2];
+
+    double cell_data[9][8];
+    int ixh_last, iyh_last, izh_last;
+    int init;
+
+    int init_cell_props;
+    int init_lecuyer_state;
+    long long p_xcursor[62], p_ycursor[62], p_zcursor[62];
+
+  } pan_state_;
+
+  extern "C"
+  {
+    void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose);
+
+    void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1,
+                           int16_t *side2, int16_t *side3, int32_t *check_int, char *name);
+
+    void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop);
+
+    void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min,
+                                        int *layer_max, int *indep_field, double *cell_prop);
+
+    void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel,
+                                    long long *iy_rel, long long *iz_rel, int *VERBOSE);
+  }
+
+  struct RNG
+  {
+
+    config_file *pcf_;
+
+    struct panphasia_descriptor
+    {
+      int16_t wn_level_base;
+      int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base;
+      int16_t i_base, i_base_y, i_base_z;
+      int32_t check_rand;
+      std::string name;
+
+      explicit panphasia_descriptor(std::string dstring)
+      {
+        char tmp[100];
+        std::memset(tmp, ' ', 100);
+        parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, &i_yorigin_base, &i_zorigin_base, &i_base,
+                          &i_base_y, &i_base_z, &check_rand, tmp);
+        for (int i = 0; i < 100; i++)
+          if (tmp[i] == ' ')
+          {
+            tmp[i] = '\0';
+            break;
+          }
+        name = tmp;
+        name.erase(std::remove(name.begin(), name.end(), ' '), name.end());
+      }
+    };
+
+    // greatest common divisor
+    int gcd(int a, int b)
+    {
+      if (b == 0)
+        return a;
+      return gcd(b, a % b);
+    }
+
+    // least common multiple
+    int lcm(int a, int b) { return abs(a * b) / gcd(a, b); }
+
+    // Two or largest power of 2 less than the argument
+    int largest_power_two_lte(int b)
+    {
+      int a = 1;
+      if (b <= a)
+        return a;
+      while (2 * a < b)
+        a = 2 * a;
+      return a;
+    }
+
+    std::string descriptor_string_;
+    int num_threads_;
+    int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_;
+    bool incongruent_fields_;
+    double inter_grid_phase_adjustment_;
+    // double translation_phase_;
+    pan_state_ *lstate;
+    int grid_p_;
+    int coordinate_system_shift_[3];
+    int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
+
+    void clear_panphasia_thread_states(void)
+    {
+      for (int i = 0; i < num_threads_; ++i)
+      {
+        lstate[i].init = 0;
+        lstate[i].init_cell_props = 0;
+        lstate[i].init_lecuyer_state = 0;
+      }
+    }
+
+    void initialize_for_grid_structure(void)
+    {
+      // if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
+      ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
+      int ngridminsize_panphasia = pcf_->get_value_safe<int>("random", "PanphasiaMinRootResolution", 512);
+
+      grid_p_ = pdescriptor_->i_base;
+
+      lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0);
+      // lmin
+
+      ngrid_panphasia_ = (1 << lextra_) * grid_p_;
+
+      while (ngrid_panphasia_ < ngridminsize_panphasia)
+      {
+        lextra_++;
+        ngrid_panphasia_ *= 2;
+      }
+      assert(ngrid_panphasia_ >= ngridminsize_panphasia);
+
+      clear_panphasia_thread_states();
+
+      music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)", ngrid_panphasia_, lextra_);
+      if (ngridminsize_panphasia < 512)
+        music::ilog.Print("PANPHASIA WARNING: PanphasiaMinRootResolution = %d below minimum recommended of 512", ngridminsize_panphasia);
+      music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, ngrid_panphasia_);
+
+      coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
+      coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
+      coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
+    }
+
+    std::unique_ptr<panphasia_descriptor> pdescriptor_;
+
+    RNG(config_file *pcf)
+        : pcf_(pcf)
+    {
+      descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
+
+#ifdef _OPENMP
+      num_threads_ = omp_get_max_threads();
+#else
+      num_threads_ = 1;
+#endif
+
+      // create independent state descriptions for each thread
+      lstate = new pan_state_[num_threads_];
+
+      // parse the descriptor for its properties
+      pdescriptor_ = std::make_unique<panphasia_descriptor>(descriptor_string_);
+
+      music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base);
+
+      // write panphasia base size into config file for the grid construction
+      // as the gridding unit we use the least common multiple of 2 and i_base
+      std::stringstream ss;
+      //ARJ  ss << lcm(2, pdescriptor_->i_base);
+      //ss <<  two_or_largest_power_two_less_than(pdescriptor_->i_base);//ARJ
+      ss << 2; //ARJ - set gridding unit to two
+      pcf_->insert_value("setup", "gridding_unit", ss.str());
+      ss.str(std::string());
+      ss << pdescriptor_->i_base;
+      pcf_->insert_value("random", "base_unit", ss.str());
+
+      this->initialize_for_grid_structure();
+    }
+
+    ~RNG() { delete[] lstate; }
+
+    void Fill(Grid_FFT<real_t> &g)
+    {
+      auto sinc = [](real_t x)
+      { return (std::fabs(x) > 1e-16) ? std::sin(x) / x : 1.0; };
+      auto dsinc = [](real_t x)
+      { return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
+      const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
+
+      // we will overwrite 'g', we can deallocate it while we prepare the panphasia field
+      g.reset();
+
+      clear_panphasia_thread_states();
+
+      // temporaries
+      Grid_FFT<real_t> g0({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
+      Grid_FFT<real_t> g1({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
+      Grid_FFT<real_t> g2({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
+      Grid_FFT<real_t> g3({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
+      Grid_FFT<real_t> g4({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
+
+      double t1 = get_wtime();
+      // double tp = t1;
+
+#pragma omp parallel
+      {
+#ifdef _OPENMP
+        const int mythread = omp_get_thread_num();
+#else
+        const int mythread = 0;
+#endif
+
+        //int odd_x, odd_y, odd_z;
+        //int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level
+
+        int verbosity = (mythread == 0);
+        char descriptor[100];
+        std::memset(descriptor, 0, 100);
+        std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
+
+        start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
+
+        {
+          panphasia_descriptor d(descriptor_string_);
+
+          int level_p = d.wn_level_base + lextra_;
+
+          lstate[mythread].layer_min = 0;
+          lstate[mythread].layer_max = level_p;
+          lstate[mythread].indep_field = 1;
+
+          long long ix_rel[3];
+          ix_rel[0] = 0; //ileft_corner_p[0];
+          ix_rel[1] = 0; //ileft_corner_p[1];
+          ix_rel[2] = 0; //ileft_corner_p[2];
+
+          set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
+                                     &verbosity);
+        }
+
+        if (verbosity)
+          t1 = get_wtime();
+
+        std::array<double, 9> cell_prop;
+        pan_state_ *ps = &lstate[mythread];
+
+#pragma omp for //nowait
+        for (size_t i = 0; i < g0.size(0); i += 2)
+        {
+          const int ixmax(std::min<int>(2, g0.size(0) - i));
+          for (size_t j = 0; j < g0.size(1); j += 2)
+          {
+            const int iymax(std::min<int>(2, g0.size(1) - j));
+            for (size_t k = 0; k < g0.size(2); k += 2)
+            {
+              const int izmax(std::min<int>(2, g0.size(2) - k));
+
+              // ARJ - added inner set of loops to speed up evaluation of Panphasia
+              for (int ix = 0; ix < ixmax; ++ix)
+              {
+                for (int iy = 0; iy < iymax; ++iy)
+                {
+                  for (int iz = 0; iz < izmax; ++iz)
+                  {
+                    int ilocal = i + ix;
+                    int jlocal = j + iy;
+                    int klocal = k + iz;
+
+                    int iglobal = ilocal + g0.local_0_start_;
+                    int jglobal = jlocal;
+                    int kglobal = klocal;
+
+                    adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
+                                                   &ps->layer_max, &ps->indep_field, &cell_prop[0]);
+
+                    g0.relem(ilocal, jlocal, klocal) = cell_prop[0];
+                    g1.relem(ilocal, jlocal, klocal) = cell_prop[4];
+                    g2.relem(ilocal, jlocal, klocal) = cell_prop[2];
+                    g3.relem(ilocal, jlocal, klocal) = cell_prop[1];
+                    g4.relem(ilocal, jlocal, klocal) = cell_prop[8];
+                  }
+                }
+              }
+            }
+          }
+        }
+      } // end omp parallel region
+
+      g0.FourierTransformForward();
+      g1.FourierTransformForward();
+      g2.FourierTransformForward();
+      g3.FourierTransformForward();
+      g4.FourierTransformForward();
+
+#pragma omp parallel for
+      for (size_t i = 0; i < g0.size(0); i++)
+      {
+        for (size_t j = 0; j < g0.size(1); j++)
+        {
+          for (size_t k = 0; k < g0.size(2); k++)
+          {
+            if (!g0.is_nyquist_mode(i, j, k))
+            {
+              auto kvec = g0.get_k<real_t>(i, j, k);
+
+              auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
+              auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
+              auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
+
+              auto fx = real_t(sinc(argx));
+              auto gx = ccomplex_t(0.0, dsinc(argx));
+              auto fy = real_t(sinc(argy));
+              auto gy = ccomplex_t(0.0, dsinc(argy));
+              auto fz = real_t(sinc(argz));
+              auto gz = ccomplex_t(0.0, dsinc(argz));
+
+              auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz);
+              auto magnitude = real_t(std::sqrt(1.0 - std::fabs(temp * temp)));
+
+              auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
+
+              g0.kelem(i, j, k) = y0 * fx * fy * fz + sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz) + y4 * magnitude;
+            }
+            else
+            {
+              g0.kelem(i, j, k) = 0.0;
+            }
+          }
+        }
+      }
+
+      // music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp);
+      // tp = get_wtime();
+
+      g1.FourierTransformBackward(false);
+      g2.FourierTransformBackward(false);
+      g3.FourierTransformBackward(false);
+      g4.FourierTransformBackward(false);
+
+#pragma omp parallel
+      {
+#ifdef _OPENMP
+        const int mythread = omp_get_thread_num();
+#else
+        const int mythread = 0;
+#endif
+
+        // int odd_x, odd_y, odd_z;
+        int verbosity = (mythread == 0);
+        char descriptor[100];
+        std::memset(descriptor, 0, 100);
+        std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
+
+        start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
+
+        {
+          panphasia_descriptor d(descriptor_string_);
+
+          int level_p = d.wn_level_base + lextra_;
+
+          lstate[mythread].layer_min = 0;
+          lstate[mythread].layer_max = level_p;
+          lstate[mythread].indep_field = 1;
+
+          long long ix_rel[3];
+          ix_rel[0] = 0; //ileft_corner_p[0];
+          ix_rel[1] = 0; //ileft_corner_p[1];
+          ix_rel[2] = 0; //ileft_corner_p[2];
+
+          set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
+                                     &verbosity);
+        }
+
+        if (verbosity)
+          t1 = get_wtime();
+
+        //***************************************************************
+        // Process Panphasia values: p110, p011, p101, p111
+        //****************************************************************
+        std::array<double, 9> cell_prop;
+        pan_state_ *ps = &lstate[mythread];
+
+#pragma omp for //nowait
+        for (size_t i = 0; i < g1.size(0); i += 2)
+        {
+          const int ixmax(std::min<int>(2, g1.size(0) - i));
+          for (size_t j = 0; j < g1.size(1); j += 2)
+          {
+            const int iymax(std::min<int>(2, g1.size(1) - j));
+            for (size_t k = 0; k < g1.size(2); k += 2)
+            {
+              const int izmax(std::min<int>(2, g1.size(2) - k));
+
+              // ARJ - added inner set of loops to speed up evaluation of Panphasia
+              for (int ix = 0; ix < ixmax; ++ix)
+              {
+                for (int iy = 0; iy < iymax; ++iy)
+                {
+                  for (int iz = 0; iz < izmax; ++iz)
+                  {
+                    int ilocal = i + ix;
+                    int jlocal = j + iy;
+                    int klocal = k + iz;
+
+                    int iglobal = ilocal + g1.local_0_start_;
+                    int jglobal = jlocal;
+                    int kglobal = klocal;
+
+                    adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
+                                                   &ps->layer_max, &ps->indep_field, &cell_prop[0]);
+
+                    g1.relem(ilocal, jlocal, klocal) = cell_prop[6];
+                    g2.relem(ilocal, jlocal, klocal) = cell_prop[3];
+                    g3.relem(ilocal, jlocal, klocal) = cell_prop[5];
+                    g4.relem(ilocal, jlocal, klocal) = cell_prop[7];
+                  }
+                }
+              }
+            }
+          }
+        }
+      } // end omp parallel region
+
+      // music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp);
+      // tp = get_wtime();
+
+      /////////////////////////////////////////////////////////////////////////
+      // transform and convolve with Legendres
+      g1.FourierTransformForward();
+      g2.FourierTransformForward();
+      g3.FourierTransformForward();
+      g4.FourierTransformForward();
+
+#pragma omp parallel for
+      for (size_t i = 0; i < g0.size(0); i++)
+      {
+        for (size_t j = 0; j < g0.size(1); j++)
+        {
+          for (size_t k = 0; k < g0.size(2); k++)
+          {
+            if (!g0.is_nyquist_mode(i, j, k))
+            {
+              auto kvec = g0.get_k<real_t>(i, j, k);
+
+              auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
+              auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
+              auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
+
+              auto fx = real_t(sinc(argx));
+              auto gx = ccomplex_t(0.0, dsinc(argx));
+              auto fy = real_t(sinc(argy));
+              auto gy = ccomplex_t(0.0, dsinc(argy));
+              auto fz = real_t(sinc(argz));
+              auto gz = ccomplex_t(0.0, dsinc(argz));
+
+              auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
+
+              g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
+
+              // do final phase shift to account for corner centered coordinates vs. cell centers
+              auto phase_shift = -0.5 * M_PI * (kvec[0] / g0.kny_[0] + kvec[1] / g0.kny_[1] + kvec[2] / g0.kny_[2]);
+
+              g0.kelem(i, j, k) *= std::exp(ccomplex_t(0, phase_shift));
+            }
+          }
+        }
+      }
+
+      g1.reset();
+      g2.reset();
+      g3.reset();
+      g4.reset();
+
+      g.allocate();
+      g0.FourierInterpolateCopyTo(g);
+
+      music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
+                        1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
+      music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std());
+    }
+  };
+};
--- a/src/plugins/output_grafic2.cc
+++ b/src/plugins/output_grafic2.cc
@ -51,7 +51,7 @@ protected:
    real_t lunit_, vunit_, munit_, omegab_;
    uint32_t levelmin_;
    bool bhavebaryons_;
-    std::vector<float> data_buf_;
+    std::vector<float> data_buf_, data_buf_write_;
    std::string dirname_;
    bool bUseSPT_;

@ -207,52 +207,59 @@ void grafic2_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos
    std::string file_name = this->get_file_name(s, c);

    // serialize parallel write
-    for (int write_rank = 0; write_rank < CONFIG::MPI_task_size; ++write_rank)
+    if (CONFIG::MPI_task_rank == 0)
    {
-        if (write_rank == CONFIG::MPI_task_rank)
+        unlink(file_name.c_str());
+    }
+
+    std::ofstream *pofs;
+
+    // write header or seek to end of file
+    if (CONFIG::MPI_task_rank == 0)
+    {
+        pofs = new std::ofstream(file_name.c_str(), std::ios::binary|std::ios::app);
+        uint32_t blocksz = sizeof(header);
+        pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
+        pofs->write(reinterpret_cast<const char *>(&header_), blocksz);
+        pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
+    }
+
+    // check field size against buffer size...
+    uint32_t ngrid = cf_.get_value<int>("setup", "GridRes");
+    assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid);
+    assert( g.size(1) == ngrid && g.size(2) == ngrid);
+    // write actual field slice by slice
+    // std::cerr << write_rank << ">" << g.size(0) << " " << g.size(1) << " " << g.size(2) << std::endl;
+    for (size_t i = 0; i < g.size(2); ++i)
+    {   
+        data_buf_.assign(ngrid * ngrid, 0.0f);
+
+        for (unsigned j = 0; j < g.size(1); ++j)
        {
-            if (write_rank == 0)
+            for (unsigned k = 0; k < g.size(0); ++k)
            {
-                unlink(file_name.c_str());
+                data_buf_[j * ngrid + (k+g.local_0_start_)] = g.relem(k, j, i);
            }
-            std::ofstream ofs(file_name.c_str(), std::ios::binary|std::ios::app);
-
-            // write header or seek to end of file
-            if (write_rank == 0)
-            {
-                uint32_t blocksz = sizeof(header);
-                ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
-                ofs.write(reinterpret_cast<const char *>(&header_), blocksz);
-                ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
-            }
-
-            // check field size against buffer size...
-            uint32_t ngrid = cf_.get_value<int>("setup", "GridRes");
-            assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid);
-            assert( g.size(1) == ngrid && g.size(2) == ngrid);
-            // write actual field slice by slice
-            for (size_t i = 0; i < g.size(2); ++i)
-            {
-                for (unsigned j = 0; j < g.size(1); ++j)
-                {
-                    for (unsigned k = 0; k < g.size(0); ++k)
-                    {
-                        data_buf_[j * ngrid + k] = g.relem(k, j, i);
-                    }
-                }
-
-                uint32_t blocksz = ngrid * ngrid * sizeof(float);
-                ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
-                ofs.write(reinterpret_cast<const char *>(&data_buf_[0]), blocksz);
-                ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
-            }
-
-            ofs.close();
        }
+#if defined(USE_MPI)
+        if( CONFIG::MPI_task_rank == 0 ) data_buf_write_.assign(ngrid*ngrid,0.0f);
+        MPI_Reduce( &data_buf_[0], &data_buf_write_[0], ngrid*ngrid, MPI::get_datatype<float>(), MPI_SUM, 0, MPI_COMM_WORLD );
+        if( CONFIG::MPI_task_rank == 0 ) data_buf_.swap(data_buf_write_);
+#endif

-        multitask_sync_barrier();
+        if( CONFIG::MPI_task_rank == 0 )
+        {
+            uint32_t blocksz = ngrid * ngrid * sizeof(float);
+            pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
+            pofs->write(reinterpret_cast<const char *>(&data_buf_[0]), blocksz);
+            pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
+        }
+    }

-    } // end loop over write_rank
+    if( CONFIG::MPI_task_rank == 0 ){
+        pofs->close();
+        delete pofs;
+    }

    music::ilog << interface_name_ << " : Wrote field to file \'" << file_name << "\'" << std::endl;
 }
--- a/src/plugins/random_panphasia.cc
+++ b/src/plugins/random_panphasia.cc
@ -1,18 +1,18 @@
 // This file is part of monofonIC (MUSIC2)
 // A software package to generate ICs for cosmological simulations
-// Copyright (C) 2020 by Oliver Hahn and Adrian Jenkins (this file)
+// Copyright (C) 2021 by Oliver Hahn and Adrian Jenkins (this file)
 // but see distinct licensing for PANPHASIA below
-// 
+//
 // monofonIC is free software: you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation, either version 3 of the License, or
 // (at your option) any later version.
-// 
+//
 // monofonIC is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
-// 
+//
 // You should have received a copy of the GNU General Public License
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 //
@ -39,108 +39,16 @@

 #include <grid_fft.hh>

-const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim;
-typedef int rand_offset_[5];
-typedef struct
+namespace PANPHASIA2
 {
-  int state[133]; // Nstore = Nstate (=5) + Nbatch (=128)
-  int need_fill;
-  int pos;
-} rand_state_;
-
-/* pan_state_ struct -- corresponds to respective fortran module in panphasia_routines.f
- * data structure that contains all panphasia state variables
- * it needs to get passed between the fortran routines to enable
- * thread-safe execution.
- */
-typedef struct
-{
-  int base_state[5], base_lev_start[5][maxdim + 1];
-  rand_offset_ poweroffset[maxpow + 1], superjump;
-  rand_state_ current_state[maxpow + 2];
-
-  int layer_min, layer_max, indep_field;
-
-  long long xorigin_store[2][2][2], yorigin_store[2][2][2], zorigin_store[2][2][2];
-  int lev_common, layer_min_store, layer_max_store;
-  long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store,
-      iy_rel_store, iz_rel_store;
-  double exp_coeffs[8][8][maxdim + 2];
-  long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1];
-  int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2];
-
-  double cell_data[9][8];
-  int ixh_last, iyh_last, izh_last;
-  int init;
-
-  int init_cell_props;
-  int init_lecuyer_state;
-  long long p_xcursor[62], p_ycursor[62], p_zcursor[62];
-
-} pan_state_;
-
-extern "C"
-{
-  void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose);
-
-  void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1,
-                         int16_t *side2, int16_t *side3, int32_t *check_int, char *name);
-
-  void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop);
-
-  void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min,
-                                      int *layer_max, int *indep_field, double *cell_prop);
-
-  void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel,
-                                  long long *iy_rel, long long *iz_rel, int *VERBOSE);
-}
-
-struct panphasia_descriptor
-{
-  int16_t wn_level_base;
-  int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base;
-  int16_t i_base, i_base_y, i_base_z;
-  int32_t check_rand;
-  std::string name;
-
-  explicit panphasia_descriptor(std::string dstring)
+  extern "C"
  {
-    char tmp[100];
-    std::memset(tmp, ' ', 100);
-    parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, &i_yorigin_base, &i_zorigin_base, &i_base,
-                      &i_base_y, &i_base_z, &check_rand, tmp);
-    for (int i = 0; i < 100; i++)
-      if (tmp[i] == ' ')
-      {
-        tmp[i] = '\0';
-        break;
-      }
-    name = tmp;
-    name.erase(std::remove(name.begin(), name.end(), ' '), name.end());
+#include <panphasia_ho/panphasia_functions.h>
+    extern size_t descriptor_base_size;
  }
-};
-
-// greatest common divisor
-int gcd(int a, int b)
-{
-  if (b == 0)
-    return a;
-  return gcd(b, a % b);
 }

-// least common multiple
-int lcm(int a, int b) { return abs(a * b) / gcd(a, b); }
-
-// Two or largest power of 2 less than the argument
-int largest_power_two_lte(int b)
-{
-  int a = 1;
-  if (b <= a)
-    return a;
-  while (2 * a < b)
-    a = 2 * a;
-  return a;
-}
+#include "PANPHASIA.hh"

 class RNG_panphasia : public RNG_plugin
 {
@ -148,63 +56,15 @@ private:
 protected:
  std::string descriptor_string_;
  int num_threads_;
-  int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_;
-  bool incongruent_fields_;
-  double inter_grid_phase_adjustment_;
-  // double translation_phase_;
-  pan_state_ *lstate;
-  int grid_p_;
-  int coordinate_system_shift_[3];
-  int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
+  int panphasia_mode_;
+  size_t grid_res_;
+  real_t boxlength_;

-  void clear_panphasia_thread_states(void)
-  {
-    for (int i = 0; i < num_threads_; ++i)
-    {
-      lstate[i].init = 0;
-      lstate[i].init_cell_props = 0;
-      lstate[i].init_lecuyer_state = 0;
-    }
-  }
-
-  void initialize_for_grid_structure(void)
-  {
-    // if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
-    ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
-    int ngridminsize_panphasia = pcf_->get_value_safe<int>("random", "PanphasiaMinRootResolution",512);
-
-    grid_p_ = pdescriptor_->i_base;
-    
-    lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0);
-    // lmin
-
-    ngrid_panphasia_ = (1 << lextra_) * grid_p_;
-
-    while( ngrid_panphasia_ < ngridminsize_panphasia ){
-      lextra_++;
-      ngrid_panphasia_*=2;
-    }
-    assert( ngrid_panphasia_ >= ngridminsize_panphasia);
-
-   
-    clear_panphasia_thread_states();
-
-    music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)",ngrid_panphasia_, lextra_);
-    if (ngridminsize_panphasia<512) 
-        music::ilog.Print("PANPHASIA WARNING: PanphasiaMinRootResolution = %d below minimum recommended of 512",ngridminsize_panphasia);
-    music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, ngrid_panphasia_ );
-
-    coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
-    coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
-    coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
-  }
-
-  std::unique_ptr<panphasia_descriptor> pdescriptor_;
+  PANPHASIA1::RNG *ppan1_rng_;

 public:
  explicit RNG_panphasia(config_file &cf) : RNG_plugin(cf)
  {
-    descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");

 #ifdef _OPENMP
    num_threads_ = omp_get_max_threads();
@ -212,336 +72,120 @@ public:
    num_threads_ = 1;
 #endif

-    // create independent state descriptions for each thread
-    lstate = new pan_state_[num_threads_];
+    descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
+    grid_res_ = pcf_->get_value<size_t>("setup", "GridRes");
+    boxlength_ = pcf_->get_value<real_t>("setup", "BoxLength");

-    // parse the descriptor for its properties
-    pdescriptor_ = std::make_unique<panphasia_descriptor>(descriptor_string_);
+    if( pcf_->get_value<bool>("setup", "DoFixing") ){
+      music::flog << "Fixing all the modes to the mean power negates any advantage of using the Panphasia field.\n";
+      music::flog << "With the new panphasia ho it is possible by choosing the descriptor to fix the largest modes without losing the ability to resimulate to much higher resolution.\n";
+      throw std::runtime_error("PANPHASIA: incompatible parameter.");
+    }

-    music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base);
+    panphasia_mode_ = 0;
+    PANPHASIA2::parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_);

-    // write panphasia base size into config file for the grid construction
-    // as the gridding unit we use the least common multiple of 2 and i_base
-    std::stringstream ss;
-    //ARJ  ss << lcm(2, pdescriptor_->i_base);
-    //ss <<  two_or_largest_power_two_less_than(pdescriptor_->i_base);//ARJ
-    ss << 2; //ARJ - set gridding unit to two
-    pcf_->insert_value("setup", "gridding_unit", ss.str());
-    ss.str(std::string());
-    ss << pdescriptor_->i_base;
-    pcf_->insert_value("random", "base_unit", ss.str());
-
-    this->initialize_for_grid_structure();
+    if (panphasia_mode_ == 0)
+    {
+      ppan1_rng_ = new PANPHASIA1::RNG(&cf);
+    }
  }

-  ~RNG_panphasia() { delete[] lstate; }
+  ~RNG_panphasia()
+  {
+    if (panphasia_mode_ == 0)
+    {
+      delete ppan1_rng_;
+    }
+  }

  bool isMultiscale() const { return true; }

+  void Run_Panphasia_Highorder(Grid_FFT<real_t> &g);
+
  void Fill_Grid(Grid_FFT<real_t> &g)
  {
-    auto sinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? std::sin(x) / x : 1.0; };
-    auto dsinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
-    const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
-
-    // we will overwrite 'g', we can deallocate it while we prepare the panphasia field
-    g.reset();
-
-    clear_panphasia_thread_states();
-    
-    // temporaries
-    Grid_FFT<real_t> g0({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
-    Grid_FFT<real_t> g1({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
-    Grid_FFT<real_t> g2({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
-    Grid_FFT<real_t> g3({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
-    Grid_FFT<real_t> g4({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
-
-    double t1 = get_wtime();
-    // double tp = t1;
-
-#pragma omp parallel
+    switch (panphasia_mode_)
    {
-#ifdef _OPENMP
-      const int mythread = omp_get_thread_num();
-#else
-      const int mythread = 0;
-#endif

-      //int odd_x, odd_y, odd_z;
-      //int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level
+    case 0: // old mode
+      music::ilog << "PANPHASIA: Old descriptor" << std::endl;
+      ppan1_rng_->Fill(g);
+      break;

-      int verbosity = (mythread == 0);
-      char descriptor[100];
-      std::memset(descriptor, 0, 100);
-      std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
+    case 1: // PANPHASIA HO descriptor
+      music::ilog << "PANPHASIA: New descriptor" << std::endl;
+      this->Run_Panphasia_Highorder(g);
+      break;

-      start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
-
-      {
-        panphasia_descriptor d(descriptor_string_);
-
-        int level_p = d.wn_level_base + lextra_;
-
-        lstate[mythread].layer_min = 0;
-        lstate[mythread].layer_max = level_p;
-        lstate[mythread].indep_field = 1;
-
-        long long ix_rel[3];
-        ix_rel[0] = 0; //ileft_corner_p[0];
-        ix_rel[1] = 0; //ileft_corner_p[1];
-        ix_rel[2] = 0; //ileft_corner_p[2];
-
-        set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
-                                   &verbosity);
-      }
-
-      if (verbosity)
-        t1 = get_wtime();
-
-      std::array<double, 9> cell_prop;
-      pan_state_ *ps = &lstate[mythread];
-
-#pragma omp for //nowait
-      for (size_t i = 0; i < g0.size(0); i += 2)
-      {
-        const int ixmax(std::min<int>(2,g0.size(0)-i));
-        for (size_t j = 0; j < g0.size(1); j += 2)
-        {
-          const int iymax(std::min<int>(2,g0.size(1)-j));
-          for (size_t k = 0; k < g0.size(2); k += 2)
-          {
-            const int izmax(std::min<int>(2,g0.size(2)-k));
-
-            // ARJ - added inner set of loops to speed up evaluation of Panphasia
-            for (int ix = 0; ix < ixmax; ++ix)
-            {
-              for (int iy = 0; iy < iymax; ++iy)
-              {   
-                for (int iz = 0; iz < izmax; ++iz)
-                {
-                  int ilocal = i + ix;
-                  int jlocal = j + iy;
-                  int klocal = k + iz;
-
-                  int iglobal = ilocal + g0.local_0_start_;
-                  int jglobal = jlocal;
-                  int kglobal = klocal;
-
-                  adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
-                                                 &ps->layer_max, &ps->indep_field, &cell_prop[0]);
-
-                  g0.relem(ilocal, jlocal, klocal) = cell_prop[0];
-                  g1.relem(ilocal, jlocal, klocal) = cell_prop[4];
-                  g2.relem(ilocal, jlocal, klocal) = cell_prop[2];
-                  g3.relem(ilocal, jlocal, klocal) = cell_prop[1];
-                  g4.relem(ilocal, jlocal, klocal) = cell_prop[8];
-                }
-              }
-            }
-          }
-        }
-      }
-    } // end omp parallel region
-
-    g0.FourierTransformForward();
-    g1.FourierTransformForward();
-    g2.FourierTransformForward();
-    g3.FourierTransformForward();
-    g4.FourierTransformForward();
-
-#pragma omp parallel for
-    for (size_t i = 0; i < g0.size(0); i++)
-    {
-      for (size_t j = 0; j < g0.size(1); j++)
-      {
-        for (size_t k = 0; k < g0.size(2); k++)
-        {
-          if (!g0.is_nyquist_mode(i, j, k))
-          {
-            auto kvec = g0.get_k<real_t>(i, j, k);
-
-            auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
-            auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
-            auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
-
-            auto fx = real_t(sinc(argx));
-            auto gx = ccomplex_t(0.0, dsinc(argx));
-            auto fy = real_t(sinc(argy));
-            auto gy = ccomplex_t(0.0, dsinc(argy));
-            auto fz = real_t(sinc(argz));
-            auto gz = ccomplex_t(0.0, dsinc(argz));
-
-            auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz);
-            auto magnitude = real_t(std::sqrt(1.0 - std::fabs(temp * temp)));
-
-            auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
-
-            g0.kelem(i, j, k) = y0 * fx * fy * fz 
-                              + sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz) 
-                              + y4 * magnitude;
-          }
-          else
-          {
-            g0.kelem(i, j, k) = 0.0;
-          }
-        }
-      }
+    default: // unknown PANPHASIA mode
+      music::elog << "PANPHASIA: Something went wrong with descriptor" << std::endl;
+      abort();
+      break;
    }
-
-    // music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp);
-    // tp = get_wtime();
-
-    g1.FourierTransformBackward(false);
-    g2.FourierTransformBackward(false);
-    g3.FourierTransformBackward(false);
-    g4.FourierTransformBackward(false);
-
-#pragma omp parallel
-    {
-#ifdef _OPENMP
-      const int mythread = omp_get_thread_num();
-#else
-      const int mythread = 0;
-#endif
-
-      // int odd_x, odd_y, odd_z;
-      int verbosity = (mythread == 0);
-      char descriptor[100];
-      std::memset(descriptor, 0, 100);
-      std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
-
-      start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
-
-      {
-        panphasia_descriptor d(descriptor_string_);
-
-        int level_p = d.wn_level_base + lextra_;
-
-        lstate[mythread].layer_min = 0;
-        lstate[mythread].layer_max = level_p;
-        lstate[mythread].indep_field = 1;
-
-        long long ix_rel[3];
-        ix_rel[0] = 0; //ileft_corner_p[0];
-        ix_rel[1] = 0; //ileft_corner_p[1];
-        ix_rel[2] = 0; //ileft_corner_p[2];
-
-        set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
-                                   &verbosity);
-      }
-
-      if (verbosity)
-        t1 = get_wtime();
-
-      //***************************************************************
-      // Process Panphasia values: p110, p011, p101, p111
-      //****************************************************************
-      std::array<double,9> cell_prop;
-      pan_state_ *ps = &lstate[mythread];
-
-#pragma omp for //nowait
-      for (size_t i = 0; i < g1.size(0); i += 2)
-      {
-        const int ixmax(std::min<int>(2,g1.size(0)-i));
-        for (size_t j = 0; j < g1.size(1); j += 2)
-        {
-          const int iymax(std::min<int>(2,g1.size(1)-j));
-          for (size_t k = 0; k < g1.size(2); k += 2)
-          {
-            const int izmax(std::min<int>(2,g1.size(2)-k));
-
-            // ARJ - added inner set of loops to speed up evaluation of Panphasia
-            for (int ix = 0; ix < ixmax; ++ix)
-            {
-              for (int iy = 0; iy < iymax; ++iy)
-              {   
-                for (int iz = 0; iz < izmax; ++iz)
-                {
-                  int ilocal = i + ix;
-                  int jlocal = j + iy;
-                  int klocal = k + iz;
-
-                  int iglobal = ilocal + g1.local_0_start_;
-                  int jglobal = jlocal;
-                  int kglobal = klocal;
-
-                  adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
-                                                 &ps->layer_max, &ps->indep_field, &cell_prop[0]);
-
-                  g1.relem(ilocal, jlocal, klocal) = cell_prop[6];
-                  g2.relem(ilocal, jlocal, klocal) = cell_prop[3];
-                  g3.relem(ilocal, jlocal, klocal) = cell_prop[5];
-                  g4.relem(ilocal, jlocal, klocal) = cell_prop[7];
-                }
-              }
-            }
-          }
-        }
-      }
-    } // end omp parallel region
-
-    // music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp);
-    // tp = get_wtime();
-
-    /////////////////////////////////////////////////////////////////////////
-    // transform and convolve with Legendres
-    g1.FourierTransformForward();
-    g2.FourierTransformForward();
-    g3.FourierTransformForward();
-    g4.FourierTransformForward();
-
-    #pragma omp parallel for 
-    for (size_t i = 0; i < g0.size(0); i++)
-    {
-      for (size_t j = 0; j < g0.size(1); j++)
-      {
-        for (size_t k = 0; k < g0.size(2); k++)
-        {
-          if (!g0.is_nyquist_mode(i, j, k))
-          {
-            auto kvec = g0.get_k<real_t>(i, j, k);
-
-            auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
-            auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
-            auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
-
-            auto fx = real_t(sinc(argx));
-            auto gx = ccomplex_t(0.0, dsinc(argx));
-            auto fy = real_t(sinc(argy));
-            auto gy = ccomplex_t(0.0, dsinc(argy));
-            auto fz = real_t(sinc(argz));
-            auto gz = ccomplex_t(0.0, dsinc(argz));
-
-            auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
-
-            g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
-
-            // do final phase shift to account for corner centered coordinates vs. cell centers
-            auto phase_shift = - 0.5 * M_PI * ( kvec[0] / g0.kny_[0]
-                       + kvec[1] /g0.kny_[1] + kvec[2] / g0.kny_[2]);
-
-            g0.kelem(i, j, k) *= std::exp( ccomplex_t(0,phase_shift) );
-          }
-        }
-      }
-    }
-
-    g1.reset();
-    g2.reset();
-    g3.reset();
-    g4.reset();
-
-    g.allocate();
-    g0.FourierInterpolateCopyTo( g );
-
-    music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
-                          1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
-    music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std());
  }
 };

+void RNG_panphasia::Run_Panphasia_Highorder(Grid_FFT<real_t> &g)
+{
+  int verbose = 0;
+  int error;
+  size_t x0 = 0, y0 = 0, z0 = 0;
+  size_t rel_level;
+  int fdim = 2; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid
+
+  //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
+
+  PANPHASIA2::PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose);
+
+  printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_);
+
+  // Choose smallest value of level to equal of exceed grid_res_)
+
+  for (rel_level = 0; fdim * (PANPHASIA2::descriptor_base_size <<rel_level) <  grid_res_; rel_level++);
+
+  printf("Setting relative level = %lu\n", rel_level);
+
+  if ((error = PANPHASIA2::PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose)))
+  {
+    printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
+    abort();
+  };
+
+  //======================= FFTW ==============================
+
+  ptrdiff_t alloc_local, local_n0, local_0_start;
+
+  size_t N0 = fdim * (PANPHASIA2::descriptor_base_size << rel_level);
+
+  alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 + 2, MPI_COMM_WORLD, &local_n0, &local_0_start);
+
+  Grid_FFT<real_t> pan_grid({{N0, N0, N0}}, {{boxlength_, boxlength_, boxlength_}});
+
+  _unused(alloc_local);
+
+  assert(pan_grid.n_[0] == N0);
+  assert(pan_grid.n_[1] == N0);
+  assert(pan_grid.n_[2] == N0);
+  assert(pan_grid.local_0_start_ == local_0_start);
+  assert(pan_grid.local_0_size_ == local_n0);
+  assert(pan_grid.ntot_ == size_t(alloc_local));
+
+  pan_grid.FourierTransformForward(false);
+
+  FFTW_COMPLEX *Panphasia_White_Noise_Field = reinterpret_cast<FFTW_COMPLEX *>(&pan_grid.data_[0]);
+
+  // Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
+
+  if ((error = PANPHASIA2::PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field)))
+  {
+    music::elog << "Error code from PANPHASIA_compute ...  (ErrCode = " << error << ")" << std::endl;
+  };
+
+  pan_grid.FourierInterpolateCopyTo(g);
+}
 namespace
 {
  RNG_plugin_creator_concrete<RNG_panphasia> creator("PANPHASIA");
 }
-#endif // defined(USE_PANPHASIA)
+#endif // defined(USE_PANPHASIA)