From b567ba8e68f038fee215c0c71d6b6d10e49b20b1 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Sun, 2 May 2021 22:05:18 +0200 Subject: [PATCH 01/25] first commit PANPHASIA_HO --- external/panphasia_ho/PAN_FFTW3.h | 44 + external/panphasia_ho/README | 143 ++ external/panphasia_ho/array.h | 326 +++ external/panphasia_ho/c7_script_threaded | 33 + .../panphasia_ho/features/clangfeatures.h | 93 + .../panphasia_ho/features/compilerfeatures.h | 343 +++ external/panphasia_ho/features/gccfeatures.h | 263 +++ external/panphasia_ho/features/iccfeatures.h | 212 ++ .../panphasia_ho/features/metalfeatures.h | 111 + external/panphasia_ho/features/msvcfeatures.h | 200 ++ external/panphasia_ho/features/nvccfeatures.h | 125 ++ .../panphasia_ho/features/open64features.h | 50 + .../panphasia_ho/features/openclfeatures.h | 89 + external/panphasia_ho/features/pgccfeatures.h | 194 ++ external/panphasia_ho/features/sse.h | 280 +++ .../panphasia_ho/features/sunprofeatures.h | 172 ++ external/panphasia_ho/features/xlcfeatures.h | 210 ++ .../high_order_panphasia_routines.c | 1863 +++++++++++++++++ external/panphasia_ho/main.c | 97 + external/panphasia_ho/makefile | 22 + external/panphasia_ho/pan_matrices_order6.h | 1737 +++++++++++++++ external/panphasia_ho/pan_mpi_routines.c | 344 +++ external/panphasia_ho/panphasia_functions.h | 93 + external/panphasia_ho/threefry.h | 874 ++++++++ .../panphasia_ho/uniform_rand_threefry4x64.c | 1033 +++++++++ 25 files changed, 8951 insertions(+) create mode 100644 external/panphasia_ho/PAN_FFTW3.h create mode 100644 external/panphasia_ho/README create mode 100644 external/panphasia_ho/array.h create mode 100755 external/panphasia_ho/c7_script_threaded create mode 100644 external/panphasia_ho/features/clangfeatures.h create mode 100644 external/panphasia_ho/features/compilerfeatures.h create mode 100644 external/panphasia_ho/features/gccfeatures.h create mode 100644 external/panphasia_ho/features/iccfeatures.h create mode 100644 
external/panphasia_ho/features/metalfeatures.h create mode 100644 external/panphasia_ho/features/msvcfeatures.h create mode 100644 external/panphasia_ho/features/nvccfeatures.h create mode 100644 external/panphasia_ho/features/open64features.h create mode 100644 external/panphasia_ho/features/openclfeatures.h create mode 100644 external/panphasia_ho/features/pgccfeatures.h create mode 100644 external/panphasia_ho/features/sse.h create mode 100644 external/panphasia_ho/features/sunprofeatures.h create mode 100644 external/panphasia_ho/features/xlcfeatures.h create mode 100644 external/panphasia_ho/high_order_panphasia_routines.c create mode 100644 external/panphasia_ho/main.c create mode 100644 external/panphasia_ho/makefile create mode 100644 external/panphasia_ho/pan_matrices_order6.h create mode 100644 external/panphasia_ho/pan_mpi_routines.c create mode 100644 external/panphasia_ho/panphasia_functions.h create mode 100644 external/panphasia_ho/threefry.h create mode 100644 external/panphasia_ho/uniform_rand_threefry4x64.c diff --git a/external/panphasia_ho/PAN_FFTW3.h b/external/panphasia_ho/PAN_FFTW3.h new file mode 100644 index 0000000..687d43a --- /dev/null +++ b/external/panphasia_ho/PAN_FFTW3.h @@ -0,0 +1,44 @@ +// Define macros for FFTW3 to allow swapping +// between single/double precision FTs + +#define FOURIER_DOUBLE + +#ifdef FOURIER_DOUBLE + #define FFTW_REAL double + #define FFTW_PLAN fftw_plan + #define FFTW_DESTROY_PLAN fftw_destroy_plan + #define FFTW_COMPLEX fftw_complex + #define FFTW_MALLOC fftw_malloc + #define FFTW_PLAN_DFT_1D fftw_plan_dft_1d + #define FFTW_PLAN_dft_3D fftw_plan_dft_3d + #define FFTW_EXECUTE fftw_execute + #define FFTW_DESTROY_PLAN fftw_destroy_plan + #define FFTW_FREE fftw_free + #define FFTW_ALLOC_COMPLEX fftw_alloc_complex + #define FFTW_MPI_LOCAL_SIZE_MANY fftw_mpi_local_size_many + #define FFTW_PLAN_MANY_DFT fftw_plan_many_dft + #define FFTW_MPI_LOCAL_SIZE_3D fftw_mpi_local_size_3d + #define FFTW_MPI_PLAN_MANY_DTF 
fftw_mpi_plan_many_dft + #define FFTW_MPI_PLAN_MANY_DTF_R2C fftw_mpi_plan_many_dft_r2c + #define FFTW_MPI_EXECUTE_DFT fftw_mpi_execute_dft + #define FFTW_MPI_EXECUTE_DFT_R2C fftw_mpi_execute_dft_r2c +#else + #define FFTW_REAL float + #define FFTW_PLAN fftwf_plan + #define FFTW_DESTROY_PLAN fftwf_destroy_plan + #define FFTW_COMPLEX fftwf_complex + #define FFTW_MALLOC fftwf_malloc + #define FFTW_PLAN_DFT_1D fftwf_plan_dft_1d + #define FFTW_PLAN_dft_3D fftwf_plan_dft_3d + #define FFTW_EXECUTE fftwf_execute + #define FFTW_DESTROY_PLAN fftwf_destroy_plan + #define FFTW_FREE fftwf_free + #define FFTW_ALLOC_COMPLEX fftwf_alloc_complex + #define FFTW_MPI_LOCAL_SIZE_MANY fftwf_mpi_local_size_many + #define FFTW_PLAN_MANY_DFT fftwf_plan_many_dft + #define FFTW_MPI_LOCAL_SIZE_3D fftwf_mpi_local_size_3d + #define FFTW_MPI_PLAN_MANY_DTF fftwf_mpi_plan_many_dft + #define FFTW_MPI_PLAN_MANY_DTF_R2C fftwf_mpi_plan_many_dft_r2c + #define FFTW_MPI_EXECUTE_DFT fftwf_mpi_execute_dft + #define FFTW_MPI_EXECUTE_DFT_R2C fftwf_mpi_execute_dft_r2c +#endif diff --git a/external/panphasia_ho/README b/external/panphasia_ho/README new file mode 100644 index 0000000..591b1d4 --- /dev/null +++ b/external/panphasia_ho/README @@ -0,0 +1,143 @@ + + +modules on COSMA7 + +intel_comp/2018 fftw/3.3.9cosma7 +intel_mpi/2018 gsl/2.5 + + + + + + + + + + The code calls a function to generate the k-space modes for +a portion of the Panphasia field given an input descriptor. + + + + + Should be called early before significant memory is allocated. It +uses quite a bit of memory itself, but tidies up afterwards. + +Has OpenMP - -DUSE_OPENMP in the makefile + + + + The routines support both single and double precision +calculations in two senses. + +The Fourier computations can be single or double precision + +MACROs FFTW_REAL/FFTW_COMPLEX used to define 'Fourier' precision types + float or double. 
+ +The Panphasia coefficients can be single or double precision + +MACROs PAN_REAL/PAN_COMPLEX define the Panphasia precision - either + float or double. + + +To change the Fourier precision edit PAN_FFTW3.h - by default +single precision unless 'FOURIER_DOUBLE' is defined. + +To change the Panphasia precision edit panphasia_functions.h - by default +single precision unless 'PAN_DOUBLE_PRECISION' is defined. + + + + + + + + + Code description +------------------- + +makefile + +CODE +---- + + +main.c - demo program only + +pan_mpi_routines.c - contains MPI calls + + + +high_order_panphasia_routines.c - serial - contains some OpenMP +uniform_rand_threefry4x64.c - serial - random generator and tests + +Include files +-------------- + +panphasia_functions.h + +PAN_FFTW3.h - MACROS for single/double precision FTs + +pan_matrices_order6.h - matrix coefficients for 6th order scheme. + + +threefry.h - Random generator +array.h + features array .h files + + + + +Development notes: + +---------------------------------------------------- + +14th April 2021 + + + + Found a bug in the OpenMP version. Different numbers +of threads led to a subset of Fourier modes having +different values. The precise differences changed +each time the code was run. + + Debugged by turning off OpenMP section by section. +The section which uses the spherical Bessel functions +turned out to be responsible. + + The faulty version collapsed the 4 loops over +multipole,x,y,z. Changing this to a loop +over multipoles, and collapsing 3 coordinate +loops solved the problem. + +The variable index1 of the return field +does not depend on the multipole, while +index2 does. Both index1 and index2 are +private. This means the return array +(index 1) is updated several times. +Presumably as these updates occur +in parallel with the 4 loop collapsed +version the return array was being +corrupted sometimes. + + +15th April +------------ + + This version supersedes the version given to +Oliver to add to MonofonIC clone.
+ + Main difference is additional OpenMP +statements and the ability to specify +in the descriptor that modes less +than or equal to some dimensionless +integer wavenumber squared are set +to the mean power. + + Tested output on 1 core - with/without +OpenMP. Not tested with more than +1 MPI rank. + + + + + diff --git a/external/panphasia_ho/array.h b/external/panphasia_ho/array.h new file mode 100644 index 0000000..ab85392 --- /dev/null +++ b/external/panphasia_ho/array.h @@ -0,0 +1,326 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _r123array_dot_h__ +#define _r123array_dot_h__ +#include "features/compilerfeatures.h" +#include "features/sse.h" + +#ifndef __cplusplus +#define CXXMETHODS(_N, W, T) +#define CXXOVERLOADS(_N, W, T) +#else + +#include +#include +#include +#include +#include +#include + +/** @defgroup arrayNxW The r123arrayNxW classes + + Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers. + It is functionally equivalent to the C++0x std::array, + but does not require C++0x features or libraries. + + In addition to meeting most of the requirements of a Container, + it also has a member function, incr(), which increments the zero-th + element and carrys overflows into higher indexed elements. Thus, + by using incr(), sequences of up to 2^(N*W) distinct values + can be produced. + + If SSE is supported by the compiler, then the class + r123array1xm128i is also defined, in which the data member is an + array of one r123128i object. 
+ + @cond HIDDEN_FROM_DOXYGEN +*/ + +template +inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){ + value_type v=0; + for(size_t i=0; i<(3+sizeof(value_type))/4; ++i) + v |= ((value_type)(*p32++)) << (32*i); + return v; +} + +// Work-alike methods and typedefs modeled on std::array: +#define CXXMETHODS(_N, W, T) \ + typedef T value_type; \ + typedef T* iterator; \ + typedef const T* const_iterator; \ + typedef value_type& reference; \ + typedef const value_type& const_reference; \ + typedef size_t size_type; \ + typedef ptrdiff_t difference_type; \ + typedef T* pointer; \ + typedef const T* const_pointer; \ + typedef std::reverse_iterator reverse_iterator; \ + typedef std::reverse_iterator const_reverse_iterator; \ + /* Boost.array has static_size. C++11 specializes tuple_size */ \ + enum {static_size = _N}; \ + R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \ + R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \ + R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE size_type size() const { return _N; } \ + R123_CUDA_DEVICE size_type max_size() const { return _N; } \ + R123_CUDA_DEVICE bool empty() const { return _N==0; }; \ + R123_CUDA_DEVICE iterator begin() { return &v[0]; } \ + R123_CUDA_DEVICE iterator end() { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \ + R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \ + R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return 
const_reverse_iterator(end()); } \ + R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \ + R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \ + R123_CUDA_DEVICE pointer data(){ return &v[0]; } \ + R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \ + R123_CUDA_DEVICE reference front(){ return v[0]; } \ + R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \ + R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \ + R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \ + R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \ + /* CUDA3 does not have std::equal */ \ + for (size_t i = 0; i < _N; ++i) \ + if (v[i] != rhs.v[i]) return false; \ + return true; \ + } \ + R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \ + /* CUDA3 does not have std::fill_n */ \ + R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \ + R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \ + /* CUDA3 does not have std::swap_ranges */ \ + for (size_t i = 0; i < _N; ++i) { \ + T tmp = v[i]; \ + v[i] = rhs.v[i]; \ + rhs.v[i] = tmp; \ + } \ + } \ + R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \ + /* This test is tricky because we're trying to avoid spurious \ + complaints about illegal shifts, yet still be compile-time \ + evaulated. 
*/ \ + if(sizeof(T)>((sizeof(T)3?3:0] is to silence \ + a spurious error from icpc \ + */ \ + ++v[_N>1?1:0]; \ + if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \ + ++v[_N>2?2:0]; \ + if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \ + ++v[_N>3?3:0]; \ + for(size_t i=4; i<_N; ++i){ \ + if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \ + ++v[i]; \ + } \ + return *this; \ + } \ + /* seed(SeedSeq) would be a constructor if having a constructor */ \ + /* didn't cause headaches with defaults */ \ + template \ + R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \ + r123array##_N##x##W ret; \ + const size_t Ngen = _N*((3+sizeof(value_type))/4); \ + uint32_t u32[Ngen]; \ + uint32_t *p32 = &u32[0]; \ + ss.generate(&u32[0], &u32[Ngen]); \ + for(size_t i=0; i<_N; ++i){ \ + ret.v[i] = assemble_from_u32(p32); \ + p32 += (3+sizeof(value_type))/4; \ + } \ + return ret; \ + } \ +protected: \ + R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \ + /* n may be greater than the maximum value of a single value_type */ \ + value_type vtn; \ + vtn = n; \ + v[0] += n; \ + const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \ + for(size_t i=1; i<_N; ++i){ \ + if(rshift){ \ + n >>= rshift; \ + }else{ \ + n=0; \ + } \ + if( v[i-1] < vtn ) \ + ++n; \ + if( n==0 ) break; \ + vtn = n; \ + v[i] += n; \ + } \ + return *this; \ + } \ + + +// There are several tricky considerations for the insertion and extraction +// operators: +// - we would like to be able to print r123array16x8 as a sequence of 16 integers, +// not as 16 bytes. +// - we would like to be able to print r123array1xm128i. +// - we do not want an int conversion operator in r123m128i because it causes +// lots of ambiguity problems with automatic promotions. 
+// Solution: r123arrayinsertable and r123arrayextractable + +template +struct r123arrayinsertable{ + const T& v; + r123arrayinsertable(const T& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + return os << t.v; + } +}; + +template<> +struct r123arrayinsertable{ + const uint8_t& v; + r123arrayinsertable(const uint8_t& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + return os << (int)t.v; + } +}; + +template +struct r123arrayextractable{ + T& v; + r123arrayextractable(T& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + return is >> t.v; + } +}; + +template<> +struct r123arrayextractable{ + uint8_t& v; + r123arrayextractable(uint8_t& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + int i; + is >> i; + t.v = i; + return is; + } +}; + +#define CXXOVERLOADS(_N, W, T) \ + \ +inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \ + os << r123arrayinsertable(a.v[0]); \ + for(size_t i=1; i<_N; ++i) \ + os << " " << r123arrayinsertable(a.v[i]); \ + return os; \ +} \ + \ +inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \ + for(size_t i=0; i<_N; ++i){ \ + r123arrayextractable x(a.v[i]); \ + is >> x; \ + } \ + return is; \ +} \ + \ +namespace r123{ \ + typedef r123array##_N##x##W Array##_N##x##W; \ +} + +#endif /* __cplusplus */ + +/* _r123array_tpl expands to a declaration of struct r123arrayNxW. + + In C, it's nothing more than a struct containing an array of N + objects of type T. + + In C++ it's the same, but endowed with an assortment of member + functions, typedefs and friends. In C++, r123arrayNxW looks a lot + like std::array, has most of the capabilities of a container, + and satisfies the requirements outlined in compat/Engine.hpp for + counter and key types. 
ArrayNxW, in the r123 namespace is + a typedef equivalent to r123arrayNxW. +*/ + +#define _r123array_tpl(_N, W, T) \ + /** @ingroup arrayNxW */ \ + /** @see arrayNxW */ \ +struct r123array##_N##x##W{ \ + T v[_N]; \ + CXXMETHODS(_N, W, T) \ +}; \ + \ +CXXOVERLOADS(_N, W, T) + +/** @endcond */ + +_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */ +_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */ +_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */ +_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */ + +_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */ +_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */ +_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */ + +_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */ + +#if R123_USE_SSE +_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */ +#endif + +/* In C++, it's natural to use sizeof(a::value_type), but in C it's + pretty convoluted to figure out the width of the value_type of an + r123arrayNxW: +*/ +#define R123_W(a) (8*sizeof(((a *)0)->v[0])) + +/** @namespace r123 + Most of the Random123 C++ API is contained in the r123 namespace. 
+*/ + +#endif + diff --git a/external/panphasia_ho/c7_script_threaded b/external/panphasia_ho/c7_script_threaded new file mode 100755 index 0000000..5817ec7 --- /dev/null +++ b/external/panphasia_ho/c7_script_threaded @@ -0,0 +1,33 @@ +#!/bin/bash -l + +#SBATCH --ntasks 5 +#SBATCH -J Test_MPI_FFTW +#SBATCH -o standard_output_file.%J.out +#SBATCH -e standard_error_file.%J.err +#SBATCH -p cosma7 +#SBATCH -A dp004 +#SBATCH --exclusive +#SBATCH -t 00:05:00 +#SBATCH --mail-type=END # notifications for job +#SBATCH --mail-user=a.r.jenkins@durham.ac.uk + +module purge +module load intel_comp/2018 intel_mpi/2018 fftw/3.3.9cosma7 gsl/2.5 hdf5/1.8.20 + + +# Run the program + + + +mpirun -l -env I_MPI_PIN=1 -env I_MPI_PIN_PROCESSOR_LIST=allcores -n $SLURM_NTASKS ./pan_fftw3_test_code.x + + + + + + + + + + + diff --git a/external/panphasia_ho/features/clangfeatures.h b/external/panphasia_ho/features/clangfeatures.h new file mode 100644 index 0000000..1e3c8cf --- /dev/null +++ b/external/panphasia_ho/features/clangfeatures.h @@ -0,0 +1,93 @@ +/* +Copyright 2010-2016, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __clangfeatures_dot_hpp +#define __clangfeatures_dot_hpp + +#ifndef R123_USE_X86INTRIN_H +#if (defined(__x86_64__)||defined(__i386__)) +#define R123_USE_X86INTRIN_H 1 +#else +#define R123_USE_X86INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions) +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert) +#endif + +// With clang-3.6, -Wall warns about unused-local-typedefs. +// The "obvious" thing to do is to ignore -Wunused-local-typedefs, +// but that doesn't work because earlier versions of clang blow +// up on an 'unknown warning group'. So we briefly ignore -Wall... +// It's tempting to just give up on static assertions in pre-c++11 code. 
+#if !R123_USE_CXX11_STATIC_ASSERT && !defined(R123_STATIC_ASSERT) +#define R123_STATIC_ASSERT(expr, msg) \ +_Pragma("clang diagnostic push") \ +_Pragma("clang diagnostic ignored \"-Wall\"") \ +typedef char static_assertion[(!!(expr))*2-1] \ +_Pragma("clang diagnostic pop") +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr) +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions) +#endif + +// With clang-3.0, the apparently simpler: +// #define R123_USE_CXX11_RANDOM __has_include() +// dumps core. +#ifndef R123_USE_CXX11_RANDOM +#if __cplusplus>=201103L && __has_include() +#define R123_USE_CXX11_RANDOM 1 +#else +#define R123_USE_CXX11_RANDOM 0 +#endif +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#if __cplusplus>=201103L && __has_include() +#define R123_USE_CXX11_TYPE_TRAITS 1 +#else +#define R123_USE_CXX11_TYPE_TRAITS 0 +#endif +#endif + +#include "gccfeatures.h" + +#endif diff --git a/external/panphasia_ho/features/compilerfeatures.h b/external/panphasia_ho/features/compilerfeatures.h new file mode 100644 index 0000000..0606dee --- /dev/null +++ b/external/panphasia_ho/features/compilerfeatures.h @@ -0,0 +1,343 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. 
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/** + +@page porting Preprocessor symbols for porting Random123 to different platforms. + +The Random123 library is portable across C, C++, CUDA, OpenCL environments, +and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris). +This level of portability requires the abstraction of some features +and idioms that are either not standardized (e.g., asm statements), or for which +different vendors have their own standards (e.g., SSE intrinsics) or for +which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>). + +Random123/features/compilerfeatures.h +conditionally includes a compiler-or-OS-specific Random123/features/XXXfeatures.h file which +defines appropriate values for the preprocessor symbols which can be used with +a specific compiler or OS. Those symbols will then +be used by other header files and source files in the Random123 +library (and may be used by applications) to control what actually +gets presented to the compiler. + +Most of the symbols are boolean valued.
In general, they will +\b always be defined with value either 1 or 0, so do +\b NOT use \#ifdef. Use \#if R123_USE_SOMETHING instead. + +Library users can override any value by defining the pp-symbol with a compiler option, +e.g., + + cc -DR123_USE_MULHILO64_C99 + +will use a strictly c99 version of the full-width 64x64->128-bit multiplication +function, even if it would be disabled by default. + +All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_ +@verbatim + AES_NI + AES_OPENSSL + SSE4_2 + SSE4_1 + SSE + + STD_RANDOM + + GNU_UINT128 + ASM_GNU + ASM_MSASM + + CPUID_MSVC + + CXX11_RANDOM + CXX11_TYPE_TRAITS + CXX11_STATIC_ASSERT + CXX11_CONSTEXPR + CXX11_UNRESTRICTED_UNIONS + CXX11_EXPLICIT_CONVERSIONS + CXX11_LONG_LONG + CXX11_STD_ARRAY + CXX11 + + X86INTRIN_H + IA32INTRIN_H + XMMINTRIN_H + EMMINTRIN_H + SMMINTRIN_H + WMMINTRIN_H + INTRIN_H + + MULHILO32_ASM + MULHILO64_ASM + MULHILO64_MSVC_INTRIN + MULHILO64_CUDA_INTRIN + MULHILO64_OPENCL_INTRIN + MULHILO64_C99 + + U01_DOUBLE + +@endverbatim +Most have obvious meanings. Some non-obvious ones: + +AES_NI and AES_OPENSSL are not mutually exclusive. You can have one, +both or neither. + +GNU_UINT128 says that it's safe to use __uint128_t, but it +does not require its use. In particular, it should be +used in mulhilo only if MULHILO64_ASM is unset. + +If the XXXINTRIN_H macros are true, then one should +@code +#include <xxxintrin.h> +@endcode +to gain access to compiler intrinsics. + +The CXX11_SOME_FEATURE macros allow the code to use specific +features of the C++11 language and library. +In the absence of a specific CXX11_SOME_FEATURE, the feature +is controlled by the catch-all R123_USE_CXX11 macro. + +U01_DOUBLE defaults on, and can be turned off (set to 0) +if one does not want the utility functions that convert to double +(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension. + +There are a number of invariants that are always true.
Application code may +choose to rely on these: + +
    +
  • ASM_GNU and ASM_MSASM are mutually exclusive +
  • The "higher" SSE values imply the lower ones. +
+ +There are also non-boolean valued symbols: + +
    +
  • R123_STATIC_INLINE - + According to both C99 and GNU99, the 'static inline' declaration allows + the compiler to not emit code if the function is not used. + Note that the semantics of 'inline', 'static' and 'extern' in + gcc have changed over time and are subject to modification by + command line options, e.g., -std=gnu89, -fgnu-inline. + Nevertheless, it appears that the meaning of 'static inline' + has not changed over time and (with a little luck) the use of 'static inline' + here will be portable between versions of gcc and to other C99 + compilers. + See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html + http://www.greenend.org.uk/rjk/2003/03/inline.html + +
  • R123_FORCE_INLINE(decl) - + which expands to 'decl', adorned with the compiler-specific + embellishments to strongly encourage that the declared function be + inlined. If there is no such compiler-specific magic, it should + expand to decl, unadorned. + +
  • R123_CUDA_DEVICE - which expands to __device__ (or something else with + sufficiently similar semantics) when CUDA is in use, and expands + to nothing in other cases. + +
  • R123_METAL_THREAD_ADDRESS_SPACE - which expands to 'thread' (or + something else with sufficiently similar semantics) when compiling a + Metal kernel, and expands to nothing in other cases. + +
  • R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at + all if we're in an environment so feature-poor that you can't even + call assert (I'm looking at you, CUDA and OpenCL), or even include + assert.h safely (OpenCL). + +
  • R123_STATIC_ASSERT(expr,msg) - which expands to + static_assert(expr,msg), or to a declaration that + will cause a compile-time error if expr is not true. +
  • R123_ULONG_LONG - which expands to a declaration of the longest available + unsigned integer. + +
  • R123_64BIT(x) - expands to something equivalent to + UINT64_C(x) from <stdint.h>, even in environments where + <stdint.h> is not available, e.g., MSVC and OpenCL. +
  • R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with + the semantics of gcc's __builtin_expect(expr,likely_value). If + the environment has nothing like __builtin_expect, it should expand + to just expr. +
+ + +\cond HIDDEN_FROM_DOXYGEN +*/ + +/* +N.B. When something is added to the list of features, it should be +added to each of the *features.h files, AND to examples/ut_features.cpp. +*/ + +/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */ +#if defined(__METAL_MACOS__) +#include "metalfeatures.h" +#elif defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0 +#include "openclfeatures.h" +#elif defined(__CUDACC__) +#include "nvccfeatures.h" +#elif defined(__ICC) +#include "iccfeatures.h" +#elif defined(__xlC__) || defined(__ibmxl__) +#include "xlcfeatures.h" +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#include "sunprofeatures.h" +#elif defined(__OPEN64__) +#include "open64features.h" +#elif defined(__clang__) +#include "clangfeatures.h" +#elif defined(__GNUC__) +#include "gccfeatures.h" +#elif defined(__PGI) +#include "pgccfeatures.h" +#elif defined(_MSC_FULL_VER) +#include "msvcfeatures.h" +#else +#error "Can't identify compiler. 
You'll need to add a new xxfeatures.hpp" +{ /* maybe an unbalanced brace will terminate the compilation */ +#endif + +#ifndef R123_USE_CXX11 +#define R123_USE_CXX11 (__cplusplus >= 201103L) +#endif + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_RANDOM +#define R123_USE_CXX11_RANDOM R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_LONG_LONG +#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_STD_ARRAY +#define R123_USE_CXX11_STD_ARRAY R123_USE_CXX11 +#endif + +#ifndef R123_USE_MULHILO64_C99 +#define R123_USE_MULHILO64_C99 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_STATIC_ASSERT +#if R123_USE_CXX11_STATIC_ASSERT +#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg) +#else + /* if msg always_looked_like_this, we could paste it into the name. Worth it? 
*/ +#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1] +#endif +#endif + +#ifndef R123_CONSTEXPR +#if R123_USE_CXX11_CONSTEXPR +#define R123_CONSTEXPR constexpr +#else +#define R123_CONSTEXPR +#endif +#endif + +#ifndef R123_USE_64BIT +#define R123_USE_64BIT 1 +#endif + +#ifndef R123_USE_PHILOX_64BIT +#define R123_USE_PHILOX_64BIT (R123_USE_64BIT && (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN)) +#endif + +#ifndef R123_ULONG_LONG +#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG +/* C++98 doesn't have long long. It doesn't have uint64_t either, but + we will have typedef'ed uint64_t to something in the xxxfeatures.h. + With luck, it won't elicit complaints from -pedantic. Cross your + fingers... */ +#define R123_ULONG_LONG uint64_t +#else +#define R123_ULONG_LONG unsigned long long +#endif +#endif + +/* UINT64_C should have been #defined by XXXfeatures.h, either by + #include <stdint.h> or through compiler-dependent hacks */ +#ifndef R123_64BIT +#define R123_64BIT(x) UINT64_C(x) +#endif + +#ifndef R123_THROW +#define R123_THROW(x) throw (x) +#endif + +#ifndef R123_METAL_THREAD_ADDRESS_SPACE +#define R123_METAL_THREAD_ADDRESS_SPACE +#endif + +#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE +#define R123_METAL_CONSTANT_ADDRESS_SPACE +#endif + +/* + * Windows.h (and perhaps other "well-meaning" code) define min and + * max, so there's a high chance that our definition of min, max + * methods or use of std::numeric_limits min and max will cause + * complaints in any program that happened to include Windows.h or + * suchlike first. We use the null macro below in our own header + * files definition or use of min, max to defensively preclude + * this problem. It may not be enough; one might need to #define + * NOMINMAX before including Windows.h or compile with -DNOMINMAX.
+ */ +#define R123_NO_MACRO_SUBST + +/** \endcond */ diff --git a/external/panphasia_ho/features/gccfeatures.h b/external/panphasia_ho/features/gccfeatures.h new file mode 100644 index 0000000..2454a1d --- /dev/null +++ b/external/panphasia_ho/features/gccfeatures.h @@ -0,0 +1,263 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __gccfeatures_dot_hpp +#define __gccfeatures_dot_hpp + +#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) + +#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390x__) +# error "This code has only been tested on x86, powerpc and a few arm platforms." +#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task> +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifdef __powerpc__ +#include <ppu_intrinsics.h> +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static __inline__ +#endif + +#ifndef R123_FORCE_INLINE +#if R123_GNUC_VERSION >= 40000 +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#else +#define R123_FORCE_INLINE(decl) decl +#endif +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +/* According to the C++0x standard, we should be able to test the numeric + value of __cplusplus == 199711L for C++98, __cplusplus == 201103L for C++11 + But gcc has had an open bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773 + since early 2001, which was finally fixed in 4.7 (early 2012). For + earlier versions, the only way to detect whether --std=c++0x was requested + on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.
+*/ +#if defined(__GCC_EXPERIMENTAL_CXX0X__) +#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 1/* defined(__GCC_EXPERIMENTAL_CXX0X__) */)) +#else +#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 0/* defined(__GCC_EXPERIMENTAL_CXX0X__) */)) +#endif + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_RANDOM +#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11) +#endif + +#ifndef R123_USE_AES_NI +#ifdef __AES__ +#define R123_USE_AES_NI 1 +#else +#define R123_USE_AES_NI 0 +#endif +#endif + +#ifndef R123_USE_SSE4_2 +#ifdef __SSE4_2__ +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#ifdef __SSE4_1__ +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +/* There's no point in trying to compile SSE code in Random123 + unless SSE2 is available. */ +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. 
Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#if defined(__x86_64__) || defined(__aarch64__) +#define R123_USE_GNU_UINT128 1 +#else +#define R123_USE_GNU_UINT128 0 +#endif +#endif + +#ifndef R123_USE_ASM_GNU +#if (defined(__x86_64__)||defined(__i386__)) +#define R123_USE_ASM_GNU 1 +#else +#define R123_USE_ASM_GNU 1 /* NOTE(review): same value as the x86 branch, so this #if/#else is redundant -- confirm whether 0 was intended for non-x86 targets */ +#endif +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#if (defined(__x86_64__)||defined(__i386__)) +#define R123_USE_X86INTRIN_H (1/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402) +#else +#define R123_USE_X86INTRIN_H (0/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402) +#endif +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have + emmintrin.h in the include search path. This is + so broken that I refuse to try to work around it. If this + affects you, figure out where your emmintrin.h lives and + add an appropriate -I to your CPPFLAGS. Or add -DR123_USE_SSE=0.
*/ +#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402)) +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402)) +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#if (defined(__powerpc64__)) +#define R123_USE_MULHILO64_MULHI_INTRIN 1 +#else +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif +#endif + +#ifndef R123_MULHILO64_MULHI_INTRIN +#define R123_MULHILO64_MULHI_INTRIN __mulhdu +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_MULHILO32_MULHI_INTRIN +#define R123_MULHILO32_MULHI_INTRIN __mulhwu +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/external/panphasia_ho/features/iccfeatures.h b/external/panphasia_ho/features/iccfeatures.h new file mode 100644 index 0000000..7e72dec --- /dev/null +++ b/external/panphasia_ho/features/iccfeatures.h @@ -0,0 +1,212 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __icpcfeatures_dot_hpp +#define __icpcfeatures_dot_hpp + +// icc relies on gcc libraries and other toolchain components. +#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) + +#if !defined(__x86_64__) && !defined(__i386__) +# error "This code has only been tested on x86 platforms." 
+{ // maybe an unbalanced brace will terminate the compilation +// You are invited to try Random123 on other architectures, by changing +// the conditions that reach this error, but you should consider it a +// porting exercise and expect to encounter bugs and deficiencies. +// Please let the authors know of any successes (or failures). +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_SSE4_2 +#ifdef __SSE4_2__ +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#ifdef __SSE4_1__ +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_NI +// Unlike gcc, icc (version 12) does not pre-define an __AES__ +// pp-symbol when -maes or -xHost is on the command line.
This feels +// like a defect in icc (it defines __SSE4_2__ in analogous +// circumstances), but until Intel fixes it, we're better off erring +// on the side of caution and not generating instructions that are +// going to raise SIGILL when executed. To get the AES-NI +// instructions with icc, the caller must puts something like +// -DR123_USE_AES_NI=1 or -D__AES__ on the command line. FWIW, the +// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from +// 11.1 onwards. +// +#if defined(__AES__) +#define R123_USE_AES_NI ((__ICC>=1101) && 1/*defined(__AES__)*/) +#else +#define R123_USE_AES_NI ((__ICC>=1101) && 0/*defined(__AES__)*/) +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 1 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 1 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 1 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#ifndef 
R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/external/panphasia_ho/features/metalfeatures.h b/external/panphasia_ho/features/metalfeatures.h new file mode 100644 index 0000000..bafe51a --- /dev/null +++ b/external/panphasia_ho/features/metalfeatures.h @@ -0,0 +1,111 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * Written by Tom Schoonjans + */ + +#ifndef __metalfeatures_dot_hpp +#define __metalfeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_METAL_THREAD_ADDRESS_SPACE +#define R123_METAL_THREAD_ADDRESS_SPACE thread +#endif + +#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE +#define R123_METAL_CONSTANT_ADDRESS_SPACE constant +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 1 +#endif + +#if R123_USE_MULHILO32_MULHI_INTRIN +#include <metal_integer> +#define R123_MULHILO32_MULHI_INTRIN metal::mulhi +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_64BIT +#define R123_USE_64BIT 0 /* Metal currently (Feb 2019,
Specification-2) does not support 64-bit variable types */ +#endif + +#ifndef R123_ULONG_LONG +/* the longest integer type in Metal (Feb 2019, Specification-2) is a + * 32-bit unsigned int. Let's hope for the best... */ +#define R123_ULONG_LONG unsigned int +#endif + +#endif diff --git a/external/panphasia_ho/features/msvcfeatures.h b/external/panphasia_ho/features/msvcfeatures.h new file mode 100644 index 0000000..9eb9520 --- /dev/null +++ b/external/panphasia_ho/features/msvcfeatures.h @@ -0,0 +1,200 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __msvcfeatures_dot_hpp +#define __msvcfeatures_dot_hpp + +//#if _MSVC_FULL_VER <= 15 +//#error "We've only tested MSVC_FULL_VER==15." +//#endif + +#if !defined(_M_IX86) && !defined(_M_X64) +# error "This code has only been tested on x86 platforms." +{ // maybe an unbalanced brace will terminate the compilation +// You are invited to try Random123 on other architectures, by changing +// the conditions that reach this error, but you should consider it a +// porting exercise and expect to encounter bugs and deficiencies. +// Please let the authors know of any successes (or failures). 
+#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static __inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) _forceinline decl +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include <assert.h> +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_AES_NI +#if defined(_M_X64) +#define R123_USE_AES_NI 1 +#else +#define R123_USE_AES_NI 0 +#endif +#endif + +#ifndef R123_USE_SSE4_2 +#if defined(_M_X64) +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#if defined(_M_X64) +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 1 +#endif + +#ifndef R123_USE_AES_OPENSSL +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 0 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 1 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 1
+#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 1 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 1 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#if defined(_M_X64) +#define R123_USE_MULHILO64_MSVC_INTRIN 1 +#else +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include <stdint.h> +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h> +#endif + +#pragma warning(disable:4244) +#pragma warning(disable:4996) + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/external/panphasia_ho/features/nvccfeatures.h b/external/panphasia_ho/features/nvccfeatures.h new file mode 100644 index 0000000..d1ff8bf --- /dev/null +++ b/external/panphasia_ho/features/nvccfeatures.h @@ -0,0 +1,125 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E.
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __r123_nvcc_features_dot_h__ +#define __r123_nvcc_features_dot_h__ + +#if !defined(CUDART_VERSION) +#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined" +#endif + +#if CUDART_VERSION < 4010 +#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h" +// This test was added in Random123-1.08 (August, 2013) because we +// discovered that Ftype(maxTvalue()) with Ftype=double and +// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and +// earlier. We can't be sure this bug doesn't also affect invocations +// of other templated functions, e.g., essentially all of Random123. +// Thus, we no longer trust CUDA versions earlier than 4.1 even though +// we had previously tested and timed Random123 with CUDA 3.x and 4.0. +// If you feel lucky or desperate, you can change #error to #warning, but +// please take extra care to be sure that you are getting correct +// results. 
+#endif + +// nvcc falls through to gcc or msvc. So first define +// a couple of things and then include either gccfeatures.h +// or msvcfeatures.h + +//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled +//for both device and host functions in CUDA by setting compiler flags +//for the device function +#ifdef __CUDA_ARCH__ +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE __device__ +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 1 +#endif + +#ifndef R123_THROW +// No exceptions in CUDA, at least upto 4.0 +#define R123_THROW(x) R123_ASSERT(0) +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) if((x)) ; else asm("trap;") +#endif + +#else // ! __CUDA_ARCH__ +// If we're using nvcc not compiling for the CUDA architecture, +// then we must be compiling for the host. In that case, +// tell the philox code to use the mulhilo64 asm because +// nvcc doesn't grok uint128_t. +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#endif // __CUDA_ARCH__ + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_ULONG_LONG +// uint64_t, which is what we'd get without this, is +// not the same as unsigned long long +#define R123_ULONG_LONG unsigned long long +#endif + +#if defined(__GNUC__) +#include "gccfeatures.h" +#elif defined(_MSC_FULL_VER) +#include "msvcfeatures.h" +#endif + +#endif diff --git a/external/panphasia_ho/features/open64features.h b/external/panphasia_ho/features/open64features.h new file mode 100644 index 0000000..8da9f5f --- /dev/null +++ b/external/panphasia_ho/features/open64features.h @@ -0,0 +1,50 @@ +/* +Copyright 
2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __open64features_dot_hpp +#define __open64features_dot_hpp + +/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */ + +/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint + but produce incorrect code for 64-bit philox. 
The MULHILO64_ASM + seems to work fine */ +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#include "gccfeatures.h" + +#endif diff --git a/external/panphasia_ho/features/openclfeatures.h b/external/panphasia_ho/features/openclfeatures.h new file mode 100644 index 0000000..af03d30 --- /dev/null +++ b/external/panphasia_ho/features/openclfeatures.h @@ -0,0 +1,89 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __openclfeatures_dot_hpp +#define __openclfeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 1 +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of +// ulong to mul_hi. And gets lots of complaints from stdint.h +// on some machines. +// But these typedefs mean we cannot include stdint.h with +// these headers? Do we need R123_64T, R123_32T, R123_8T? +typedef ulong uint64_t; +typedef uint uint32_t; +typedef uchar uint8_t; +#define UINT64_C(x) ((ulong)(x##UL)) + +#endif diff --git a/external/panphasia_ho/features/pgccfeatures.h b/external/panphasia_ho/features/pgccfeatures.h new file mode 100644 index 0000000..18ace13 --- /dev/null +++ b/external/panphasia_ho/features/pgccfeatures.h @@ -0,0 +1,194 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright (c) 2013, Los Alamos National Security, LLC +All rights reserved. + +Copyright 2013. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified +to produce derivative works, such modified software should be clearly marked, +so as not to confuse it with the version available from LANL. 
+*/ +#ifndef __pgccfeatures_dot_hpp +#define __pgccfeatures_dot_hpp + +#if !defined(__x86_64__) && !defined(__i386__) +# error "This code has only been tested on x86 platforms." +#include +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +/* Found this example in PGI's emmintrin.h. */ +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) (expr) +#endif + +/* PGI through 13.2 doesn't appear to support AES-NI. */ +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +/* PGI through 13.2 appears to support MMX, SSE, SSE2, SSE3, SSSE3, SSE4a, and + ABM, but not SSE4.1 or SSE4.2. */ +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +/* There's no point in trying to compile SSE code in Random123 + unless SSE2 is available. */ +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available.
Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time + about undefined references to _mm_castsi128_ps(__m128i). Why? */ +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 1 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#ifdef __ABM__ +#define R123_USE_INTRIN_H 1 +#else +#define R123_USE_INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. 
You must define __STDC_CONSTANT_MACROS before you #include +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/external/panphasia_ho/features/sse.h b/external/panphasia_ho/features/sse.h new file mode 100644 index 0000000..3a49ebd --- /dev/null +++ b/external/panphasia_ho/features/sse.h @@ -0,0 +1,280 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef _Random123_sse_dot_h__ +#define _Random123_sse_dot_h__ + +#if R123_USE_SSE + +#if R123_USE_X86INTRIN_H +#include +#endif +#if R123_USE_IA32INTRIN_H +#include +#endif +#if R123_USE_XMMINTRIN_H +#include +#endif +#if R123_USE_EMMINTRIN_H +#include +#endif +#if R123_USE_SMMINTRIN_H +#include +#endif +#if R123_USE_WMMINTRIN_H +#include +#endif +#if R123_USE_INTRIN_H +#include +#endif +#ifdef __cplusplus +#include +#include +#include +#endif + +#if R123_USE_ASM_GNU + +/* bit25 of CX tells us whether AES is enabled. */ +R123_STATIC_INLINE int haveAESNI(){ + unsigned int eax, ebx, ecx, edx; + __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : + "a" (1)); + return (ecx>>25) & 1; +} +#elif R123_USE_CPUID_MSVC +R123_STATIC_INLINE int haveAESNI(){ + int CPUInfo[4]; + __cpuid(CPUInfo, 1); + return (CPUInfo[2]>>25)&1; +} +#else /* R123_USE_CPUID_??? */ +#warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false" +R123_STATIC_INLINE int haveAESNI(){ + return 0; +} +#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */ + +// There is a lot of annoying and inexplicable variation in the +// SSE intrinsics available in different compilation environments. +// The details seem to depend on the compiler, the version and +// the target architecture. Rather than insisting on +// R123_USE_feature tests for each of these in each of the +// compilerfeatures.h files we just keep the complexity localized +// to here... +#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64)) +/* Is there an intrinsic to assemble an __m128i from two 64-bit words? + If not, use the 4x32-bit intrinsic instead. N.B. It looks like Intel + added _mm_set_epi64x to icc version 12.1 in Jan 2012.
+*/ +R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){ + union{ + uint64_t u64; + uint32_t u32[2]; + } u1, u0; + u1.u64 = v1; + u0.u64 = v0; + return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]); +} +#endif +/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit + word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job + on 64-bit platforms. Unfortunately, both MSVC and Open64 fail + assertions in ut_M128.cpp and ut_carray.cpp when we use the + _mm_cvtsi128_si64 intrinsic. (See + https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug). + On 32-bit platforms, there's no MOVQ, so there's no intrinsic. + Finally, even if the intrinsic exists, it may be spelled with or + without the 'x'. +*/ +#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__) +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + union{ + uint64_t u64[2]; + __m128i m; + }u; + _mm_store_si128(&u.m, si); + return u.u64[0]; +} +#elif defined(__llvm__) || defined(__ICC) +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + return (uint64_t)_mm_cvtsi128_si64(si); +} +#else /* GNUC, others */ +/* FWIW, gcc's emmintrin.h has had the 'x' spelling + since at least gcc-3.4.4. The no-'x' spelling showed up + around 4.2. */ +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + return (uint64_t)_mm_cvtsi128_si64x(si); +} +#endif +#if defined(__GNUC__) && __GNUC__ < 4 +/* the cast builtins showed up in gcc4. */ +R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){ + return (__m128)si; +} +#endif + +#ifdef __cplusplus + +struct r123m128i{ + __m128i m; +#if R123_USE_CXX11_UNRESTRICTED_UNIONS + // C++98 forbids a union member from having *any* constructors. + // C++11 relaxes this, and allows union members to have constructors + // as long as there is a "trivial" default constructor.
So in C++11 + // we can provide a r123m128i constructor with an __m128i argument, and still + // have the default (and hence trivial) default constructor. + r123m128i() = default; + r123m128i(__m128i _m): m(_m){} +#endif + r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;} + r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;} +#if R123_USE_CXX11_EXPLICIT_CONVERSIONS + // With C++11 we can attach explicit to the bool conversion operator + // to disambiguate undesired promotions. For g++, this works + // only in 4.5 and above. + explicit operator bool() const {return _bool();} +#else + // Pre-C++11, we have to do something else. Google for the "safe bool" + // idiom for other ideas... + operator const void*() const{return _bool()?this:0;} +#endif + operator __m128i() const {return m;} + +private: +#if R123_USE_SSE4_1 + bool _bool() const{ return !_mm_testz_si128(m,m); } +#else + bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); } +#endif +}; + +R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){ + __m128i& c = v.m; + __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1)); + c = _mm_add_epi64(c, zeroone); + //return c; +#if R123_USE_SSE4_1 + __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0))); + if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){ + __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0)); + c = _mm_add_epi64(c, onezero); + } +#else + unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128()))); + // The low two bits of mask are 11 iff the low 64 bits of + // c are zero. 
+ if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){ + __m128i onezero = _mm_set_epi64x(1,0); + c = _mm_add_epi64(c, onezero); + } +#endif + return v; +} + +R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){ + __m128i c = lhs.m; + __m128i incr128 = _mm_set_epi64x(0, n); + c = _mm_add_epi64(c, incr128); + // return c; // NO CARRY! + + int64_t lo64 = _mm_extract_lo64(c); + if((uint64_t)lo64 < n) + c = _mm_add_epi64(c, _mm_set_epi64x(1,0)); + lhs.m = c; + return lhs; +} + +// We need this one because it's present, but never used in r123array1xm128i::incr +R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){ + throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");} + +// The comparisons aren't implemented, but if we leave them out, and +// somebody writes, e.g., M1 < M2, the compiler will do an implicit +// conversion through void*. Sigh... +R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");} + +R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){ + return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); } +R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){ + return !(lhs==rhs);} +R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){ + r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; } +R123_STATIC_INLINE bool 
operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){ + return !(lhs==rhs);} +R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){ + union{ + uint64_t u64[2]; + __m128i m; + }u; + _mm_storeu_si128(&u.m, m.m); + return os << u.u64[0] << " " << u.u64[1]; +} + +R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){ + uint64_t u64[2]; + is >> u64[0] >> u64[1]; + m.m = _mm_set_epi64x(u64[1], u64[0]); + return is; +} + +template inline T assemble_from_u32(uint32_t *p32); // forward declaration + +template <> +inline r123m128i assemble_from_u32(uint32_t *p32){ + r123m128i ret; + ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]); + return ret; +} + +#else + +typedef struct { + __m128i m; +} r123m128i; + +#endif /* __cplusplus */ + +#else /* !R123_USE_SSE */ +R123_STATIC_INLINE int haveAESNI(){ + return 0; +} +#endif /* R123_USE_SSE */ + +#endif /* _Random123_sse_dot_h__ */ diff --git a/external/panphasia_ho/features/sunprofeatures.h b/external/panphasia_ho/features/sunprofeatures.h new file mode 100644 index 0000000..c9cdc00 --- /dev/null +++ b/external/panphasia_ho/features/sunprofeatures.h @@ -0,0 +1,172 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __sunprofeatures_dot_hpp +#define __sunprofeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols.
+ +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_AES_OPENSSL +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 0 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 0 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_PHILOX_64BIT +#define R123_USE_PHILOX_64BIT 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. 
You must define __STDC_CONSTANT_MACROS before you #include +#endif + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/external/panphasia_ho/features/xlcfeatures.h b/external/panphasia_ho/features/xlcfeatures.h new file mode 100644 index 0000000..ccb98ee --- /dev/null +++ b/external/panphasia_ho/features/xlcfeatures.h @@ -0,0 +1,210 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Copyright (c) 2013, Los Alamos National Security, LLC +All rights reserved. + +Copyright 2013. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified +to produce derivative works, such modified software should be clearly marked, +so as not to confuse it with the version available from LANL. +*/ +#ifndef __xlcfeatures_dot_hpp +#define __xlcfeatures_dot_hpp + +#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) +# error "This code has only been tested on x86 and PowerPC platforms." +#include +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifdef __cplusplus +/* builtins are automatically available to xlc. To use them with xlc++, + one must include builtins.h. 
c.f + http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html +*/ +#include +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. 
Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 0 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#ifdef __ABM__ +#define R123_USE_INTRIN_H 1 +#else +#define R123_USE_INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#if (defined(__powerpc64__)) +#define R123_USE_MULHILO64_MULHI_INTRIN 1 +#else +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif +#endif + +#ifndef R123_MULHILO64_MULHI_INTRIN +#define R123_MULHILO64_MULHI_INTRIN __mulhdu +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_MULHILO32_MULHI_INTRIN +#define R123_MULHILO32_MULHI_INTRIN __mulhwu +#endif + +#ifndef R123_USE_MULHILO64_ASM +#if defined(__powerpc64__) +#define R123_USE_MULHILO64_ASM (1 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN)) +#else +#define R123_USE_MULHILO64_ASM (0 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN)) +#endif +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 
+#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/external/panphasia_ho/high_order_panphasia_routines.c b/external/panphasia_ho/high_order_panphasia_routines.c new file mode 100644 index 0000000..1e17816 --- /dev/null +++ b/external/panphasia_ho/high_order_panphasia_routines.c @@ -0,0 +1,1863 @@ +#include +#include +#include +#include +#include +#include +#include "PAN_FFTW3.h" +#include "panphasia_functions.h" +#include +#include + +#ifdef USE_OPENMP +#include +#endif + +int verbose_warnings_only=0; +static int start_panph_method = 0; +static int panphasia_rel_origin_set = 0; + +// Record descriptor parameters // + +size_t descriptor_order; +size_t descriptor_base_level; +size_t descriptor_xorigin,descriptor_yorigin,descriptor_zorigin; +size_t descriptor_base_size; +size_t descriptor_kk_limit; +long long int descriptor_check_digit; +char descriptor_name[100]; +char full_descriptor[300]; + +size_t descriptor_read_in; + +// Record relative coordinates for a particular descriptor + +size_t rel_level; +size_t rel_origin_x, rel_origin_y, rel_origin_z; +size_t rel_coord_max; + + +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +// +// Matrix operations to solve for individual octree cells +// +///////////////////////////////////////////////////////////////////////// 
+///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// + +#ifdef MATRICES_ORDER_1 +#include "pan_matrices_order1.h" +#elif MATRICES_ORDER_2 +#include "pan_matrices_order2.h" +#elif MATRICES_ORDER_3 +#include "pan_matrices_order3.h" +#elif MATRICES_ORDER_4 +#include "pan_matrices_order4.h" +#elif MATRICES_ORDER_5 +#include "pan_matrices_order5.h" +#elif MATRICES_ORDER_0 +#include "pan_matrices_order0.h" +#else +#include "pan_matrices_order6.h" +#endif + + + + + +void solve_panphasia_cell_(PAN_REAL * input_vec_parent, PAN_REAL *input_vec_children, PAN_REAL *output_vec_children, int control_flag) +{ + + int iparity, iconst, irow, l,i,j; + PAN_REAL element; + PAN_REAL const norm = sqrt(0.125); + + PAN_REAL parent_constraint[Nbasis]; //__assume_aligned(&parent_constraint, 64); + PAN_REAL proj_constraint[Nbasis]; //__assume_aligned(&proj_constraint, 64); + PAN_REAL work_vec1[8*Nbasis]; //__assume_aligned(&work_vec1, 64); + PAN_REAL work_vec2[8*Nbasis]; //__assume_aligned(&work_vec2, 64); + + +//=========================================================================== +// Copy inputs and rearrange parent constraints in parity order and set proj_constraint to zero + + for (i=0; i<8*Nbasis; i++) work_vec1[i]=input_vec_children[i]; + + for (i=0; i17) { //ARJ for testing purposes only + // for(int i=0; i<8*Nbasis;i++)gauss_rand_children[i]=0; //ARJ for testing purposes only + // for(int i=56; i + + + +int demo_descriptor_(){ + + + char str[200] = "[Panph6,L11,(2043,2045,2046),S5,CH-999,Testing_only]"; // xyz + + 
+ //char str[200] = "[Panph6,L3,(2,3,4),S8,CH-999,Testing_only]"; // xyz + + //char str[200] = "[Panph6,L3,(4,2,3),S8,CH-999,Testing_only]"; // xyz + //char str[200] = "[Panph6,L3,(3,4,5),S8,CH-999,Testing_only]"; // xyz + + // char str[200] = "[Panph6,L56,(0,0,31),S5,CH-999,Testing_only]"; + // char str[200] = "[Panph6,L21,(1136930,890765,1847934),S3,CH2414478110,Auriga_volume2]"; + // char str[200] = "[Panph6,L21,(1136930,890765,1847934),S3,CH-999,Auriga_volume2]"; + + char copy[200]; + const char s[20] = "[,L,(),S,CH,]"; + char *token; + + + size_t desc_level, desc_x, desc_y, desc_z,desc_size; + long long int desc_ch; + char desc_name[100]; + char desc_iden[8]; + int error_code; + + descriptor_read_in = 0; + + if (error_code = parse_and_validate_descriptor_(str)){ + + printf("Invalid descriptor %s\n",str); + printf("Descriptor error code %d\n",error_code); + } else { + printf("Valid descriptor parsed %s\n",str); + }; + + if (descriptor_read_in){ + printf("-----------------------------------------\n"); + printf("Descriptor order: %llu\n",descriptor_order); + printf("Descriptor base level: %llu\n",descriptor_base_level); + printf("Descriptor x-origin: %llu\n",descriptor_xorigin); + printf("Descriptor y-origin: %llu\n",descriptor_yorigin); + printf("Descriptor z-origin: %llu\n",descriptor_zorigin); + printf("Descriptor base size: %llu\n",descriptor_base_size); + printf("Descriptor check digit:%lld\n",descriptor_check_digit); + printf("Descriptor name %s\n", descriptor_name); + printf("-----------------------------------------\n"); + printf("Descriptor %s\n",full_descriptor); + printf("-----------------------------------------\n"); + + printf("Check digit %lld\n",compute_check_digit_()); + }; + + int verbose=0; + int flag_output_mode=0; + PANPHASIA_init_descriptor_(str,&verbose); + + size_t rel_lev = 3; + + size_t rel_orig_x = 33; //xyz + size_t rel_orig_y = 17; + size_t rel_orig_z = 9; + + //size_t rel_orig_x = 9; //zxy + //size_t rel_orig_y = 33; + //size_t 
rel_orig_z = 17; + + + // size_t rel_orig_x = 0; + // size_t rel_orig_y = 0; + // size_t rel_orig_z = 0; + + verbose = 0; + + if (error_code = PANPHASIA_init_level_(&rel_lev, + &rel_orig_x,&rel_orig_y,&rel_orig_z,&verbose)){ + printf("Error %d in initialing PANPHASIA_init_level_\n", + error_code); + return(error_code); + }; + + + size_t xstart = 3, ystart = 5, zstart = 4; + size_t xextent = 27, yextent = 29, zextent=40; // xyz + + + // size_t xstart = 4, ystart = 3, zstart = 5; + // size_t xextent = 40, yextent = 27, zextent=29; + + // size_t xstart = 0, ystart = 0, zstart = 0; + // size_t xextent = 4, yextent = 4, zextent=4; + + size_t copy_list[Nbasis]; + size_t ncopy=28; + + PAN_REAL *output_values = malloc(sizeof(PAN_REAL)*ncopy*xextent*yextent*zextent); + if (output_values==NULL){ + printf("Unable to allocate output_values \n"); + abort(); + }; + + + + for (int i=0; i0 - Do N iterations of the test with 1. + // In May 2020 ran with N=8000 - all tested passed. + // This provides a good test that the doubly periodic + // boundaries (of Panphasia itself, and the region + // covered by the descriptor) are working correctly. + + + test_propogation_of_moments_(0); + + + printf("===================================================\n"); + printf("Test of Threefry4x64 generator function - PASSED\n"); + printf("Test of inverse Threefry4x64 function - PASSED\n"); + printf("Test of propogation of moments - PASSED\n"); + printf("===================================================\n"); + + panphasia_rel_origin_set = 0; // Force user to set rel origin themselves. 
+}; + +int PANPHASIA_init_level_(size_t *rel_lev, + size_t *rel_orig_x, size_t *rel_orig_y, + size_t *rel_orig_z, int *verbose){ + + + if (*rel_lev>63) return(101); + if (descriptor_base_level+*rel_lev>63) return (102); + + if (*rel_orig_x>=(descriptor_base_size<<*rel_lev)) return(103); + if (*rel_orig_y>=(descriptor_base_size<<*rel_lev)) return(104); + if (*rel_orig_z>=(descriptor_base_size<<*rel_lev)) return(105); + + // Copy to global set of relative coordinates + + rel_level = *rel_lev; + rel_origin_x = *rel_orig_x; + rel_origin_y = *rel_orig_y; + rel_origin_z = *rel_orig_z; + rel_coord_max= descriptor_base_size<<*rel_lev; + + if (*verbose){ + printf("-----------------------------------------------------------------\n"); + printf("Initialising a Panphasia subgrid\n"); + printf("Relative level %llu\n",rel_level); + printf("Relative origin (%llu,%llu,%llu)\n",rel_origin_x,rel_origin_y,rel_origin_z); + printf("The maximum possible extent of this subgrid is %llu cells\n",rel_coord_max); + printf("-----------------------------------------------------------------\n"); + + }; + + panphasia_rel_origin_set = 1; + + return(0); + + +}; + + +//====================================================================================== +//====================================================================================== +//====================================================================================== +//====================================================================================== +//====================================================================================== +int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t*zstart, + size_t *xextent, size_t *yextent, size_t *zextent, + size_t *copy_list, + size_t *ncopy, void *output_values, int *flag_output_mode, int *verbose){ + + + size_t cumulative_cell_index[Nbasis+1]; + size_t level_max = descriptor_base_level+rel_level; + size_t cell_memory_to_allocate; + + + //ticks tic_tot; + + 
//ticks tic_start = getticks(); + + + + //================== Basic error checking of input parameters ========== + + if (panphasia_rel_origin_set!=1) return(200); + if (*xstart>=rel_coord_max) return(201); + if (*ystart>=rel_coord_max) return(202); + if (*zstart>=rel_coord_max) return(203); + + if ((*xextent>rel_coord_max)||(*xextent==0)) return(204); + if ((*yextent>rel_coord_max)||(*yextent==0)) return(205); + if ((*zextent>rel_coord_max)||(*zextent==0)) return(206); + + if ((*ncopy<0)||(*ncopy>Nbasis)) return(207); + + if ((copy_list[0]<0)||(copy_list[*ncopy-1]>=Nbasis)) return(208); + + + for (int i=1; i<*ncopy; i++) if (copy_list[i]<=copy_list[i-1]) return(209); + + + //======================================================================= + // Allocate storage for one dimensional x,y,z cell coordinate lists + //======================================================================= + +size_t nreturn_x = 2*(*xextent) + 200; +size_t nreturn_y = 2*(*yextent) + 200; +size_t nreturn_z = 2*(*zextent) + 200; + +size_t *ret_x_list_coords = malloc(sizeof(size_t)*nreturn_x); + if (ret_x_list_coords==NULL) return(220); +size_t *ret_y_list_coords = malloc(sizeof(size_t)*nreturn_y); + if (ret_y_list_coords==NULL) return(221); +size_t *ret_z_list_coords = malloc(sizeof(size_t)*nreturn_z); + if (ret_z_list_coords==NULL) return(222); + +long long int *child_pointer_x = malloc(sizeof(size_t)*2*nreturn_x); + if (child_pointer_x==NULL) return(223); +long long int *child_pointer_y = malloc(sizeof(size_t)*2*nreturn_y); + if (child_pointer_x==NULL) return(224); +long long int *child_pointer_z = malloc(sizeof(size_t)*2*nreturn_z); + if (child_pointer_z==NULL) return(225); + +size_t level_begin_x[64],level_count_x[64]; +size_t level_begin_y[64],level_count_y[64]; +size_t level_begin_z[64],level_count_z[64]; + +size_t *index_perm_x = malloc(sizeof(size_t)*nreturn_x); + if (index_perm_x==NULL) return(226); +size_t *index_perm_y = malloc(sizeof(size_t)*nreturn_y); + if 
(index_perm_y==NULL) return(226); +size_t *index_perm_z = malloc(sizeof(size_t)*nreturn_z); + if (index_perm_z==NULL) return(226); + +size_t *list_cell_x_coord = malloc(sizeof(size_t)*(*xextent)); + if (list_cell_x_coord==NULL) return(227); +size_t *list_cell_y_coord = malloc(sizeof(size_t)*(*yextent)); + if (list_cell_y_coord==NULL) return(228); +size_t *list_cell_z_coord = malloc(sizeof(size_t)*(*zextent)); + if (list_cell_z_coord==NULL) return(229); + +//================================================================ +// Make x,y,z lists of cell coordinates // +//================================================================ +{ + for (size_t i =0; i<*xextent; i++){ + size_t xabs,yabs,zabs; + calc_absolute_coordinates(*xstart+i,*ystart,*zstart,&xabs,&yabs,&zabs); + list_cell_x_coord[i] = xabs; + }; + + for (size_t i =0; i<*yextent; i++){ + size_t xabs,yabs,zabs; + calc_absolute_coordinates(*xstart,*ystart+i,*zstart,&xabs,&yabs,&zabs); + list_cell_y_coord[i] = yabs; + }; + + for (size_t i =0; i<*zextent; i++){ + size_t xabs,yabs,zabs; + calc_absolute_coordinates(*xstart,*ystart,*zstart+i,&xabs,&yabs,&zabs); + list_cell_z_coord[i] = zabs; + }; + +}; +//================================================================ +// Generate 1-D binary trees for each of the x,y,z cuboid dimensions +//================================================================ +{ +int error_code; + + +if (error_code = return_binary_tree_cell_lists(level_max, list_cell_x_coord, + *xextent, ret_x_list_coords, nreturn_x, child_pointer_x, + level_count_x, level_begin_x, index_perm_x)) return(error_code); +if (error_code = return_binary_tree_cell_lists(level_max, list_cell_y_coord, + *yextent, ret_y_list_coords, nreturn_y, child_pointer_y, + level_count_y, level_begin_y, index_perm_y)) return(error_code); +if (error_code = return_binary_tree_cell_lists(level_max, list_cell_z_coord, + *zextent, ret_z_list_coords, nreturn_z, child_pointer_z, + level_count_z, level_begin_z, index_perm_z)) 
return(error_code); + +}; + //=================================================================== + // Allocate memory to store all the cell properties + //=================================================================== + { + size_t number_of_cells = 0; + + for(int i=level_max; i>=0; i--) { + cumulative_cell_index[i] = number_of_cells; + number_of_cells += level_count_x[i]*level_count_y[i]*level_count_z[i]; + }; + + + if (*verbose) printf("Total number cells: %llu \n",number_of_cells); + + cell_memory_to_allocate = sizeof(PAN_REAL) * number_of_cells * Nbasis; + }; + + PAN_REAL *working_space = malloc(cell_memory_to_allocate); + if (working_space==NULL) return (210); + + //======================================================================================== + // Loop over octree starting at the root, for all relevant cells at each level + //======================================================================================== + size_t total_number_cells=0; + size_t num_cell_compute=0; + size_t num_level_max_cells=0; + size_t total_num_children=0; +{ + size_t cell_index,j1,j2,j3; + size_t child_cells[8]; + size_t xoffset,yoffset,zoffset; + size_t ix,iy,iz; + size_t xco,yco,zco; + size_t child_index,work_index,selected_child_index; + size_t i; + + + + PAN_REAL parent[Nbasis]; + PAN_REAL children[8*Nbasis]; + + if (level_max==0) return_root_legendre_coefficients_(working_space); // Return root cell coefficients + + +#ifdef USE_OPENMP + double start, end; + start = omp_get_wtime(); + if (*verbose) printf("Start ...\n"); +#endif + + + + for (size_t level=0; level < level_max; level++){ +#ifdef USE_OPENMP +#pragma omp parallel for collapse(3) \ + private (cell_index,xoffset,yoffset,zoffset,j1,j2,j3,ix,iy,iz, \ + xco,yco,zco,child_index,work_index,selected_child_index,i, \ + parent,children) +#endif + for (int cell_x=0; cell_x < level_count_x[level]; cell_x++) + for (int cell_y=0; cell_y < level_count_y[level]; cell_y++) + for (int cell_z = 0; cell_z < 
level_count_z[level]; cell_z++){ + + cell_index = cumulative_cell_index[level] + cell_x*level_count_y[level]*level_count_z[level]+ + cell_y*level_count_z[level] + cell_z; + + xoffset = level_begin_x[level] + cell_x; + yoffset = level_begin_y[level] + cell_y; + zoffset = level_begin_z[level] + cell_z; + + j1 = ret_x_list_coords[xoffset]; + j2 = ret_y_list_coords[yoffset]; + j3 = ret_z_list_coords[zoffset]; + + if (level==0){ return_root_legendre_coefficients_(parent); // Root cell parent information + }else{ + for(i=0; i1) printf("Cell: L%llu %llu %llu %llu\n",level,j1,j2,j3); + + + }; // z/y/x-coordinate/level + + + // if (flag_nochildren!=0) return(211); //All cells should have at least one child +}; // End loop over levels + + +#ifdef USE_OPENMP + + end = omp_get_wtime(); + + if (*verbose) printf("End ...\n"); + + + double cpu_time_used = ((double) (end - start)); + + + if (*verbose) printf("Time in OMP Section = %lf seconds \n",cpu_time_used); + +#endif + + +}; + +//======================================================================================== +// Assign data from work_space to the input array +//======================================================================================== +{ + + + + FFTW_REAL *ptr_real = output_values; + FFTW_COMPLEX *ptr_cmplx = output_values; + size_t zdimension = (*flag_output_mode==2) ? 
*zextent + 2 : *zextent; // For R2C pad by two in z-dimension + + + //printf("zdimension = %ld\n",zdimension); + + // PAN_COMPLEX *ptr_cplx; + // *ptr_real =(* PAN_REAL) *output_values; + // *ptr_cplx = output_values + + for (size_t xco=0;xco<*xextent;xco++)for(size_t yco=0;yco<*yextent;yco++)for(size_t zco=0; zco<*zextent;zco++){ + size_t xloc = index_perm_x[xco], yloc = index_perm_y[yco], zloc = index_perm_z[zco]; + + size_t index = Nbasis*( xco*(*yextent)*(*zextent) + yco*(*zextent) + zco); + size_t out_v_index = *ncopy*(xloc*(*yextent)*zdimension + yloc*zdimension + zloc); + + + if (*flag_output_mode==1){ + + for (size_t i=0; i<*ncopy; i++) ptr_cmplx[out_v_index+i] = (FFTW_COMPLEX) working_space[index+copy_list[i]]; + + }else{ + + for (size_t i=0; i<*ncopy; i++) ptr_real[out_v_index+i] = working_space[index+copy_list[i]]; + + + }; + }; + + + }; + + +//===========================================(============================================== +// Free all memory (in order of calls to malloc above) +//========================================================================================= + free(ret_x_list_coords); + free(ret_y_list_coords); + free(ret_z_list_coords); + + free(child_pointer_x); + free(child_pointer_y); + free(child_pointer_z); + + free(index_perm_x); + free(index_perm_y); + free(index_perm_z); + + + free(list_cell_x_coord); + free(list_cell_y_coord); + free(list_cell_z_coord); + + + free(working_space); + + //tic_tot = getticks()-tic_start; + + + + // if (*verbose) printf("Total child cells at deepest level %llu \n",num_level_max_cells); + //if (*verbose) printf("Total number of cells computed %llu \n",num_cell_compute); + //if (*verbose) printf("Total number of child cells %llu \n",total_num_children); + //if (*verbose) printf("Time to compute %llu cells at level %llu: %.3f %s \n",num_level_max_cells, + // level_max, clocks_from_ticks(tic_tot), clocks_getunit()); + 
+//======================================================================================= + return(0); +//======================================================================================= +}; +//====================================================================================== +//====================================================================================== +//====================================================================================== +//====================================================================================== +//====================================================================================== + + + + + +int parse_and_validate_descriptor_(char *descriptor){ + + + char *token; + const char split[20] = "[,()]"; + char copy[300]; + size_t desc_order, desc_level, desc_x, desc_y, desc_z,desc_size; + char desc_name[100]; + size_t desc_kk_limit = 0; + long long int desc_ch,comp_ch; + int kk_limit_set = 0; + int nelement = 0; + char descriptor_as_read[300]; + + strcpy(copy,descriptor); + + + token = strtok(copy, split); + + + while( token != NULL ) { + nelement++; + + // Read in compulsory elements + + switch(nelement){ + case 1: + if (sscanf(token,"Panph%llu",&desc_order)!=1) return (440001); + break; + case 2: + if (sscanf(token,"L%llu",&desc_level)!=1) return 440002; + break; + case 3: + if (sscanf(token,"%llu",&desc_x)!=1) return 440003; + break; + case 4: + if (sscanf(token,"%llu",&desc_y)!=1) return 440004; + break; + case 5: + if (sscanf(token,"%llu",&desc_z)!=1) return 440005; + break; + case 6: + if (sscanf(token,"S%llu",&desc_size)!=1) return 440005; + break; + case 7: + if (sscanf(token,"KK%lld",&desc_kk_limit)==1) { + kk_limit_set=1; + token = strtok(NULL, split); + }; + if (sscanf(token,"CH%lld",&desc_ch)!=1) return 440006; + break; + case 8: + if (sscanf(token,"%s",&desc_name)!=1) return 440007; + break; + }; + token = strtok(NULL, split); + }; + + + if (kk_limit_set==0){ + 
sprintf(descriptor_as_read,"[Panph%d,L%llu,(%llu,%llu,%llu),S%llu,CH%lld,%s]", + desc_order,desc_level,desc_x,desc_y,desc_z,desc_size,desc_ch,desc_name); + } else{ + sprintf(descriptor_as_read,"[Panph%d,L%llu,(%llu,%llu,%llu),S%llu,KK%lld,CH%lld,%s]", + desc_order,desc_level,desc_x,desc_y,desc_z,desc_size,desc_kk_limit,desc_ch,desc_name); + + }; + + if (strcmp(descriptor,descriptor_as_read)){ + printf("Error - descriptor mismatch\n"); + printf("As read in: %s\n",descriptor_as_read); + printf(" %s\n",descriptor); + }; + + + // Valid format descriptor has been passed - store values + + + + descriptor_order = desc_order; + descriptor_base_level = desc_level; + descriptor_xorigin = desc_x; + descriptor_yorigin = desc_y; + descriptor_zorigin = desc_z; + descriptor_base_size = desc_size; + descriptor_kk_limit = desc_kk_limit; + descriptor_check_digit = desc_ch; + strcpy(descriptor_name, desc_name); + strcpy(full_descriptor,descriptor); + descriptor_read_in = 1; + + comp_ch = compute_check_digit_(); // check the check digit + + if ((desc_ch!=-999)&&(desc_ch!=comp_ch)){ + descriptor_read_in = 0; + printf("Check digit read in %llu\n Check digit expected %llu\n",desc_ch,comp_ch); + return (44008); + }; + + + return(0); + +}; + + + +void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel,size_t *xabs, size_t *yabs,size_t *zabs){ + + *xabs = ((descriptor_xorigin<=cumulative_cell_index[descriptor_base_level+rel_level+1]) return(301); + + size_t cell_level; + for (cell_level = descriptor_base_level+rel_level; + cell_id < cumulative_cell_index[cell_level];cell_level--); + + size_t local_id = cell_id - cumulative_cell_index[cell_level]; + + *cell_x = local_id/(cuboid_y_dimen[cell_level]*cuboid_z_dimen[cell_level]); + *cell_y = (local_id - *cell_x*cuboid_y_dimen[cell_level]*cuboid_z_dimen[cell_level])/cuboid_z_dimen[cell_level]; + *cell_z = local_id%cuboid_z_dimen[cell_level]; + + //printf("Cell level %llu x %llu y %llu z %llu\n",cell_level,*cell_x,*cell_y,*cell_z); + 
+ + + return(0); + +}; + + + + +int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinates, + size_t extent, size_t *return_tree_list_coordinates, size_t nreturn, + long long int *child_pointer, size_t *level_count, + size_t *level_begin, size_t *index_perm){ + +if (extent==0) return(401); +if (nreturn<2*extent+192) return(402); + +for (size_t i=0; i<2*nreturn;i++) child_pointer[i]=-1; + +{ size_t stride=1; + for(size_t i=0; i0; level--){ + + offset =level_begin[level]+level_count[level]; + counter = 0; + + abs_coord = return_tree_list_coordinates[level_begin[level]]; + + return_tree_list_coordinates[offset] = abs_coord>>1; + child_pointer[2*offset + abs_coord%2] = level_begin[level]; + + for(size_t cell = 1; cell>1 == return_tree_list_coordinates[offset+counter]){ + child_pointer[2*offset + 2*counter + abs_coord%2] = level_begin[level]+cell; + }else{ + counter++; + return_tree_list_coordinates[offset+counter] = abs_coord>>1; + child_pointer[2*offset + 2*counter + abs_coord%2] = level_begin[level]+cell; + }; + + }; //cell loop + + + level_count[level-1] = ++counter; + level_begin[level-1] = level_begin[level] + level_count[level]; + }; // level loop + +return(0); + +}; +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +// +// Test code for checking the appropriate moments are preserved +// between levels in Panphasia +// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// 
+///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// + +#include + + + +void integrate_cell( int, int, int, size_t, size_t, size_t, FFTW_REAL * , double *); +int compute_panphasia_(double, double, double, size_t, size_t, size_t, FFTW_REAL *, double *); + +void test_cell_moments(char *,size_t, size_t, size_t, size_t, size_t, double *); + +//////////////////////////////////////////////////////////////////////////////// +void test_moments_(){ + + + int lev = 10; + char descriptor_demo[300]="Hello!"; + printf("Demo string %s\n",descriptor_demo); + + // descriptor_pair_generate_();//, descriptor_demo); + printf("Parameters: %s\n",descriptor_demo); + + + + + size_t nlevel=1; + double coefficients1[Nbasis]; + double coefficients2[Nbasis]; + + double max_diff2=0.0; + double rms_diff2=0.0; + + + char descriptor[200]; + + size_t const xco_full = 0x7504f333f9de6497; + size_t const yco_full = 0x67ea73c992a3355c; + size_t const zco_full = 0x5ab50a5892e98768; + + size_t xco = 0; size_t yco = 0; size_t zco=0; + + verbose_warnings_only=1; // Minimize output to screen. 
+ + for (size_t level=0; level<63; level++){ + + xco = (xco_full)>>(63-level); + yco = (yco_full)>>(63-level); + zco = (zco_full)>>(63-level); + + sprintf(descriptor,"[Panph6,L%ld,(%llu,%llu,%llu),S1,CH-999,test]",level,xco,yco,zco); + // printf("%s\n",descriptor); + + test_cell_moments(descriptor,0,0,0,0,1,coefficients1); + + test_cell_moments(descriptor,1,0,0,0,2,coefficients2); + + for (int i=0; imax_diff2) max_diff2=diff2; + rms_diff2+=diff2; + }; + + rms_diff2/=(double)Nbasis; + + // for (int i=0; i1.e-12)||(rms_diff2>1.e-12)){ + printf("Moments not accurately recovered at single precision\n"); abort(); + }; + + }else{ + + if ((max_diff2>1.e-24)||(rms_diff2>1.e-24)){ + printf("Moments not accurately recovered at double precision\n"); abort(); + }; + + }; + + //printf("Acceptable differences: %e RMS difference %e\n",sqrt(max_diff2),sqrt(rms_diff2)); + +}; + + printf("Completed moment test successfully.\n"); + + +}; + +void test_cell_moments(char root_descriptor[200], size_t rel_lev, size_t rel_orig_x, + size_t rel_orig_y, size_t rel_orig_z, size_t extent, double *coeff ){ + +int error_code; +int verbose = 0; + int flag_output_mode=0; + +PANPHASIA_init_descriptor_(root_descriptor,&verbose); + + + verbose = 0; + + + if (error_code = PANPHASIA_init_level_(&rel_lev, + &rel_orig_x,&rel_orig_y,&rel_orig_z,&verbose)){ + printf("Error %d in initialing PANPHASIA_init_level_\n", + error_code); + }; + + size_t xstart = 0, ystart = 0, zstart = 0; + + size_t xextent, yextent, zextent; + + xextent = extent; yextent=extent; zextent=extent; + size_t copy_list[Nbasis]; + for (int i=0; i=10){printf("Higher order Gaussian Quadrature needed!\n");abort();}; + + double a = 0.0; + double b = 1.0; + + double middle = 0.5*(b+a); + double range = 0.5*(b-a); + + double sum[Nbasis] = {0.0}; + + double test_sum = 0.0; + + for (int i=0; i<10; i++){ + double xp = middle + range*abscissa[i]; + for (int j=0; j<10; j++){ + double yp = middle + range*abscissa[j]; + for (int k=0; k<10; k++){ + 
double zp = middle + range*abscissa[k]; + //////////////////////////////////////////////////////////////////////////////////////// + + double panphasia_value; + double xv = (double)ix + xp; + double yv = (double)iy + yp; + double zv = (double)iz + zp; + + if (compute_panphasia_(xv,yv,zv,xextent,yextent,zextent,output_values, + &panphasia_value)==1){ + printf("Call to compute_panphasia_ out of range \n");abort(); + }; + + + double uq,vq,wq; + + uq = 2.0*(xv/(double)yextent)-1.0; + vq = 2.0*(yv/(double)yextent)-1.0; + wq = 2.0*(zv/(double)yextent)-1.0; + + int p = p_order; + + + double lgp_uq[p_order+1]; + double lgp_vq[p_order+1]; + double lgp_wq[p_order+1]; + + gsl_sf_legendre_Pl_array(p,uq,lgp_uq); + gsl_sf_legendre_Pl_array(p,vq,lgp_vq); + gsl_sf_legendre_Pl_array(p,wq,lgp_wq); + + for (int ii=0; ii=(double)xextent)) return (1); + if ((y<0)||(y>=(double)yextent)) return (1); + if ((z<0)||(z>=(double)zextent)) return (1); + + int ix = (int)x; + int iy = (int)y; + int iz = (int)z; + + double up = 2.0*(x-ix)-1.0; + double vp = 2.0*(y-iy)-1.0; + double wp = 2.0*(z-iz)-1.0; + + double lgp_up[p_order+1]; + double lgp_vp[p_order+1]; + double lgp_wp[p_order+1]; + + int p = p_order; + + gsl_sf_legendre_Pl_array(p,up,lgp_up); + gsl_sf_legendre_Pl_array(p,vp,lgp_vp); + gsl_sf_legendre_Pl_array(p,wp,lgp_wp); + + for (int i=0; i +#include +#include +#include +#include +#include +#include + + + +#include "PAN_FFTW3.h" +#include "panphasia_functions.h" + +extern size_t descriptor_base_size; + +#ifdef USE_OPENMP +#include +int threads_ok; +int number_omp_threads = 1; +#endif + + + +int main(int argc, char **argv) +{ + +int verbose=0; +int error; +size_t x0=0, y0=0, z0=0; +size_t rel_level; +char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; + +#ifdef USE_OPENMP + omp_set_num_threads(number_omp_threads); + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); + threads_ok = provided >= MPI_THREAD_FUNNELED; + if 
(threads_ok) threads_ok = fftw_init_threads(); + fftw_mpi_init(); + int num_threads = number_omp_threads ; + if (threads_ok){ + fftw_plan_with_nthreads(num_threads); + }else{ + printf("Failure to initialise threads ...\n"); + MPI_Finalize(); + }; + + printf("OpenMP threads enabled with FFTW. Number of threads %d\n",fftw_planner_nthreads()); +#else + MPI_Init(&argc, &argv); +#endif + + PANPHASIA_init_descriptor_(descriptor,&verbose); + + + rel_level = 6; //Set size of test dataset + + +if (error=PANPHASIA_init_level_(&rel_level,&x0,&y0,&z0,&verbose)){ + printf("Abort: PANPHASIA_init_level_ :error code %d\n",error); + abort(); +}; + +//======================= FFTW ============================== + +fftw_mpi_init(); + +ptrdiff_t alloc_local, local_n0, local_0_start; + +ptrdiff_t N0 = descriptor_base_size< +#include +#include +#include +#include +#include +#include + +#include "PAN_FFTW3.h" +#include "panphasia_functions.h" + +#ifdef USE_OPENMP +#include +#endif + + +extern const int Nbasis; +extern const int irank_p[3][84]; + +extern size_t descriptor_order; +extern size_t descriptor_kk_limit; + + +int PANPHASIA_compute_kspace_field_(size_t relative_level, ptrdiff_t N0_grid, + ptrdiff_t local_n0_return, ptrdiff_t local_0_start_return, + FFTW_COMPLEX *return_field) +{ + +size_t copy_list[Nbasis]; +int fdim=1; +int pmax = 6; +size_t ncopy = (pmax+1)*(pmax+2)*(pmax+3)/6; +size_t xorigin=local_0_start_return, yorigin=0, zorigin=0; +size_t xextent =local_n0_return, yextent = N0_grid, zextent = N0_grid; +int verbose = 1; +int flag_output_mode=2; +int error; +ptrdiff_t size_to_alloc; +FFTW_PLAN output_coeff_forward_plan; + + +if (pmax>descriptor_order) return(100000); + +for (size_t i=0; infft_dim/2) ? + ix + local_0_start_return - nfft_dim : ix + local_0_start_return; + ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy; + kz = (iz > nfft_dim/2) ? 
iz-nfft_dim : iz; + + if ( (kx==nfft_dim/2)||(ky==nfft_dim/2)||(kz==nfft_dim/2)){ + // Set Nyquist modes to zero - not used by IC_Gen anyway. + phase_shift_and_scale = 0.0; //1.0/pow((double)nfft_dim,1.5); // No phase shift + }else{ + phase_shift_and_scale = + cexp( (-I)*pi*(double)(kx + ky + kz)/(double)nfft_dim)/pow((double)nfft_dim,1.5); + }; + + return_field[index1] *= phase_shift_and_scale; + + }; + + + }; + + printf("Reached here 11!\n"); + + + + // Rescale selected Fourier modes to unit amplitude. + // By default this part is not executed. + + if (descriptor_kk_limit>0){ + size_t index1; + complex weight; + size_t ksquared; + int kx,ky,kz; +#ifdef USE_OPENMP +#pragma omp parallel for collapse(3) \ + private (index1,kx,ky,kz,ksquared,weight) +#endif + for(int ix=0;ixnfft_dim/2) ? + ix + local_0_start_return - nfft_dim : ix + local_0_start_return; + ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy; + kz = (iz > nfft_dim/2) ? iz-nfft_dim : iz; + ksquared = kx*kx + ky*ky + kz*kz; + if (ksquared<=descriptor_kk_limit){ + index1 = ix*N0_grid*(N0_grid/2+1) + iy*(N0_grid/2+1) + iz; + weight = cabs(return_field[index1]); + return_field[index1] /= weight; + }; + }; + + }; + +printf("Reached here 12!\n"); + + +if (nfft_dim <128){ + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + char filename[100]; + sprintf(filename,"output_k_space_field.%d",rank); + + int xuse,yuse,zuse; + FFTW_REAL sign; + + FILE *fp; + + fp = fopen(filename,"w"); + + for (int ix=0; ixnfft_dim/2){ + xuse = (nfft_dim-ix)%nfft_dim; + yuse = (nfft_dim-iy)%nfft_dim; + zuse = (nfft_dim-iz)%nfft_dim; + sign = -1.0; + }else{ + xuse = ix; + yuse = iy; + zuse = iz; + sign = 1.0; + }; + + int index = xuse*N0_grid*(N0_grid/2+1) + yuse*(N0_grid/2+1) + zuse; + fprintf(fp,"%6d%6d%6d %14.8lf %14.8lf\n",ix+local_0_start_return,iy,iz, + creal(return_field[index]),cimag(sign*return_field[index])); + }; + fclose(fp); + + + + }; + + + printf("Reached here 14!\n"); + +for (int j=0; j<4; j++){ + for (int i=0; i<4; i++) 
printf("(%lf %lf) ",creal(return_field[j+ i*ncopy]), + cimag(return_field[j + i*ncopy])); printf("\n"); + }; + + + +FFTW_FREE(output_coefficients); +FFTW_FREE(sph_bessel_coeff); + + + FFTW_DESTROY_PLAN(output_coeff_forward_plan); + + printf("Reached here! 3 \n"); + return(0); + + + }; + + +//========================================================================================== +//========================================================================================== + + + + + diff --git a/external/panphasia_ho/panphasia_functions.h b/external/panphasia_ho/panphasia_functions.h new file mode 100644 index 0000000..363342b --- /dev/null +++ b/external/panphasia_ho/panphasia_functions.h @@ -0,0 +1,93 @@ + +///////////////////////////////////////////////// +// By default Panphasia is computed at single +// precision. To override this define PAN_DOUBLE + +#define PAN_DOUBLE_PRECISION 8 + + +#ifndef PAN_DOUBLE_PRECISION + +#define PAN_REAL float +#define PAN_COMPLEX float complex + +#else + +#define PAN_REAL double +#define PAN_COMPLEX double complex + +#endif + + + + +///////////////////////////////////////////////////////////////////// + + + +void return_uniform_pseudo_rands_threefry4x64_(size_t l,size_t j1,size_t j2,size_t j3, + PAN_REAL *panphasia_randoms, size_t seed_value, + size_t allow_non_zero_seed_safety_catch); + +void box_muller_(PAN_REAL *unif_real,PAN_REAL *gvar); + +void solve_panphasia_cell_(PAN_REAL *input_vec_parent, PAN_REAL *input_vec_children, PAN_REAL *output_cell_vec, int control_flag); + +void threefry4x64_test_(int verbose); +void inverse_threefry4x64_test_(int verbose); +void set_panphasia_key_(int verbose); +void check_panphasia_key_(int verbose); + +void PANPHASIA_init_descriptor_checks(); + +void speed_test_(); +void speed_test2_(); +void check_randoms_(); +void test_random_dist_(size_t shift); +void compute_all_properties_of_a_panphasia_cell_(size_t *level, size_t *j1, size_t *j2, size_t *j3, + PAN_REAL *gauss_rand_parent, PAN_REAL 
*legendre_rand); +void return_root_legendre_coefficients_(PAN_REAL *root); + + +int parse_and_validate_descriptor_(char *); +int demo_descriptor_(); +long long int compute_check_digit_(); +int PANPHASIA_init_descriptor_(char *descriptor, int *verbose); +int PANPHASIA_init_level_(size_t *oct_level, size_t *rel_orig_x, size_t *rel_orig_y,size_t *rel_orig_z,int *verbose); + + +int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t*zstart, + size_t *xextent, size_t *yextent, size_t *zextend, + size_t *copy_list, + size_t *ncopy, void *output_values, int *flag_output_mode, int *verbose); + +void test_moments_(); +void test_propogation_of_moments_(int iterations); +void test_cell_moments(char *,size_t, size_t, size_t, size_t, size_t, double *); + +void spherical_bessel_(int *, double *, double *); + + + + + + +void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel,size_t *xabs, size_t *yabs,size_t *zabs); + +int cell_information(size_t cell_id, size_t *cumulative_cell_index, size_t *cuboid_x_dimen, + size_t *cuboid_y_dimen,size_t *cuboid_z_dimen, size_t *cell_lev, + size_t *cell_x, size_t *cell_y, size_t *cell_z, size_t number_children, + size_t *child_cell_indices); + +int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinates, + size_t extent, size_t *return_tree_list_coordinates, + size_t nreturn, + long long int *child_pointer, size_t *level_count, size_t *level_begin, size_t *index_perm); + + + + + +void compute_sph_bessel_coeffs(int, int, int, int, double complex *); + +int PANPHASIA_compute_kspace_field_(size_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, FFTW_COMPLEX *); diff --git a/external/panphasia_ho/threefry.h b/external/panphasia_ho/threefry.h new file mode 100644 index 0000000..6cff11d --- /dev/null +++ b/external/panphasia_ho/threefry.h @@ -0,0 +1,874 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _threefry_dot_h_ +#define _threefry_dot_h_ +#include "features/compilerfeatures.h" +#include "array.h" + +/** \cond HIDDEN_FROM_DOXYGEN */ +/* Significant parts of this file were copied from + from: + Skein_FinalRnd/ReferenceImplementation/skein.h + Skein_FinalRnd/ReferenceImplementation/skein_block.c + + in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip + + This file has been modified so that it may no longer perform its originally + intended function. 
If you're looking for a Skein or Threefish source code, + please consult the original file. + + The original file had the following header: +************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** + +*/ + +/* See comment at the top of philox.h for the macro pre-process + strategy. */ + +/* Rotation constants: */ +enum r123_enum_threefry64x4 { + /* These are the R_256 constants from the Threefish reference sources + with names changed to R_64x4... */ + R_64x4_0_0=14, R_64x4_0_1=16, + R_64x4_1_0=52, R_64x4_1_1=57, + R_64x4_2_0=23, R_64x4_2_1=40, + R_64x4_3_0= 5, R_64x4_3_1=37, + R_64x4_4_0=25, R_64x4_4_1=33, + R_64x4_5_0=46, R_64x4_5_1=12, + R_64x4_6_0=58, R_64x4_6_1=22, + R_64x4_7_0=32, R_64x4_7_1=32 +}; + +enum r123_enum_threefry64x2 { + /* + // Output from skein_rot_search: (srs64_B64-X1000) + // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57 + // Start: Tue Mar 1 10:07:48 2011 + // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format + */ + R_64x2_0_0=16, + R_64x2_1_0=42, + R_64x2_2_0=12, + R_64x2_3_0=31, + R_64x2_4_0=16, + R_64x2_5_0=32, + R_64x2_6_0=24, + R_64x2_7_0=21 + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 8 [ 8 8 8 8 ] + // 6 rounds: minHW = 16 [ 16 16 16 16 ] + // 7 rounds: minHW = 32 [ 32 32 32 32 ] + // 8 rounds: minHW = 64 [ 64 64 64 64 ] + // 9 rounds: minHW = 64 [ 64 64 64 64 ] + //10 rounds: minHW = 64 [ 64 64 64 64 ] + //11 rounds: minHW = 64 [ 64 64 64 64 ] */ +}; + +enum r123_enum_threefry32x4 { + /* Output from skein_rot_search: (srs-B128-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. 
rounds = 8, minHW_or=28 + // Start: Mon Aug 24 22:41:36 2009 + // ... + // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */ + R_32x4_0_0=10, R_32x4_0_1=26, + R_32x4_1_0=11, R_32x4_1_1=21, + R_32x4_2_0=13, R_32x4_2_1=27, + R_32x4_3_0=23, R_32x4_3_1= 5, + R_32x4_4_0= 6, R_32x4_4_1=20, + R_32x4_5_0=17, R_32x4_5_1=11, + R_32x4_6_0=25, R_32x4_6_1=10, + R_32x4_7_0=18, R_32x4_7_1=20 + + /* 4 rounds: minHW = 3 [ 3 3 3 3 ] + // 5 rounds: minHW = 7 [ 7 7 7 7 ] + // 6 rounds: minHW = 12 [ 13 12 13 12 ] + // 7 rounds: minHW = 22 [ 22 23 22 23 ] + // 8 rounds: minHW = 31 [ 31 31 31 31 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + +}; + +enum r123_enum_threefry32x2 { + /* Output from skein_rot_search (srs32x2-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28 + // Start: Tue Jul 12 11:11:33 2011 + // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. 
blkSize= 64].format */ + R_32x2_0_0=13, + R_32x2_1_0=15, + R_32x2_2_0=26, + R_32x2_3_0= 6, + R_32x2_4_0=17, + R_32x2_5_0=29, + R_32x2_6_0=16, + R_32x2_7_0=24 + + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 6 [ 6 8 6 8 ] + // 6 rounds: minHW = 9 [ 9 12 9 12 ] + // 7 rounds: minHW = 16 [ 16 24 16 24 ] + // 8 rounds: minHW = 32 [ 32 32 32 32 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + }; + +enum r123_enum_threefry_wcnt { + WCNT2=2, + WCNT4=4 +}; + +#if R123_USE_64BIT +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) +{ + return (x << (N & 63)) | (x >> ((64-N) & 63)); +} +#endif + +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) +{ + return (x << (N & 31)) | (x >> ((32-N) & 31)); +} + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) +#define SKEIN_KS_PARITY32 0x1BD11BDA + +/** \endcond */ + +#ifndef THREEFRY2x32_DEFAULT_ROUNDS +#define THREEFRY2x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY2x64_DEFAULT_ROUNDS +#define THREEFRY2x64_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x32_DEFAULT_ROUNDS +#define THREEFRY4x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x64_DEFAULT_ROUNDS +#define THREEFRY4x64_DEFAULT_ROUNDS 20 +#endif + +#define _threefry2x_tpl(W) \ +typedef struct r123array2x##W threefry2x##W##_ctr_t; \ +typedef struct r123array2x##W threefry2x##W##_key_t; \ +typedef struct r123array2x##W threefry2x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE 
R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + uint##W##_t X0,X1; \ + uint##W##_t ks0, ks1, ks2; \ + R123_ASSERT(Nrounds<=32); \ + ks2 = SKEIN_KS_PARITY##W; \ + ks0 = k.v[0]; \ + X0 = in.v[0] + ks0; \ + ks2 ^= ks0; \ +\ + ks1 = k.v[1]; \ + X1 = in.v[1] + ks1; \ + ks2 ^= ks1; \ + \ + if(Nrounds>0){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \ + if(Nrounds>1){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \ + if(Nrounds>2){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \ + if(Nrounds>3){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X0 += ks1; X1 += ks2; \ + X1 += 1; /* X.v[2-1] += r */ \ + } \ + if(Nrounds>4){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \ + if(Nrounds>5){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \ + if(Nrounds>6){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \ + if(Nrounds>7){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X0 += ks2; X1 += ks0; \ + X1 += 2; \ + } \ + if(Nrounds>8){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \ + if(Nrounds>9){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \ + if(Nrounds>10){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \ + if(Nrounds>11){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X0 += ks0; X1 += ks1; \ + X1 += 3; \ + } \ + if(Nrounds>12){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \ + if(Nrounds>13){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \ + if(Nrounds>14){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \ + if(Nrounds>15){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= 
X0; } \ + if(Nrounds>15){ \ + /* InjectKey(r=4) */ \ + X0 += ks1; X1 += ks2; \ + X1 += 4; \ + } \ + if(Nrounds>16){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \ + if(Nrounds>17){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \ + if(Nrounds>18){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \ + if(Nrounds>19){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \ + if(Nrounds>19){ \ + /* InjectKey(r=5) */ \ + X0 += ks2; X1 += ks0; \ + X1 += 5; \ + } \ + if(Nrounds>20){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \ + if(Nrounds>21){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \ + if(Nrounds>22){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \ + if(Nrounds>23){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \ + if(Nrounds>23){ \ + /* InjectKey(r=6) */ \ + X0 += ks0; X1 += ks1; \ + X1 += 6; \ + } \ + if(Nrounds>24){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \ + if(Nrounds>25){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \ + if(Nrounds>26){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \ + if(Nrounds>27){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \ + if(Nrounds>27){ \ + /* InjectKey(r=7) */ \ + X0 += ks1; X1 += ks2; \ + X1 += 7; \ + } \ + if(Nrounds>28){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \ + if(Nrounds>29){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \ + if(Nrounds>30){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \ + if(Nrounds>31){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \ + if(Nrounds>31){ \ + /* InjectKey(r=8) */ \ + X0 += ks2; X1 += ks0; \ + X1 += 8; \ + } \ + threefry2x##W##_ctr_t ret={{X0, X1}}; \ + return ret; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ 
+R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \ +} + + +#define _threefry4x_tpl(W) \ +typedef struct r123array4x##W threefry4x##W##_ctr_t; \ +typedef struct r123array4x##W threefry4x##W##_key_t; \ +typedef struct r123array4x##W threefry4x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + uint##W##_t X0, X1, X2, X3; \ + uint##W##_t ks0, ks1, ks2, ks3, ks4; \ + R123_ASSERT(Nrounds<=72); \ + ks4 = SKEIN_KS_PARITY##W; \ + ks0 = k.v[0]; \ + X0 = in.v[0] + ks0; \ + ks4 ^= ks0; \ + \ + ks1 = k.v[1]; \ + X1 = in.v[1] + ks1; \ + ks4 ^= ks1; \ + \ + ks2 = k.v[2]; \ + X2 = in.v[2] + ks2; \ + ks4 ^= ks2; \ + \ + ks3 = k.v[3]; \ + X3 = in.v[3] + ks3; \ + ks4 ^= ks3; \ + \ + if(Nrounds>0){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>1){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>2){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ + if(Nrounds>3){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \ + X3 += 1; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>4){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ 
+ X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>5){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>6){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>7){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \ + X3 += 2; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>8){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>9){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>10){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ + if(Nrounds>11){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \ + X3 += 3; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>12){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>13){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>14){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>15){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>15){ \ + /* InjectKey(r=1) */ \ + X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \ + X3 += 4; /* XWCNT4-1 += r */ \ + } 
\ + \ + if(Nrounds>16){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>17){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>18){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ + if(Nrounds>19){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>19){ \ + /* InjectKey(r=1) */ \ + X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \ + X3 += 5; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>20){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>21){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>22){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>23){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>23){ \ + /* InjectKey(r=1) */ \ + X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \ + X3 += 6; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>24){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>25){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>26){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ + if(Nrounds>27){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>27){ \ + /* InjectKey(r=1) 
*/ \ + X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \ + X3 += 7; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>28){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>29){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>30){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>31){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>31){ \ + /* InjectKey(r=1) */ \ + X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \ + X3 += 8; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>32){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>33){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>34){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ + if(Nrounds>35){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>35){ \ + /* InjectKey(r=1) */ \ + X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \ + X3 += 9; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>36){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>37){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>38){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>39){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 
= RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>39){ \ + /* InjectKey(r=1) */ \ + X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \ + X3 += 10; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>40){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>41){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>42){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ + if(Nrounds>43){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>43){ \ + /* InjectKey(r=1) */ \ + X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \ + X3 += 11; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>44){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>45){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>46){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>47){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>47){ \ + /* InjectKey(r=1) */ \ + X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \ + X3 += 12; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>48){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>49){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>50){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ 
+ if(Nrounds>51){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>51){ \ + /* InjectKey(r=1) */ \ + X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \ + X3 += 13; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>52){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>53){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>54){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>55){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>55){ \ + /* InjectKey(r=1) */ \ + X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \ + X3 += 14; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>56){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>57){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>58){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ + if(Nrounds>59){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>59){ \ + /* InjectKey(r=1) */ \ + X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \ + X3 += 15; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>60){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>61){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>62){ \ + X0 += X1; X1 = 
RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>63){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>63){ \ + /* InjectKey(r=1) */ \ + X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \ + X3 += 16; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>64){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \ + } \ + if(Nrounds>65){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \ + } \ + if(Nrounds>66){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \ + } \ + if(Nrounds>67){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \ + } \ + if(Nrounds>67){ \ + /* InjectKey(r=1) */ \ + X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \ + X3 += 17; /* XWCNT4-1 += r */ \ + } \ + \ + if(Nrounds>68){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \ + } \ + if(Nrounds>69){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \ + } \ + if(Nrounds>70){ \ + X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \ + X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \ + } \ + if(Nrounds>71){ \ + X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \ + X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \ + } \ + if(Nrounds>71){ \ + /* InjectKey(r=1) */ \ + X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \ + X3 += 18; /* XWCNT4-1 += r */ \ + } \ + \ + threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \ + return ret; \ +} \ + \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE 
R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \ +} + +#if R123_USE_64BIT +_threefry2x_tpl(64) +_threefry4x_tpl(64) +#endif +_threefry2x_tpl(32) +_threefry4x_tpl(32) + +/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better + than a static inline function. Why? */ +#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k) +#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k) +#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k) +#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k) + +#if defined(__cplusplus) +#define _threefryNxWclass_tpl(NxW) \ +namespace r123{ \ +template \ + struct Threefry##NxW##_R{ \ + typedef threefry##NxW##_ctr_t ctr_type; \ + typedef threefry##NxW##_key_t key_type; \ + typedef threefry##NxW##_key_t ukey_type; \ + static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \ + R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \ + return threefry##NxW##_R(ROUNDS, ctr, key); \ + } \ +}; \ + typedef Threefry##NxW##_R Threefry##NxW; \ +} // namespace r123 + +_threefryNxWclass_tpl(2x32) +_threefryNxWclass_tpl(4x32) +#if R123_USE_64BIT +_threefryNxWclass_tpl(2x64) +_threefryNxWclass_tpl(4x64) +#endif + +/* The _tpl macros don't quite work to do string-pasting inside comments. + so we just write out the boilerplate documentation four times... */ + +/** +@defgroup ThreefryNxW Threefry Classes and Typedefs + +The ThreefryNxW classes export the member functions, typedefs and +operator overloads required by a @ref CBRNG "CBRNG" class. 
+ +As described in +Parallel Random Numbers: As Easy as 1, 2, 3, +the Threefry family is closely related to the Threefish block cipher from + Skein Hash Function. +Threefry is \b not suitable for cryptographic use. + +Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output. + +@class r123::Threefry2x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=13 or more for Threefry2x32. + +@typedef r123::Threefry2x32 +@ingroup ThreefryNxW + Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds, + Threefry2x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + +@class r123::Threefry2x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +In November 2011, the authors discovered that 13 rounds of +Threefry2x64 sequenced by strided, interleaved key and counter +increments failed a very long (longer than the default BigCrush +length) WeightDistrib test. At the same time, it was confirmed that +14 rounds pass much longer tests (up to 5x10^12 samples) of a +similar nature. The authors know of no statistical flaws with +ROUNDS=14 or more for Threefry2x64. + +@typedef r123::Threefry2x64 +@ingroup ThreefryNxW + Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds, + Threefry2x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. 
+ + + +@class r123::Threefry4x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x32. + +@typedef r123::Threefry4x32 +@ingroup ThreefryNxW + Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds, + Threefry4x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Threefry4x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x64. + +@typedef r123::Threefry4x64 +@ingroup ThreefryNxW + Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds, + Threefry4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif + +#endif diff --git a/external/panphasia_ho/uniform_rand_threefry4x64.c b/external/panphasia_ho/uniform_rand_threefry4x64.c new file mode 100644 index 0000000..0723164 --- /dev/null +++ b/external/panphasia_ho/uniform_rand_threefry4x64.c @@ -0,0 +1,1033 @@ +#include "threefry.h" +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#define NUMBER_THREEFRY_ROUNDS 20 + +#include +#include +#include +#include +#include +#include "PAN_FFTW3.h" +#include "panphasia_functions.h" + +threefry4x64_ctr_t arj_threefry4x64(size_t,threefry4x64_ctr_t,threefry4x64_key_t); + +threefry4x64_ctr_t inverse_arj_threefry4x64(size_t,threefry4x64_ctr_t,threefry4x64_key_t); + + +const threefry4x64_key_t key_constant = {{0x0, 0x0, 0x0, 0x0}};// Set key to zero for Panphasia field + + +threefry4x64_key_t panphasia_key; + +int panphasia_key_initialised = -1; + +extern const unsigned long long int p_order; +extern const int Nbasis; + +extern const int parent_info[]; +extern const int new_randoms[]; + +extern int verbose_warnings_only; + + + + + +// Global variables recording descriptor parameters ////////////// + +extern size_t descriptor_order; +extern size_t descriptor_base_level; +extern size_t descriptor_xorigin,descriptor_yorigin,descriptor_zorigin; +extern size_t descriptor_base_size; +extern size_t descriptor_kk_limit; +extern long long int descriptor_check_digit; +extern char descriptor_name[]; +extern char full_descriptor[]; + +extern size_t descriptor_read_in; + +// Record relative coordinates for a particular descriptor + +extern size_t rel_level; +extern size_t rel_origin_x, rel_origin_y, rel_origin_z; +extern size_t rel_coord_max; +//////////////////////////////////////////////////////////////////// + +PAN_REAL root_cell_parent_info[300]; // Able to store up to 10th order +size_t root_cell_initialised = 0; + + +void threefry4x64_test_(int verbose) +{ + +int pass = 0; + + /* Test three examples taken from file kat_vectors supplied with version 1.09 */ + + { threefry4x64_ctr_t ctr = {{0x0000000000000000 , 0x0000000000000000 , 0x0000000000000000 , 0x0000000000000000 } }; ; + threefry4x64_key_t key = {{0x0000000000000000 , 0x0000000000000000 , 0x0000000000000000 , 0x0000000000000000 } }; + threefry4x64_ctr_t result = {{0x09218ebde6c85537, 0x55941f5266d86105, 0x4bd25e16282434dc ,0xee29ec846bd2e40b } }; + 
threefry4x64_ctr_t rand; + + rand = threefry4x64_R(NUMBER_THREEFRY_ROUNDS, ctr, key); + if ( (rand.v[0]!=result.v[0]) || (rand.v[1]!=result.v[1]) || (rand.v[2]!=result.v[2]) || (rand.v[3]!=result.v[3])){ + printf("Serious error occured !!!!!!!!!! Random generator is not working correctly \n"); + printf("Random generated: %llx %llx %llx %llx\n",rand.v[0],rand.v[1],rand.v[2],rand.v[3]); + printf("Random expected: %llx %llx %llx %llx\n",result.v[0],result.v[1],result.v[2],result.v[3]); + //abort(); + } else pass++; + } + + + { threefry4x64_ctr_t ctr = {{0xffffffffffffffff , 0xffffffffffffffff , 0xffffffffffffffff , 0xffffffffffffffff } }; ; + threefry4x64_key_t key = {{0xffffffffffffffff , 0xffffffffffffffff , 0xffffffffffffffff , 0xffffffffffffffff } }; ; + threefry4x64_ctr_t result = {{0x29c24097942bba1b, 0x0371bbfb0f6f4e11, 0x3c231ffa33f83a1c ,0xcd29113fde32d168 } }; + threefry4x64_ctr_t rand; + + rand = threefry4x64_R(NUMBER_THREEFRY_ROUNDS, ctr, key); + + if ( (rand.v[0]!=result.v[0]) || (rand.v[1]!=result.v[1]) || (rand.v[2]!=result.v[2]) || (rand.v[3]!=result.v[3])){ + printf("Serious error occured !!!!!!!!!! 
Random generator is not working correctly \n"); + printf("Random generated: %llx %llx %llx %llx\n",rand.v[0],rand.v[1],rand.v[2],rand.v[3]); + printf("Random expected: %llx %llx %llx %llx\n",result.v[0],result.v[1],result.v[2],result.v[3]); + //abort(); + }else pass++; + } + + { threefry4x64_ctr_t ctr = {{0x243f6a8885a308d3 , 0x13198a2e03707344 , 0xa4093822299f31d0 , 0x082efa98ec4e6c89 } }; ; + threefry4x64_key_t key = {{0x452821e638d01377 , 0xbe5466cf34e90c6c , 0xbe5466cf34e90c6c , 0xc0ac29b7c97c50dd} }; ; + threefry4x64_ctr_t result = {{0xa7e8fde591651bd9, 0xbaafd0c30138319b , 0x84a5c1a729e685b9, 0x901d406ccebc1ba4} }; ; + threefry4x64_ctr_t rand; + + rand = threefry4x64_R(NUMBER_THREEFRY_ROUNDS, ctr, key); + + if ( (rand.v[0]!=result.v[0]) || (rand.v[1]!=result.v[1]) || (rand.v[2]!=result.v[2]) || (rand.v[3]!=result.v[3])){ + printf("Serious error occured !!!!!!!!!! Random generator is not working correctly \n"); + printf("Random generated: %llx %llx %llx %llx\n",rand.v[0],rand.v[1],rand.v[2],rand.v[3]); + printf("Random expected: %llx %llx %llx %llx\n",result.v[0],result.v[1],result.v[2],result.v[3]); + //abort(); + }else pass++; + } + + + + if ((verbose)&&(pass==3)){ + printf("***************************************************\n"); + printf("* Basic test of threefry4x64 generator successful *\n"); + printf("***************************************************\n"); + }; + + if (NUMBER_THREEFRY_ROUNDS != 20){ + for (int i=0; i<10;i++) printf("WARNING: ***************************************************\n"); + printf("WARNING: number of threefry4x64 rounds set to %d\n",NUMBER_THREEFRY_ROUNDS); + for (int i=0; i<10;i++) printf("WARNING: ***************************************************\n"); + }; + return; +} + + +void set_panphasia_key_(int verbose) +{ + panphasia_key = key_constant; + + verbose = 0; //ARJ + + if (verbose) printf("Setting the threefry4x64 key to\n(%0llx %0llx %0llx %0llx)\n\n", + 
panphasia_key.v[0],panphasia_key.v[1],panphasia_key.v[2],panphasia_key.v[3]); + panphasia_key_initialised = 999; + + size_t level,j1,j2,j3; + PAN_REAL unif_randoms[8*Nbasis]; + PAN_REAL gauss_randoms[8*Nbasis]; + PAN_REAL legendre_randoms[8*Nbasis]; + PAN_REAL parent[Nbasis]; // Should not be used // + + + // Select special pair of values to turn on function in return_uniform_pseudo_rands_threefry4x64_ + size_t seed_value = 1000000000999; + size_t allow_non_zero_seed_saftey_catch=1002003004005006007; + + + for (int i=0; i63)){ + printf("Level %llu is out of range (0-63)!\n",l); // Not part of Panphasia + abort(); + }; + + if ((j1>>l!=0)||(j2>>l!=0)||(j3>>l!=0)){ // Cell outside of Panphasia + printf("Level %llu: Cell coordinate out of range (%llu,%llu,%llu)\n",l,j1,j2,j3); + abort(); + }; + + // Only allow a non-zero value for the seed if the safety catch has a specific value + + if (allow_non_zero_seed_saftey_catch != 1002003004005006007){ + seed_value = 0; + }; + + size_t root_cell_calculation=0; + //============================================================================= + // Exception - for computing the parent properties of the root cell only + //============================================================================= + if ((allow_non_zero_seed_saftey_catch == 1002003004005006007)&&(seed_value == 1000000000999)){ + l=0; + j0 = (p_order<<60); + j1 = 2; + j2 = 2; + j3 = 2; + if (p_order>8){printf("Multipole order too high\n");abort();}; + seed_value = 0; + root_cell_calculation = 1; //Signal root cell properties are being calculated + }; + //=================================================== + + + if (seed_value>>32!=0){ + printf("Seed value %llu, outside range 0 <= seed <2^32 \n",seed_value); + abort(); + }; + + // END ERROR CHECKING // + + + int nloop = Nbasis; // Generate eight uniform randoms per call of Threefry4x64 // + + + size_t k0,k1,k2,k3; + + j0 = (p_order<<60) + ((l<<56)>>4) + ((seed_value<<32)>>12); + + k0 = j0; + k1 = j1; + k2 = j2; + k3 
= j3; + + if ((root_cell_calculation)&&(verbose_warnings_only!=1)){ + printf("============================================================================================\n"); + printf("Computing root cell properties\n"); + printf("p_order, l, seed_value: (j0,j1,j2,j3),%llx %llx %llx (%llx,%llx,%llx,%llx)\n", + p_order,l,seed_value,j0,j1,j2,j3); + printf("Encoded root cell values:(k0,k1,k2,k3):\n (%llx,%llx,%llx,%llx)\n",k0,k1,k2,k3); + printf("============================================================================================\n"); + }; + + + + ctr.v[0] = k0; + ctr.v[1] = k1; + ctr.v[2] = k2; + ctr.v[3] = k3; + + + ncount = 0; + + + + for(i=0; i>32; + out_int[1] = (rand.v[0]<<32)>>32; + + out_int[2] = rand.v[1]>>32; + out_int[3] = (rand.v[1]<<32)>>32; + + out_int[4] = rand.v[2]>>32; + out_int[5] = (rand.v[2]<<32)>>32; + + out_int[6] = rand.v[3]>>32; + out_int[7] = (rand.v[3]<<32)>>32; + + + + for (j=0; j<8;++j) unif_real[ncount++] = ( ((double)out_int[j] + g_shift)*g_scale); + + }; + + + + for (i=0; i<8*Nbasis; i++) panphasia_randoms[i] = unif_real[i]; + + + // Exceptional branch with the aim ultimately of filling the Gaussian tail. + // Executed rarely so does not need to be particularly efficient. + // For this reason it include an error check. Can the value + // that triggered this loop be reproduced? If it cannot, the code aborts. 
+ + size_t branch_value = 4096; + PAN_REAL branching_ratio = ( ((double)branch_value)*g_scale); + + //PAN_REAL branching_ratio = -0.3; + + for (size_t i=0; i<8*Nbasis; i+=2) if (panphasia_randoms[i]>4) + ((seed_value<<32)>>12); + //code_cell(j0,j1,j2,j3,&k0,&k1,&k2,&k3); + + k0 = j0; + k1 = j1; + k2 = j2; + k3 = j3; + + ctr.v[0] = k0+iind; + ctr.v[1] = k1; + ctr.v[2] = k2; + ctr.v[3] = k3; + + // ctr.v[0] = k0+iind*increment; + // ctr.v[1] = k1+iind*increment; + //ctr.v[2] = k2+iind*increment; + //ctr.v[3] = k3+iind*increment; + + rand = threefry4x64_R(NUMBER_THREEFRY_ROUNDS ,ctr, panphasia_key); + + out_int[0] = rand.v[0]>>32; + out_int[1] = (rand.v[0]<<32)>>32; + + out_int[2] = rand.v[1]>>32; + out_int[3] = (rand.v[1]<<32)>>32; + + out_int[4] = rand.v[2]>>32; + out_int[5] = (rand.v[2]<<32)>>32; + + out_int[6] = rand.v[3]>>32; + out_int[7] = (rand.v[3]<<32)>>32; + + new_value = ( ((double)out_int[jind] + g_shift)*g_scale); + + + if (loop==0){ + if (new_value != panphasia_randoms[i]){ + printf("Failure to reproduce the initial random that triggered this branch - a serious error!\n"); + abort(); + }}else{ + + if (new_value>=branching_ratio){ + + replacement_value *= new_value; + }else{ + + size_t counter=0; + + while ((new_value + +long long int compute_check_digit_(){ + + char str[200]; + long long int check_digit; + + threefry4x64_ctr_t ctr, rand; + threefry4x64_key_t key; + + if (descriptor_read_in==0){ + printf("No descriptor has been set\n"); + abort(); + }; + + + sprintf(str, "%llu%llu%llu%llu%llu%llu%llu%s",descriptor_order,descriptor_base_level, + descriptor_xorigin,descriptor_yorigin,descriptor_zorigin, + descriptor_base_size,descriptor_kk_limit,descriptor_name); + + key = key_constant; + + ctr.v[0] = 0; + ctr.v[1] = 0; + ctr.v[2] = 0; + ctr.v[3] = 0; + + + for (int i =0; i>32); + + return(check_digit); +}; + + +////////////////////////////////////////////////////////////////////////////// +// Construct pairs of overlapping random descriptors for testing 
+// moments of a cell and its child eight child cells are +// essentially identical. +////////////////////////////////////////////////////////////////////////////// + + +void test_propogation_of_moments_(int iterations) +{ + + + + const int level_max=62; + + + threefry4x64_ctr_t ctr, rand; + threefry4x64_key_t key; + + + key = key_constant; + + ctr.v[0] = 0; + ctr.v[1] = 0; + ctr.v[2] = 0; + ctr.v[3] = 0; + + int levplus=1; + if (iterations==0){ + iterations = 1; + levplus = 5; + + }; + + for(int it=0; it0){ + level_cell = level_desc1 + ctr.v[0]%(level_max-level_desc1); + } + else{ + level_cell = level_desc1; + }; + + if (level_cell-level_desc1>0){ + level_desc2 = level_desc1 + ctr.v[1]%(level_cell-level_desc1); + + }else{ + level_desc2 = level_desc1; + }; + + + size_t side_length2 = (size_t)1<>(64-level_cell); + size_t ycell = ctr.v[1]>>(64-level_cell); + size_t zcell = ctr.v[2]>>(64-level_cell); + + size_t cell_level_size = (size_t)1<>(level_cell-level_desc1))-dx1+side_length1)%side_length1; + size_t desc1_y = ((ycell>>(level_cell-level_desc1))-dy1+side_length1)%side_length1; + size_t desc1_z = ((zcell>>(level_cell-level_desc1))-dz1+side_length1)%side_length1; + + size_t desc2_x = ((xcell>>(level_cell-level_desc2))-dx2+side_length2)%side_length2; + size_t desc2_y = ((ycell>>(level_cell-level_desc2))-dy2+side_length2)%side_length2; + size_t desc2_z = ((zcell>>(level_cell-level_desc2))-dz2+side_length2)%side_length2; + + + +char descriptor1[300]; +char descriptor2[300]; + + sprintf(descriptor1,"[Panph%ld,L%ld,(%llu,%llu,%llu),S%llu,CH-999,test]", + p_order,level_desc1,desc1_x,desc1_y,desc1_z,desc1_s); + + sprintf(descriptor2,"[Panph%ld,L%ld,(%llu,%llu,%llu),S%llu,CH-999,test]", + p_order,level_desc2,desc2_x,desc2_y,desc2_z,desc2_s); + + //printf("Descriptor 1: %s\nDescriptor 2: %s\n",descriptor1,descriptor2); + + + rand = threefry4x64(ctr, key); ctr = rand; + + + size_t rel_level1 = level_cell - level_desc1; + size_t rel_level2 = level_cell - level_desc2; + + 
size_t xstart1 = (xcell-(desc1_x<max_diff2) max_diff2=diff2; + rms_diff2+=diff2; + }; + + // printf("%s\n%s\n",descriptor1,descriptor2); + + //printf("Example coeff %18.12lf %18.12lf \n",coefficients1[0],coefficients2[0]); + + rms_diff2/=(double)Nbasis; + + + if ((sizeof(PAN_REAL)==4)||(sizeof(FFTW_REAL)==4)){ + + if ((max_diff2>1.e-12)||(rms_diff2>1.e-12)){ + printf("Moments not accurately recovered at single precision\n"); abort(); + }; + + }else{ + + if ((max_diff2>1.e-24)||(rms_diff2>1.e-24)){ + printf("Moments not accurately recovered at double precision\n"); abort(); + }; + + }; + + // printf("lev %d Acceptable differences: %e RMS difference %e\n",lev,sqrt(max_diff2),sqrt(rms_diff2)); + + }; + + // printf("Test of descriptors/relative coordinates and moments PASSED.\n"); + }; +}; + + +///////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////// +// Alternative Threefry4x64 generator and inverse function - for testing +// purposes only +///////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////// + + +void inverse_threefry4x64_test_(int verbose) +{ + +threefry4x64_ctr_t ctr = {{0x243f6a8885a308d3 , 0x13198a2e03707344 , 0xa4093822299f31d0 , 0x082efa98ec4e6c89 } }; ; +threefry4x64_key_t key = {{0x452821e638d01377 , 0xbe5466cf34e90c6c , 0xbe5466cf34e90c6c , 0xc0ac29b7c97c50dd} }; ; +threefry4x64_ctr_t rand1,rand2; + + for (size_t ROUNDS = 0; ROUNDS<21; ROUNDS++){ + + rand1 = threefry4x64_R(ROUNDS, ctr, key); + + rand2 = arj_threefry4x64(ROUNDS, ctr, key); + + if( (rand1.v[0]!=rand2.v[0])||(rand1.v[1]!=rand2.v[1])||(rand1.v[2]!=rand2.v[2])||(rand1.v[3]!=rand2.v[3])){ + printf("Error in arj_threefry4x64 - failing to reproduce Threefry4x64 generator!!!\n"); + abort(); + }; + + rand2 = inverse_arj_threefry4x64(ROUNDS, rand1, key); + + if( 
(ctr.v[0]!=rand2.v[0])||(ctr.v[1]!=rand2.v[1])||(ctr.v[2]!=rand2.v[2])||(ctr.v[3]!=rand2.v[3])){ + printf("Error in arj_threefry4x64 - failing to reproduce INVERSE Threefry4x64 generator!!!\n"); + abort(); + }; + + + + }; + + return; +} + + +threefry4x64_ctr_t arj_threefry4x64(size_t R,threefry4x64_ctr_t ctr, + threefry4x64_key_t key){ + size_t x0 = ctr.v[0]; size_t x1 = ctr.v[1]; size_t x2 = ctr.v[2]; size_t x3 = ctr.v[3]; + size_t k0 = key.v[0]; size_t k1 = key.v[1]; size_t k2 = key.v[2]; size_t k3 = key.v[3]; + size_t k4 = 0x1bd11bdaa9fc1a22; +//--------------------------------------- + + + if (R>20) abort(); + +k4^=k0; k4^=k1; k4^=k2; k4^=k3; +x0+=k0;x1+=k1;x2+=k2;x3+=k3; + + +if (R>0){ + x0+=x1; x1 = (x1<<14)|(x1>>50); x1^=x0; + x2+=x3; x3 = (x3<<16)|(x3>>48); x3^=x2; + }; +if (R>1){ + x0+=x3; x3 = (x3<<52)|(x3>>12); x3^=x0; + x2+=x1; x1 = (x1<<57)|(x1>>7); x1^=x2; +}; + +if (R>2){ + x0+=x1; x1 = (x1<<23)|(x1>>41); x1^=x0; + x2+=x3; x3 = (x3<<40)|(x3>>24); x3^=x2; +}; + +if (R>3){ + x0+=x3; x3 = (x3<<5)|(x3>>59); x3^=x0; + x2+=x1; x1 = (x1<<37)|(x1>>27); x1^=x2; + //Inject key 1 + x0+=k1; x1+=k2; x2+=k3; x3+=k4; x3+=1; +}; + +if (R>4){ + x0+=x1; x1 = (x1<<25)|(x1>>39); x1^=x0; + x2+=x3; x3 = (x3<<33)|(x3>>31); x3^=x2; +}; + +if (R>5){ + x0+=x3; x3 = (x3<<46)|(x3>>18); x3^=x0; + x2+=x1; x1 = (x1<<12)|(x1>>52); x1^=x2; +}; + +if (R>6){ + x0+=x1; x1 = (x1<<58)|(x1>> 6); x1^=x0; + x2+=x3; x3 = (x3<<22)|(x3>>42); x3^=x2; +}; + +if (R>7){ + x0+=x3; x3 = (x3<<32)|(x3>>32); x3^=x0; + x2+=x1; x1 = (x1<<32)|(x1>>32); x1^=x2; + //Inject key 2 + x0+=k2; x1+=k3; x2+=k4; x3+=k0; x3+=2; +}; + +if (R>8){ + x0+=x1; x1 = (x1<<14)|(x1>>50); x1^=x0; + x2+=x3; x3 = (x3<<16)|(x3>>48); x3^=x2; +}; + +if (R>9){ + x0+=x3; x3 = (x3<<52)|(x3>>12); x3^=x0; + x2+=x1; x1 = (x1<<57)|(x1>>7); x1^=x2; +}; + +if (R>10){ + x0+=x1; x1 = (x1<<23)|(x1>>41); x1^=x0; + x2+=x3; x3 = (x3<<40)|(x3>>24); x3^=x2; +}; + +if (R>11){ + x0+=x3; x3 = (x3<<5)|(x3>>59); x3^=x0; + x2+=x1; x1 = (x1<<37)|(x1>>27); 
x1^=x2; + //Inject key 3 + x0+=k3; x1+=k4; x2+=k0; x3+=k1; x3+=3; +}; + +if (R>12){ + x0+=x1; x1 = (x1<<25)|(x1>>39); x1^=x0; + x2+=x3; x3 = (x3<<33)|(x3>>31); x3^=x2; +}; + +if (R>13){ + x0+=x3; x3 = (x3<<46)|(x3>>18); x3^=x0; + x2+=x1; x1 = (x1<<12)|(x1>>52); x1^=x2; +}; + +if (R>14){ + x0+=x1; x1 = (x1<<58)|(x1>> 6); x1^=x0; + x2+=x3; x3 = (x3<<22)|(x3>>42); x3^=x2; +}; + +if (R>15){ + x0+=x3; x3 = (x3<<32)|(x3>>32); x3^=x0; + x2+=x1; x1 = (x1<<32)|(x1>>32); x1^=x2; + //Inject key 4 + x0+=k4; x1+=k0; x2+=k1; x3+=k2; x3+=4; +}; + +if (R>16){ + x0+=x1; x1 = (x1<<14)|(x1>>50); x1^=x0; + x2+=x3; x3 = (x3<<16)|(x3>>48); x3^=x2; +}; + +if (R>17){ + x0+=x3; x3 = (x3<<52)|(x3>>12); x3^=x0; + x2+=x1; x1 = (x1<<57)|(x1>>7); x1^=x2; +}; +if (R>18){ + x0+=x1; x1 = (x1<<23)|(x1>>41); x1^=x0; + x2+=x3; x3 = (x3<<40)|(x3>>24); x3^=x2; +}; +if (R>19){ + x0+=x3; x3 = (x3<<5)|(x3>>59); x3^=x0; + x2+=x1; x1 = (x1<<37)|(x1>>27); x1^=x2; + //Inject key 5 + x0+=k0; x1+=k1; x2+=k2; x3+=k3; x3+=5; +}; +//--------------------------------------- + threefry4x64_ctr_t result = {{x0,x1,x2,x3}}; + return(result); + +}; + +threefry4x64_ctr_t inverse_arj_threefry4x64(size_t R,threefry4x64_ctr_t ctr, + threefry4x64_key_t key){ + + size_t x0 = ctr.v[0]; size_t x1 = ctr.v[1]; size_t x2 = ctr.v[2]; size_t x3 = ctr.v[3]; + size_t k0 = key.v[0]; size_t k1 = key.v[1]; size_t k2 = key.v[2]; size_t k3 = key.v[3]; + size_t k4 = 0x1bd11bdaa9fc1a22; +//--------------------------------------- + +if (R>20) abort(); + + k4^=k0; k4^=k1; k4^=k2; k4^=k3; + +if (R>19){ + //Anti-inject key 5 + x0-=k0; x1-=k1; x2-=k2; x3-=k3; x3-=5; + x3^=x0; x3 = (x3<<59)|(x3>>5); x0-=x3; + x1^=x2; x1 = (x1<<27)|(x1>>37); x2-=x1; +}; + +if (R>18){ + x3^=x2; x3 = (x3<<24)|(x3>>40); x2-=x3; + x1^=x0; x1 = (x1<<41)|(x1>>23); x0-=x1; +}; + +if (R>17){ + x3^=x0; x3 = (x3<<12)|(x3>>52); x0-=x3; + x1^=x2; x1 = (x1<< 7)|(x1>>57); x2-=x1; +}; + +if (R>16){ + x3^=x2; x3 = (x3<<48)|(x3>>16); x2-=x3; + x1^=x0; x1 = (x1<<50)|(x1>>14); x0-=x1; 
+}; + +if (R>15){ + //Anti-inject key 4 + x0-=k4; x1-=k0; x2-=k1; x3-=k2; x3-=4; + x3^=x0; x3 = (x3<<32)|(x3>>32); x0-=x3; + x1^=x2; x1 = (x1<<32)|(x1>>32); x2-=x1; +}; + +if (R>14){ + x3^=x2; x3 = (x3<<42)|(x3>>22); x2-=x3; + x1^=x0; x1 = (x1<< 6)|(x1>>58); x0-=x1; +}; + +if (R>13){ + x3^=x0; x3 = (x3<<18)|(x3>>46); x0-=x3; + x1^=x2; x1 = (x1<<52)|(x1>>12); x2-=x1; +}; + +if (R>12){ + x3^=x2; x3 = (x3<<31)|(x3>>33); x2-=x3; + x1^=x0; x1 = (x1<<39)|(x1>>25); x0-=x1; +}; + +if (R>11){ + //Anti-inject key 3 + x0-=k3; x1-=k4; x2-=k0; x3-=k1; x3-=3; + x3^=x0; x3 = (x3<<59)|(x3>>5); x0-=x3; + x1^=x2; x1 = (x1<<27)|(x1>>37); x2-=x1; + +}; + +if (R>10){ + x3^=x2; x3 = (x3<<24)|(x3>>40); x2-=x3; + x1^=x0; x1 = (x1<<41)|(x1>>23); x0-=x1; +}; + +if (R>9){ + x3^=x0; x3 = (x3<<12)|(x3>>52); x0-=x3; + x1^=x2; x1 = (x1<< 7)|(x1>>57); x2-=x1; +}; + +if (R>8){ + x3^=x2; x3 = (x3<<48)|(x3>>16); x2-=x3; + x1^=x0; x1 = (x1<<50)|(x1>>14); x0-=x1; +}; + +if (R>7){ + //Anti-inject key 2 + x0-=k2; x1-=k3; x2-=k4; x3-=k0; x3-=2; + x3^=x0; x3 = (x3<<32)|(x3>>32); x0-=x3; + x1^=x2; x1 = (x1<<32)|(x1>>32); x2-=x1; + }; + +if (R>6){ + x3^=x2; x3 = (x3<<42)|(x3>>22); x2-=x3; + x1^=x0; x1 = (x1<< 6)|(x1>>58); x0-=x1; +}; + +if (R>5){ + x3^=x0; x3 = (x3<<18)|(x3>>46); x0-=x3; + x1^=x2; x1 = (x1<<52)|(x1>>12); x2-=x1; +}; + +if (R>4){ + x3^=x2; x3 = (x3<<31)|(x3>>33); x2-=x3; + x1^=x0; x1 = (x1<<39)|(x1>>25); x0-=x1; +}; + + if (R>3){ + //Anti-inject key 1 + x0-=k1; x1-=k2; x2-=k3; x3-=k4; x3-=1; + x3^=x0; x3 = (x3<<59)|(x3>>5); x0-=x3; + x1^=x2; x1 = (x1<<27)|(x1>>37); x2-=x1; +}; + +if (R>2){ + x3^=x2; x3 = (x3<<24)|(x3>>40); x2-=x3; + x1^=x0; x1 = (x1<<41)|(x1>>23); x0-=x1; +}; + + +if (R>1){ + x3^=x0; x3 = (x3<<12)|(x3>>52); x0-=x3; + x1^=x2; x1 = (x1<< 7)|(x1>>57); x2-=x1; +}; + +if (R>0){ + x3^=x2; x3 = (x3<<48)|(x3>>16); x2-=x3; + x1^=x0; x1 = (x1<<50)|(x1>>14); x0-=x1; + }; + + // Anti-start + x0-=k0; x1-=k1; x2-=k2; x3-=k3; + + +//--------------------------------------- + 
threefry4x64_ctr_t result = {{x0,x1,x2,x3}}; + return(result); +}; + + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// From f7c9d606f7e538803ba05bd842307740b8524b51 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Sun, 2 May 2021 22:52:21 +0200 Subject: [PATCH 02/25] modified to have a minimally compilable PANPHASIA_HO example inside monofonic --- CMakeLists.txt | 17 + example.conf | 8 +- .../high_order_panphasia_routines.c | 2282 +++++++++-------- external/panphasia_ho/main.c | 135 +- src/plugins/random_panphasia_ho.cc | 83 + 5 files changed, 1334 insertions(+), 1191 deletions(-) create mode 100644 src/plugins/random_panphasia_ho.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 02dd3dc..7b6b30f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,6 +149,9 @@ elseif("${CMAKE_Fortran_COMPILER_ID}" MATCHES "GNU") endif() endif(ENABLE_PANPHASIA) ######################################################################################################################## +# PANPHASIA HO (High-Order, new version) +option(ENABLE_PANPHASIA_HO "Enable PANPHASIA-HO random number generator" ON) +######################################################################################################################## # INCLUDES include_directories(${PROJECT_SOURCE_DIR}/include) @@ -173,6 +176,16 @@ list (APPEND SOURCES ) endif() +if(ENABLE_PANPHASIA_HO) +list (APPEND SOURCES + ${PROJECT_SOURCE_DIR}/external/panphasia_ho/main.c + ${PROJECT_SOURCE_DIR}/external/panphasia_ho/high_order_panphasia_routines.c + ${PROJECT_SOURCE_DIR}/external/panphasia_ho/pan_mpi_routines.c + ${PROJECT_SOURCE_DIR}/external/panphasia_ho/uniform_rand_threefry4x64.c +) +# target_include_directories(${PRGNAME} PRIVATE ${PROJECT_SOURCE_DIR}/external/panphasia_ho) +endif() + # project configuration header configure_file( 
${PROJECT_SOURCE_DIR}/include/cmake_config.hh.in @@ -246,6 +259,10 @@ if(ENABLE_PANPHASIA) target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA") endif(ENABLE_PANPHASIA) +if(ENABLE_PANPHASIA_HO) + target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA_HO") +endif(ENABLE_PANPHASIA_HO) + if(ENABLE_PLT) target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_PLT") endif(ENABLE_PLT) diff --git a/example.conf b/example.conf index 92f438e..00abeeb 100644 --- a/example.conf +++ b/example.conf @@ -89,15 +89,15 @@ ztarget = 2.5 # target redshift for CLASS module, output at ##> NGenIC compatible random number generator module compatible with V. Springel's original code ## (https://www.h-its.org/2014/11/05/ngenic-code/) as well as the 2LPT code by Pueblas&Scoccmiarro ## (https://cosmo.nyu.edu/roman/2LPT/) -generator = NGENIC -seed = 12345 +# generator = NGENIC +# seed = 12345 ##> The PANPHASIA generator uses a plugin based on original code by A. Jenkins ## Warning: Before using this module, please make sure you read and agree to the distinct license ## requirements by registering on the website http://icc.dur.ac.uk/Panphasia.php -# generator = PANPHASIA -# descriptor = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL] +generator = PANPHASIA_HO +descriptor = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL] # PanphasiaMinRootResolution = 512 # requires the white noise reallisation to be made at least at that resolution (default is 512) ##> The MUSIC1 multi-scale random number generator is provided for convenience diff --git a/external/panphasia_ho/high_order_panphasia_routines.c b/external/panphasia_ho/high_order_panphasia_routines.c index 1e17816..d1d369b 100644 --- a/external/panphasia_ho/high_order_panphasia_routines.c +++ b/external/panphasia_ho/high_order_panphasia_routines.c @@ -13,7 +13,7 @@ #include #endif -int verbose_warnings_only=0; +int verbose_warnings_only = 0; static int start_panph_method = 0; static int panphasia_rel_origin_set = 0; @@ -21,14 +21,14 @@ 
static int panphasia_rel_origin_set = 0; size_t descriptor_order; size_t descriptor_base_level; -size_t descriptor_xorigin,descriptor_yorigin,descriptor_zorigin; +size_t descriptor_xorigin, descriptor_yorigin, descriptor_zorigin; size_t descriptor_base_size; size_t descriptor_kk_limit; -long long int descriptor_check_digit; +long long int descriptor_check_digit; char descriptor_name[100]; char full_descriptor[300]; -size_t descriptor_read_in; +size_t descriptor_read_in; // Record relative coordinates for a particular descriptor @@ -36,7 +36,6 @@ size_t rel_level; size_t rel_origin_x, rel_origin_y, rel_origin_z; size_t rel_coord_max; - ///////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// @@ -71,73 +70,69 @@ size_t rel_coord_max; #include "pan_matrices_order6.h" #endif - - - - -void solve_panphasia_cell_(PAN_REAL * input_vec_parent, PAN_REAL *input_vec_children, PAN_REAL *output_vec_children, int control_flag) +void solve_panphasia_cell_(PAN_REAL *input_vec_parent, PAN_REAL *input_vec_children, PAN_REAL *output_vec_children, int control_flag) { - int iparity, iconst, irow, l,i,j; + int iparity, iconst, irow, l, i, j; PAN_REAL element; PAN_REAL const norm = sqrt(0.125); PAN_REAL parent_constraint[Nbasis]; //__assume_aligned(&parent_constraint, 64); - PAN_REAL proj_constraint[Nbasis]; //__assume_aligned(&proj_constraint, 64); - PAN_REAL work_vec1[8*Nbasis]; //__assume_aligned(&work_vec1, 64); - PAN_REAL work_vec2[8*Nbasis]; //__assume_aligned(&work_vec2, 64); + PAN_REAL proj_constraint[Nbasis]; //__assume_aligned(&proj_constraint, 64); + PAN_REAL work_vec1[8 * Nbasis]; //__assume_aligned(&work_vec1, 64); + PAN_REAL work_vec2[8 * Nbasis]; //__assume_aligned(&work_vec2, 64); + //=========================================================================== + // Copy inputs and rearrange parent constraints in 
parity order and set proj_constraint to zero -//=========================================================================== -// Copy inputs and rearrange parent constraints in parity order and set proj_constraint to zero + for (i = 0; i < 8 * Nbasis; i++) + work_vec1[i] = input_vec_children[i]; - for (i=0; i<8*Nbasis; i++) work_vec1[i]=input_vec_children[i]; - - for (i=0; i17) { //ARJ for testing purposes only - // for(int i=0; i<8*Nbasis;i++)gauss_rand_children[i]=0; //ARJ for testing purposes only - // for(int i=56; i17) { //ARJ for testing purposes only + // for(int i=0; i<8*Nbasis;i++)gauss_rand_children[i]=0; //ARJ for testing purposes only + // for(int i=56; i +int demo_descriptor_() +{ - -int demo_descriptor_(){ - - - char str[200] = "[Panph6,L11,(2043,2045,2046),S5,CH-999,Testing_only]"; // xyz - + char str[200] = "[Panph6,L11,(2043,2045,2046),S5,CH-999,Testing_only]"; // xyz //char str[200] = "[Panph6,L3,(2,3,4),S8,CH-999,Testing_only]"; // xyz //char str[200] = "[Panph6,L3,(4,2,3),S8,CH-999,Testing_only]"; // xyz //char str[200] = "[Panph6,L3,(3,4,5),S8,CH-999,Testing_only]"; // xyz - // char str[200] = "[Panph6,L56,(0,0,31),S5,CH-999,Testing_only]"; - // char str[200] = "[Panph6,L21,(1136930,890765,1847934),S3,CH2414478110,Auriga_volume2]"; + // char str[200] = "[Panph6,L56,(0,0,31),S5,CH-999,Testing_only]"; + // char str[200] = "[Panph6,L21,(1136930,890765,1847934),S3,CH2414478110,Auriga_volume2]"; // char str[200] = "[Panph6,L21,(1136930,890765,1847934),S3,CH-999,Auriga_volume2]"; - char copy[200]; - const char s[20] = "[,L,(),S,CH,]"; - char *token; + char copy[200]; + const char s[20] = "[,L,(),S,CH,]"; + char *token; + size_t desc_level, desc_x, desc_y, desc_z, desc_size; + long long int desc_ch; + char desc_name[100]; + char desc_iden[8]; + int error_code; - size_t desc_level, desc_x, desc_y, desc_z,desc_size; - long long int desc_ch; - char desc_name[100]; - char desc_iden[8]; - int error_code; + descriptor_read_in = 0; - descriptor_read_in = 0; + 
if (error_code = parse_and_validate_descriptor_(str)) + { - if (error_code = parse_and_validate_descriptor_(str)){ + printf("Invalid descriptor %s\n", str); + printf("Descriptor error code %d\n", error_code); + } + else + { + printf("Valid descriptor parsed %s\n", str); + }; - printf("Invalid descriptor %s\n",str); - printf("Descriptor error code %d\n",error_code); - } else { - printf("Valid descriptor parsed %s\n",str); - }; + if (descriptor_read_in) + { + printf("-----------------------------------------\n"); + printf("Descriptor order: %llu\n", descriptor_order); + printf("Descriptor base level: %llu\n", descriptor_base_level); + printf("Descriptor x-origin: %llu\n", descriptor_xorigin); + printf("Descriptor y-origin: %llu\n", descriptor_yorigin); + printf("Descriptor z-origin: %llu\n", descriptor_zorigin); + printf("Descriptor base size: %llu\n", descriptor_base_size); + printf("Descriptor check digit:%lld\n", descriptor_check_digit); + printf("Descriptor name %s\n", descriptor_name); + printf("-----------------------------------------\n"); + printf("Descriptor %s\n", full_descriptor); + printf("-----------------------------------------\n"); - if (descriptor_read_in){ - printf("-----------------------------------------\n"); - printf("Descriptor order: %llu\n",descriptor_order); - printf("Descriptor base level: %llu\n",descriptor_base_level); - printf("Descriptor x-origin: %llu\n",descriptor_xorigin); - printf("Descriptor y-origin: %llu\n",descriptor_yorigin); - printf("Descriptor z-origin: %llu\n",descriptor_zorigin); - printf("Descriptor base size: %llu\n",descriptor_base_size); - printf("Descriptor check digit:%lld\n",descriptor_check_digit); - printf("Descriptor name %s\n", descriptor_name); - printf("-----------------------------------------\n"); - printf("Descriptor %s\n",full_descriptor); - printf("-----------------------------------------\n"); + printf("Check digit %lld\n", compute_check_digit_()); + }; - printf("Check digit 
%lld\n",compute_check_digit_()); - }; + int verbose = 0; + int flag_output_mode = 0; + PANPHASIA_init_descriptor_(str, &verbose); - int verbose=0; - int flag_output_mode=0; - PANPHASIA_init_descriptor_(str,&verbose); + size_t rel_lev = 3; - size_t rel_lev = 3; + size_t rel_orig_x = 33; //xyz + size_t rel_orig_y = 17; + size_t rel_orig_z = 9; - size_t rel_orig_x = 33; //xyz - size_t rel_orig_y = 17; - size_t rel_orig_z = 9; + //size_t rel_orig_x = 9; //zxy + //size_t rel_orig_y = 33; + //size_t rel_orig_z = 17; - //size_t rel_orig_x = 9; //zxy - //size_t rel_orig_y = 33; - //size_t rel_orig_z = 17; + // size_t rel_orig_x = 0; + // size_t rel_orig_y = 0; + // size_t rel_orig_z = 0; + verbose = 0; - // size_t rel_orig_x = 0; - // size_t rel_orig_y = 0; - // size_t rel_orig_z = 0; - - verbose = 0; + if (error_code = PANPHASIA_init_level_(&rel_lev, + &rel_orig_x, &rel_orig_y, &rel_orig_z, &verbose)) + { + printf("Error %d in initialing PANPHASIA_init_level_\n", + error_code); + return (error_code); + }; - if (error_code = PANPHASIA_init_level_(&rel_lev, - &rel_orig_x,&rel_orig_y,&rel_orig_z,&verbose)){ - printf("Error %d in initialing PANPHASIA_init_level_\n", - error_code); - return(error_code); - }; + size_t xstart = 3, ystart = 5, zstart = 4; + size_t xextent = 27, yextent = 29, zextent = 40; // xyz + // size_t xstart = 4, ystart = 3, zstart = 5; + // size_t xextent = 40, yextent = 27, zextent=29; - size_t xstart = 3, ystart = 5, zstart = 4; - size_t xextent = 27, yextent = 29, zextent=40; // xyz + // size_t xstart = 0, ystart = 0, zstart = 0; + // size_t xextent = 4, yextent = 4, zextent=4; + size_t copy_list[Nbasis]; + size_t ncopy = 28; - // size_t xstart = 4, ystart = 3, zstart = 5; - // size_t xextent = 40, yextent = 27, zextent=29; + PAN_REAL *output_values = malloc(sizeof(PAN_REAL) * ncopy * xextent * yextent * zextent); + if (output_values == NULL) + { + printf("Unable to allocate output_values \n"); + abort(); + }; - // size_t xstart = 0, ystart = 0, zstart 
= 0; - // size_t xextent = 4, yextent = 4, zextent=4; + for (int i = 0; i < Nbasis / 3; i++) + copy_list[i] = 3 * i; - size_t copy_list[Nbasis]; - size_t ncopy=28; + if (error_code = PANPHASIA_compute_coefficients_(&xstart, &ystart, &zstart, + &xextent, &yextent, &zextent, copy_list, &ncopy, + output_values, &flag_output_mode, &verbose)) + { - PAN_REAL *output_values = malloc(sizeof(PAN_REAL)*ncopy*xextent*yextent*zextent); - if (output_values==NULL){ - printf("Unable to allocate output_values \n"); - abort(); - }; + printf("Error %d in PANPHASIA_compute_coefficients \n", error_code); + return (error_code); + }; + if (xextent * yextent * zextent < 2097153) + { + FILE *file = fopen("Panphasia_sample.tex", "w"); - for (int i=0; i0 - Do N iterations of the test with 1. + // In May 2020 ran with N=8000 - all tested passed. + // This provides a good test that the doubly periodic + // boundaries (of Panphasia itself, and the region + // covered by the descriptor) are working correctly. + test_propogation_of_moments_(0); - // Complex test - testing two completely different aspects at the same time. - // The routine generates a series of random cells in the Panphasia - // field. For each random cell it then creates two random descriptors - // for this one cell, and computes the relative coordinates, and - // and relative levels for the cell. For one descriptor the cell - // itself is chosen, while for the second descriptor the children - // of the cell as chosen. The test is to compute the Nbasis - // Legendre coefficients for the random cell by directly evaluating - // the Panphasia field and using Gaussian quadrature to evaluate the - // integrals. The test is passed if the cell and its combined eight - // child cells yield the same Nbasis Legendre coefficients to single/double - // precision. 
This test both that the descriptor/relative coordinates - // do correctly point to the same cell, and that parent cell information is - // being accurately propograted to the child cells. - // - // The random descriptors are chosen with a minimum side length of - // 1 cell, up to the entire dimension of Panphasia at that level. - // - // The argument determines the number of tests: - // - // 0 - default fast test - choose the random cell at - // levels 0,5,10,...60. One test per level. - // Run time about 1.5 seconds. - // - // N>0 - Do N iterations of the test with 1. - // In May 2020 ran with N=8000 - all tested passed. - // This provides a good test that the doubly periodic - // boundaries (of Panphasia itself, and the region - // covered by the descriptor) are working correctly. + printf("===================================================\n"); + printf("Test of Threefry4x64 generator function - PASSED\n"); + printf("Test of inverse Threefry4x64 function - PASSED\n"); + printf("Test of propogation of moments - PASSED\n"); + printf("===================================================\n"); - - test_propogation_of_moments_(0); - - - printf("===================================================\n"); - printf("Test of Threefry4x64 generator function - PASSED\n"); - printf("Test of inverse Threefry4x64 function - PASSED\n"); - printf("Test of propogation of moments - PASSED\n"); - printf("===================================================\n"); - - panphasia_rel_origin_set = 0; // Force user to set rel origin themselves. + panphasia_rel_origin_set = 0; // Force user to set rel origin themselves. 
}; -int PANPHASIA_init_level_(size_t *rel_lev, - size_t *rel_orig_x, size_t *rel_orig_y, - size_t *rel_orig_z, int *verbose){ +int PANPHASIA_init_level_(size_t *rel_lev, + size_t *rel_orig_x, size_t *rel_orig_y, + size_t *rel_orig_z, int *verbose) +{ - - if (*rel_lev>63) return(101); - if (descriptor_base_level+*rel_lev>63) return (102); + if (*rel_lev > 63) + return (101); + if (descriptor_base_level + *rel_lev > 63) + return (102); - if (*rel_orig_x>=(descriptor_base_size<<*rel_lev)) return(103); - if (*rel_orig_y>=(descriptor_base_size<<*rel_lev)) return(104); - if (*rel_orig_z>=(descriptor_base_size<<*rel_lev)) return(105); + if (*rel_orig_x >= (descriptor_base_size << *rel_lev)) + return (103); + if (*rel_orig_y >= (descriptor_base_size << *rel_lev)) + return (104); + if (*rel_orig_z >= (descriptor_base_size << *rel_lev)) + return (105); // Copy to global set of relative coordinates rel_level = *rel_lev; - rel_origin_x = *rel_orig_x; - rel_origin_y = *rel_orig_y; - rel_origin_z = *rel_orig_z; - rel_coord_max= descriptor_base_size<<*rel_lev; + rel_origin_x = *rel_orig_x; + rel_origin_y = *rel_orig_y; + rel_origin_z = *rel_orig_z; + rel_coord_max = descriptor_base_size << *rel_lev; - if (*verbose){ + if (*verbose) + { printf("-----------------------------------------------------------------\n"); printf("Initialising a Panphasia subgrid\n"); - printf("Relative level %llu\n",rel_level); - printf("Relative origin (%llu,%llu,%llu)\n",rel_origin_x,rel_origin_y,rel_origin_z); - printf("The maximum possible extent of this subgrid is %llu cells\n",rel_coord_max); + printf("Relative level %llu\n", rel_level); + printf("Relative origin (%llu,%llu,%llu)\n", rel_origin_x, rel_origin_y, rel_origin_z); + printf("The maximum possible extent of this subgrid is %llu cells\n", rel_coord_max); printf("-----------------------------------------------------------------\n"); - }; - panphasia_rel_origin_set = 1; - - return(0); - + panphasia_rel_origin_set = 1; + return (0); }; 
+//====================================================================================== +//====================================================================================== +//====================================================================================== +//====================================================================================== +//====================================================================================== +int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zstart, + size_t *xextent, size_t *yextent, size_t *zextent, + size_t *copy_list, + size_t *ncopy, void *output_values, int *flag_output_mode, int *verbose) +{ -//====================================================================================== -//====================================================================================== -//====================================================================================== -//====================================================================================== -//====================================================================================== -int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t*zstart, - size_t *xextent, size_t *yextent, size_t *zextent, - size_t *copy_list, - size_t *ncopy, void *output_values, int *flag_output_mode, int *verbose){ - - - size_t cumulative_cell_index[Nbasis+1]; - size_t level_max = descriptor_base_level+rel_level; + size_t cumulative_cell_index[Nbasis + 1]; + size_t level_max = descriptor_base_level + rel_level; size_t cell_memory_to_allocate; - //ticks tic_tot; - + //ticks tic_start = getticks(); - - //================== Basic error checking of input parameters ========== - if (panphasia_rel_origin_set!=1) return(200); - if (*xstart>=rel_coord_max) return(201); - if (*ystart>=rel_coord_max) return(202); - if (*zstart>=rel_coord_max) return(203); - - if ((*xextent>rel_coord_max)||(*xextent==0)) return(204); - if 
((*yextent>rel_coord_max)||(*yextent==0)) return(205); - if ((*zextent>rel_coord_max)||(*zextent==0)) return(206); + if (panphasia_rel_origin_set != 1) + return (200); + if (*xstart >= rel_coord_max) + return (201); + if (*ystart >= rel_coord_max) + return (202); + if (*zstart >= rel_coord_max) + return (203); - if ((*ncopy<0)||(*ncopy>Nbasis)) return(207); - - if ((copy_list[0]<0)||(copy_list[*ncopy-1]>=Nbasis)) return(208); + if ((*xextent > rel_coord_max) || (*xextent == 0)) + return (204); + if ((*yextent > rel_coord_max) || (*yextent == 0)) + return (205); + if ((*zextent > rel_coord_max) || (*zextent == 0)) + return (206); + if ((*ncopy < 0) || (*ncopy > Nbasis)) + return (207); - for (int i=1; i<*ncopy; i++) if (copy_list[i]<=copy_list[i-1]) return(209); + if ((copy_list[0] < 0) || (copy_list[*ncopy - 1] >= Nbasis)) + return (208); - - //======================================================================= - // Allocate storage for one dimensional x,y,z cell coordinate lists - //======================================================================= + for (int i = 1; i < *ncopy; i++) + if (copy_list[i] <= copy_list[i - 1]) + return (209); -size_t nreturn_x = 2*(*xextent) + 200; -size_t nreturn_y = 2*(*yextent) + 200; -size_t nreturn_z = 2*(*zextent) + 200; + //======================================================================= + // Allocate storage for one dimensional x,y,z cell coordinate lists + //======================================================================= -size_t *ret_x_list_coords = malloc(sizeof(size_t)*nreturn_x); - if (ret_x_list_coords==NULL) return(220); -size_t *ret_y_list_coords = malloc(sizeof(size_t)*nreturn_y); - if (ret_y_list_coords==NULL) return(221); -size_t *ret_z_list_coords = malloc(sizeof(size_t)*nreturn_z); - if (ret_z_list_coords==NULL) return(222); + size_t nreturn_x = 2 * (*xextent) + 200; + size_t nreturn_y = 2 * (*yextent) + 200; + size_t nreturn_z = 2 * (*zextent) + 200; -long long int *child_pointer_x = 
malloc(sizeof(size_t)*2*nreturn_x); - if (child_pointer_x==NULL) return(223); -long long int *child_pointer_y = malloc(sizeof(size_t)*2*nreturn_y); - if (child_pointer_x==NULL) return(224); -long long int *child_pointer_z = malloc(sizeof(size_t)*2*nreturn_z); - if (child_pointer_z==NULL) return(225); + size_t *ret_x_list_coords = malloc(sizeof(size_t) * nreturn_x); + if (ret_x_list_coords == NULL) + return (220); + size_t *ret_y_list_coords = malloc(sizeof(size_t) * nreturn_y); + if (ret_y_list_coords == NULL) + return (221); + size_t *ret_z_list_coords = malloc(sizeof(size_t) * nreturn_z); + if (ret_z_list_coords == NULL) + return (222); -size_t level_begin_x[64],level_count_x[64]; -size_t level_begin_y[64],level_count_y[64]; -size_t level_begin_z[64],level_count_z[64]; + long long int *child_pointer_x = malloc(sizeof(size_t) * 2 * nreturn_x); + if (child_pointer_x == NULL) + return (223); + long long int *child_pointer_y = malloc(sizeof(size_t) * 2 * nreturn_y); + if (child_pointer_x == NULL) + return (224); + long long int *child_pointer_z = malloc(sizeof(size_t) * 2 * nreturn_z); + if (child_pointer_z == NULL) + return (225); -size_t *index_perm_x = malloc(sizeof(size_t)*nreturn_x); - if (index_perm_x==NULL) return(226); -size_t *index_perm_y = malloc(sizeof(size_t)*nreturn_y); - if (index_perm_y==NULL) return(226); -size_t *index_perm_z = malloc(sizeof(size_t)*nreturn_z); - if (index_perm_z==NULL) return(226); + size_t level_begin_x[64], level_count_x[64]; + size_t level_begin_y[64], level_count_y[64]; + size_t level_begin_z[64], level_count_z[64]; -size_t *list_cell_x_coord = malloc(sizeof(size_t)*(*xextent)); - if (list_cell_x_coord==NULL) return(227); -size_t *list_cell_y_coord = malloc(sizeof(size_t)*(*yextent)); - if (list_cell_y_coord==NULL) return(228); -size_t *list_cell_z_coord = malloc(sizeof(size_t)*(*zextent)); - if (list_cell_z_coord==NULL) return(229); + size_t *index_perm_x = malloc(sizeof(size_t) * nreturn_x); + if (index_perm_x == NULL) + 
return (226); + size_t *index_perm_y = malloc(sizeof(size_t) * nreturn_y); + if (index_perm_y == NULL) + return (226); + size_t *index_perm_z = malloc(sizeof(size_t) * nreturn_z); + if (index_perm_z == NULL) + return (226); -//================================================================ -// Make x,y,z lists of cell coordinates // -//================================================================ -{ - for (size_t i =0; i<*xextent; i++){ - size_t xabs,yabs,zabs; - calc_absolute_coordinates(*xstart+i,*ystart,*zstart,&xabs,&yabs,&zabs); + size_t *list_cell_x_coord = malloc(sizeof(size_t) * (*xextent)); + if (list_cell_x_coord == NULL) + return (227); + size_t *list_cell_y_coord = malloc(sizeof(size_t) * (*yextent)); + if (list_cell_y_coord == NULL) + return (228); + size_t *list_cell_z_coord = malloc(sizeof(size_t) * (*zextent)); + if (list_cell_z_coord == NULL) + return (229); + + //================================================================ + // Make x,y,z lists of cell coordinates // + //================================================================ + { + for (size_t i = 0; i < *xextent; i++) + { + size_t xabs, yabs, zabs; + calc_absolute_coordinates(*xstart + i, *ystart, *zstart, &xabs, &yabs, &zabs); list_cell_x_coord[i] = xabs; - }; + }; - for (size_t i =0; i<*yextent; i++){ - size_t xabs,yabs,zabs; - calc_absolute_coordinates(*xstart,*ystart+i,*zstart,&xabs,&yabs,&zabs); + for (size_t i = 0; i < *yextent; i++) + { + size_t xabs, yabs, zabs; + calc_absolute_coordinates(*xstart, *ystart + i, *zstart, &xabs, &yabs, &zabs); list_cell_y_coord[i] = yabs; - }; + }; - for (size_t i =0; i<*zextent; i++){ - size_t xabs,yabs,zabs; - calc_absolute_coordinates(*xstart,*ystart,*zstart+i,&xabs,&yabs,&zabs); + for (size_t i = 0; i < *zextent; i++) + { + size_t xabs, yabs, zabs; + calc_absolute_coordinates(*xstart, *ystart, *zstart + i, &xabs, &yabs, &zabs); list_cell_z_coord[i] = zabs; - }; + }; + }; + 
//================================================================ + // Generate 1-D binary trees for each of the x,y,z cuboid dimensions + //================================================================ + { + int error_code; -}; -//================================================================ -// Generate 1-D binary trees for each of the x,y,z cuboid dimensions -//================================================================ -{ -int error_code; + if (error_code = return_binary_tree_cell_lists(level_max, list_cell_x_coord, + *xextent, ret_x_list_coords, nreturn_x, child_pointer_x, + level_count_x, level_begin_x, index_perm_x)) + return (error_code); + if (error_code = return_binary_tree_cell_lists(level_max, list_cell_y_coord, + *yextent, ret_y_list_coords, nreturn_y, child_pointer_y, + level_count_y, level_begin_y, index_perm_y)) + return (error_code); + if (error_code = return_binary_tree_cell_lists(level_max, list_cell_z_coord, + *zextent, ret_z_list_coords, nreturn_z, child_pointer_z, + level_count_z, level_begin_z, index_perm_z)) + return (error_code); + }; + //=================================================================== + // Allocate memory to store all the cell properties + //=================================================================== + { + size_t number_of_cells = 0; + for (int i = level_max; i >= 0; i--) + { + cumulative_cell_index[i] = number_of_cells; + number_of_cells += level_count_x[i] * level_count_y[i] * level_count_z[i]; + }; -if (error_code = return_binary_tree_cell_lists(level_max, list_cell_x_coord, - *xextent, ret_x_list_coords, nreturn_x, child_pointer_x, - level_count_x, level_begin_x, index_perm_x)) return(error_code); -if (error_code = return_binary_tree_cell_lists(level_max, list_cell_y_coord, - *yextent, ret_y_list_coords, nreturn_y, child_pointer_y, - level_count_y, level_begin_y, index_perm_y)) return(error_code); -if (error_code = return_binary_tree_cell_lists(level_max, list_cell_z_coord, - *zextent, 
ret_z_list_coords, nreturn_z, child_pointer_z, - level_count_z, level_begin_z, index_perm_z)) return(error_code); + if (*verbose) + printf("Total number cells: %llu \n", number_of_cells); -}; - //=================================================================== - // Allocate memory to store all the cell properties - //=================================================================== - { - size_t number_of_cells = 0; + cell_memory_to_allocate = sizeof(PAN_REAL) * number_of_cells * Nbasis; + }; - for(int i=level_max; i>=0; i--) { - cumulative_cell_index[i] = number_of_cells; - number_of_cells += level_count_x[i]*level_count_y[i]*level_count_z[i]; - }; - - - if (*verbose) printf("Total number cells: %llu \n",number_of_cells); + PAN_REAL *working_space = malloc(cell_memory_to_allocate); + if (working_space == NULL) + return (210); - cell_memory_to_allocate = sizeof(PAN_REAL) * number_of_cells * Nbasis; - }; + //======================================================================================== + // Loop over octree starting at the root, for all relevant cells at each level + //======================================================================================== + size_t total_number_cells = 0; + size_t num_cell_compute = 0; + size_t num_level_max_cells = 0; + size_t total_num_children = 0; + { + size_t cell_index, j1, j2, j3; + size_t child_cells[8]; + size_t xoffset, yoffset, zoffset; + size_t ix, iy, iz; + size_t xco, yco, zco; + size_t child_index, work_index, selected_child_index; + size_t i; - PAN_REAL *working_space = malloc(cell_memory_to_allocate); - if (working_space==NULL) return (210); - - //======================================================================================== - // Loop over octree starting at the root, for all relevant cells at each level - //======================================================================================== - size_t total_number_cells=0; - size_t num_cell_compute=0; - size_t num_level_max_cells=0; - 
size_t total_num_children=0; -{ - size_t cell_index,j1,j2,j3; - size_t child_cells[8]; - size_t xoffset,yoffset,zoffset; - size_t ix,iy,iz; - size_t xco,yco,zco; - size_t child_index,work_index,selected_child_index; - size_t i; - - - - PAN_REAL parent[Nbasis]; - PAN_REAL children[8*Nbasis]; - - if (level_max==0) return_root_legendre_coefficients_(working_space); // Return root cell coefficients + PAN_REAL parent[Nbasis]; + PAN_REAL children[8 * Nbasis]; + if (level_max == 0) + return_root_legendre_coefficients_(working_space); // Return root cell coefficients #ifdef USE_OPENMP - double start, end; - start = omp_get_wtime(); - if (*verbose) printf("Start ...\n"); + double start, end; + start = omp_get_wtime(); + if (*verbose) + printf("Start ...\n"); #endif - - - for (size_t level=0; level < level_max; level++){ + for (size_t level = 0; level < level_max; level++) + { #ifdef USE_OPENMP -#pragma omp parallel for collapse(3) \ - private (cell_index,xoffset,yoffset,zoffset,j1,j2,j3,ix,iy,iz, \ - xco,yco,zco,child_index,work_index,selected_child_index,i, \ - parent,children) +#pragma omp parallel for collapse(3) private(cell_index, xoffset, yoffset, zoffset, j1, j2, j3, ix, iy, iz, \ + xco, yco, zco, child_index, work_index, selected_child_index, i, \ + parent, children) #endif - for (int cell_x=0; cell_x < level_count_x[level]; cell_x++) - for (int cell_y=0; cell_y < level_count_y[level]; cell_y++) - for (int cell_z = 0; cell_z < level_count_z[level]; cell_z++){ + for (int cell_x = 0; cell_x < level_count_x[level]; cell_x++) + for (int cell_y = 0; cell_y < level_count_y[level]; cell_y++) + for (int cell_z = 0; cell_z < level_count_z[level]; cell_z++) + { - cell_index = cumulative_cell_index[level] + cell_x*level_count_y[level]*level_count_z[level]+ - cell_y*level_count_z[level] + cell_z; + cell_index = cumulative_cell_index[level] + cell_x * level_count_y[level] * level_count_z[level] + + cell_y * level_count_z[level] + cell_z; - xoffset = level_begin_x[level] + 
cell_x; - yoffset = level_begin_y[level] + cell_y; - zoffset = level_begin_z[level] + cell_z; + xoffset = level_begin_x[level] + cell_x; + yoffset = level_begin_y[level] + cell_y; + zoffset = level_begin_z[level] + cell_z; - j1 = ret_x_list_coords[xoffset]; - j2 = ret_y_list_coords[yoffset]; - j3 = ret_z_list_coords[zoffset]; + j1 = ret_x_list_coords[xoffset]; + j2 = ret_y_list_coords[yoffset]; + j3 = ret_z_list_coords[zoffset]; - if (level==0){ return_root_legendre_coefficients_(parent); // Root cell parent information - }else{ - for(i=0; i 1) + printf("Cell: L%llu %llu %llu %llu\n", level, j1, j2, j3); + + }; // z/y/x-coordinate/level + + // if (flag_nochildren!=0) return(211); //All cells should have at least one child + }; // End loop over levels + +#ifdef USE_OPENMP + + end = omp_get_wtime(); + + if (*verbose) + printf("End ...\n"); + + double cpu_time_used = ((double)(end - start)); + + if (*verbose) + printf("Time in OMP Section = %lf seconds \n", cpu_time_used); + +#endif + }; + + //======================================================================================== + // Assign data from work_space to the input array + //======================================================================================== + { + + FFTW_REAL *ptr_real = output_values; + FFTW_COMPLEX *ptr_cmplx = output_values; + size_t zdimension = (*flag_output_mode == 2) ? 
*zextent + 2 : *zextent; // For R2C pad by two in z-dimension + + //printf("zdimension = %ld\n",zdimension); + + // PAN_COMPLEX *ptr_cplx; + // *ptr_real =(* PAN_REAL) *output_values; + // *ptr_cplx = output_values + + for (size_t xco = 0; xco < *xextent; xco++) + for (size_t yco = 0; yco < *yextent; yco++) + for (size_t zco = 0; zco < *zextent; zco++) + { + size_t xloc = index_perm_x[xco], yloc = index_perm_y[yco], zloc = index_perm_z[zco]; + + size_t index = Nbasis * (xco * (*yextent) * (*zextent) + yco * (*zextent) + zco); + size_t out_v_index = *ncopy * (xloc * (*yextent) * zdimension + yloc * zdimension + zloc); + + if (*flag_output_mode == 1) + { + + for (size_t i = 0; i < *ncopy; i++) + ptr_cmplx[out_v_index + i] = (FFTW_COMPLEX)working_space[index + copy_list[i]]; + } + else + { + + for (size_t i = 0; i < *ncopy; i++) + ptr_real[out_v_index + i] = working_space[index + copy_list[i]]; + }; }; + }; - //=================================================================================================== - compute_all_properties_of_a_panphasia_cell_(&level,&j1,&j2,&j3,parent,children); - //=================================================================================================== - - // Determine which child information needs to be stored - - - for (ix=0; ix<2; ix++) for (iy=0; iy<2; iy++) for (iz=0; iz<2; iz++){ - - if ((child_pointer_x[2*xoffset+ix]!=-1)&&(child_pointer_y[2*yoffset+iy]!=-1) - &&(child_pointer_z[2*zoffset+iz]!=-1)){ - - - xco = child_pointer_x[2*xoffset+ix] - level_begin_x[level+1]; - yco = child_pointer_y[2*yoffset+iy] - level_begin_y[level+1]; - zco = child_pointer_z[2*zoffset+iz] - level_begin_z[level+1]; - - - child_index = cumulative_cell_index[level+1] + xco*level_count_y[level+1]*level_count_z[level+1] + - yco*level_count_z[level+1] + zco; - - - work_index = Nbasis*child_index; - selected_child_index = Nbasis*(4*ix + 2*iy + iz); - for (i=0; i1) printf("Cell: L%llu %llu %llu %llu\n",level,j1,j2,j3); - - - }; // 
z/y/x-coordinate/level - - - // if (flag_nochildren!=0) return(211); //All cells should have at least one child -}; // End loop over levels - - -#ifdef USE_OPENMP - - end = omp_get_wtime(); - - if (*verbose) printf("End ...\n"); - - - double cpu_time_used = ((double) (end - start)); - - - if (*verbose) printf("Time in OMP Section = %lf seconds \n",cpu_time_used); - -#endif - - -}; - -//======================================================================================== -// Assign data from work_space to the input array -//======================================================================================== -{ - - - - FFTW_REAL *ptr_real = output_values; - FFTW_COMPLEX *ptr_cmplx = output_values; - size_t zdimension = (*flag_output_mode==2) ? *zextent + 2 : *zextent; // For R2C pad by two in z-dimension - - - //printf("zdimension = %ld\n",zdimension); - - // PAN_COMPLEX *ptr_cplx; - // *ptr_real =(* PAN_REAL) *output_values; - // *ptr_cplx = output_values - - for (size_t xco=0;xco<*xextent;xco++)for(size_t yco=0;yco<*yextent;yco++)for(size_t zco=0; zco<*zextent;zco++){ - size_t xloc = index_perm_x[xco], yloc = index_perm_y[yco], zloc = index_perm_z[zco]; - - size_t index = Nbasis*( xco*(*yextent)*(*zextent) + yco*(*zextent) + zco); - size_t out_v_index = *ncopy*(xloc*(*yextent)*zdimension + yloc*zdimension + zloc); - - - if (*flag_output_mode==1){ - - for (size_t i=0; i<*ncopy; i++) ptr_cmplx[out_v_index+i] = (FFTW_COMPLEX) working_space[index+copy_list[i]]; - - }else{ - - for (size_t i=0; i<*ncopy; i++) ptr_real[out_v_index+i] = working_space[index+copy_list[i]]; - - - }; - }; - - - }; - - -//===========================================(============================================== -// Free all memory (in order of calls to malloc above) -//========================================================================================= + //===========================================(============================================== + // Free all memory (in order of 
calls to malloc above) + //========================================================================================= free(ret_x_list_coords); free(ret_y_list_coords); free(ret_z_list_coords); - + free(child_pointer_x); free(child_pointer_y); free(child_pointer_z); @@ -1207,294 +1209,288 @@ if (error_code = return_binary_tree_cell_lists(level_max, list_cell_z_coord, free(index_perm_y); free(index_perm_z); - free(list_cell_x_coord); free(list_cell_y_coord); free(list_cell_z_coord); - free(working_space); //tic_tot = getticks()-tic_start; - - - // if (*verbose) printf("Total child cells at deepest level %llu \n",num_level_max_cells); - //if (*verbose) printf("Total number of cells computed %llu \n",num_cell_compute); - //if (*verbose) printf("Total number of child cells %llu \n",total_num_children); + // if (*verbose) printf("Total child cells at deepest level %llu \n",num_level_max_cells); + //if (*verbose) printf("Total number of cells computed %llu \n",num_cell_compute); + //if (*verbose) printf("Total number of child cells %llu \n",total_num_children); //if (*verbose) printf("Time to compute %llu cells at level %llu: %.3f %s \n",num_level_max_cells, - // level_max, clocks_from_ticks(tic_tot), clocks_getunit()); + // level_max, clocks_from_ticks(tic_tot), clocks_getunit()); -//======================================================================================= - return(0); -//======================================================================================= -}; + //======================================================================================= + return (0); + //======================================================================================= +} +//====================================================================================== //====================================================================================== //====================================================================================== 
//====================================================================================== //====================================================================================== -//====================================================================================== +int parse_and_validate_descriptor_(char *descriptor) +{ + char *token; + const char split[20] = "[,()]"; + char copy[300]; + size_t desc_order, desc_level, desc_x, desc_y, desc_z, desc_size; + char desc_name[100]; + size_t desc_kk_limit = 0; + long long int desc_ch, comp_ch; + int kk_limit_set = 0; + int nelement = 0; + char descriptor_as_read[300]; + strcpy(copy, descriptor); + token = strtok(copy, split); -int parse_and_validate_descriptor_(char *descriptor){ - - - char *token; - const char split[20] = "[,()]"; - char copy[300]; - size_t desc_order, desc_level, desc_x, desc_y, desc_z,desc_size; - char desc_name[100]; - size_t desc_kk_limit = 0; - long long int desc_ch,comp_ch; - int kk_limit_set = 0; - int nelement = 0; - char descriptor_as_read[300]; - - strcpy(copy,descriptor); - - - token = strtok(copy, split); - - - while( token != NULL ) { + while (token != NULL) + { nelement++; - // Read in compulsory elements + // Read in compulsory elements - switch(nelement){ + switch (nelement) + { case 1: - if (sscanf(token,"Panph%llu",&desc_order)!=1) return (440001); - break; + if (sscanf(token, "Panph%llu", &desc_order) != 1) + return (440001); + break; case 2: - if (sscanf(token,"L%llu",&desc_level)!=1) return 440002; - break; + if (sscanf(token, "L%llu", &desc_level) != 1) + return 440002; + break; case 3: - if (sscanf(token,"%llu",&desc_x)!=1) return 440003; - break; + if (sscanf(token, "%llu", &desc_x) != 1) + return 440003; + break; case 4: - if (sscanf(token,"%llu",&desc_y)!=1) return 440004; - break; + if (sscanf(token, "%llu", &desc_y) != 1) + return 440004; + break; case 5: - if (sscanf(token,"%llu",&desc_z)!=1) return 440005; - break; + if (sscanf(token, "%llu", &desc_z) != 1) + return 
440005; + break; case 6: - if (sscanf(token,"S%llu",&desc_size)!=1) return 440005; - break; + if (sscanf(token, "S%llu", &desc_size) != 1) + return 440005; + break; case 7: - if (sscanf(token,"KK%lld",&desc_kk_limit)==1) { - kk_limit_set=1; - token = strtok(NULL, split); - }; - if (sscanf(token,"CH%lld",&desc_ch)!=1) return 440006; + if (sscanf(token, "KK%lld", &desc_kk_limit) == 1) + { + kk_limit_set = 1; + token = strtok(NULL, split); + } + if (sscanf(token, "CH%lld", &desc_ch) != 1) + return 440006; break; case 8: - if (sscanf(token,"%s",&desc_name)!=1) return 440007; - break; - }; - token = strtok(NULL, split); - }; + if (sscanf(token, "%s", &desc_name) != 1) + return 440007; + break; + } + token = strtok(NULL, split); + } + if (kk_limit_set == 0) + { + sprintf(descriptor_as_read, "[Panph%d,L%llu,(%llu,%llu,%llu),S%llu,CH%lld,%s]", + desc_order, desc_level, desc_x, desc_y, desc_z, desc_size, desc_ch, desc_name); + } + else + { + sprintf(descriptor_as_read, "[Panph%d,L%llu,(%llu,%llu,%llu),S%llu,KK%lld,CH%lld,%s]", + desc_order, desc_level, desc_x, desc_y, desc_z, desc_size, desc_kk_limit, desc_ch, desc_name); + } - if (kk_limit_set==0){ - sprintf(descriptor_as_read,"[Panph%d,L%llu,(%llu,%llu,%llu),S%llu,CH%lld,%s]", - desc_order,desc_level,desc_x,desc_y,desc_z,desc_size,desc_ch,desc_name); - } else{ - sprintf(descriptor_as_read,"[Panph%d,L%llu,(%llu,%llu,%llu),S%llu,KK%lld,CH%lld,%s]", - desc_order,desc_level,desc_x,desc_y,desc_z,desc_size,desc_kk_limit,desc_ch,desc_name); + if (strcmp(descriptor, descriptor_as_read)) + { + printf("Error - descriptor mismatch\n"); + printf("As read in: %s\n", descriptor_as_read); + printf(" %s\n", descriptor); + } - }; + // Valid format descriptor has been passed - store values - if (strcmp(descriptor,descriptor_as_read)){ - printf("Error - descriptor mismatch\n"); - printf("As read in: %s\n",descriptor_as_read); - printf(" %s\n",descriptor); - }; + descriptor_order = desc_order; + descriptor_base_level = desc_level; + 
descriptor_xorigin = desc_x; + descriptor_yorigin = desc_y; + descriptor_zorigin = desc_z; + descriptor_base_size = desc_size; + descriptor_kk_limit = desc_kk_limit; + descriptor_check_digit = desc_ch; + strcpy(descriptor_name, desc_name); + strcpy(full_descriptor, descriptor); + descriptor_read_in = 1; + comp_ch = compute_check_digit_(); // check the check digit - // Valid format descriptor has been passed - store values + if ((desc_ch != -999) && (desc_ch != comp_ch)) + { + descriptor_read_in = 0; + printf("Check digit read in %llu\n Check digit expected %llu\n", desc_ch, comp_ch); + return (44008); + } + return (0); +} +void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel, size_t *xabs, size_t *yabs, size_t *zabs) +{ - descriptor_order = desc_order; - descriptor_base_level = desc_level; - descriptor_xorigin = desc_x; - descriptor_yorigin = desc_y; - descriptor_zorigin = desc_z; - descriptor_base_size = desc_size; - descriptor_kk_limit = desc_kk_limit; - descriptor_check_digit = desc_ch; - strcpy(descriptor_name, desc_name); - strcpy(full_descriptor,descriptor); - descriptor_read_in = 1; + *xabs = ((descriptor_xorigin << rel_level) + ((rel_origin_x + xrel) % rel_coord_max)) % ((size_t)1 << (descriptor_base_level + rel_level)); - comp_ch = compute_check_digit_(); // check the check digit + *yabs = ((descriptor_yorigin << rel_level) + ((rel_origin_y + yrel) % rel_coord_max)) % ((size_t)1 << (descriptor_base_level + rel_level)); - if ((desc_ch!=-999)&&(desc_ch!=comp_ch)){ - descriptor_read_in = 0; - printf("Check digit read in %llu\n Check digit expected %llu\n",desc_ch,comp_ch); - return (44008); - }; - - - return(0); - -}; - - - -void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel,size_t *xabs, size_t *yabs,size_t *zabs){ - - *xabs = ((descriptor_xorigin<=cumulative_cell_index[descriptor_base_level+rel_level+1]) return(301); + if (cell_id >= cumulative_cell_index[descriptor_base_level + rel_level + 1]) + return (301); size_t 
cell_level; - for (cell_level = descriptor_base_level+rel_level; - cell_id < cumulative_cell_index[cell_level];cell_level--); - + for (cell_level = descriptor_base_level + rel_level; + cell_id < cumulative_cell_index[cell_level]; cell_level--) + ; + size_t local_id = cell_id - cumulative_cell_index[cell_level]; - *cell_x = local_id/(cuboid_y_dimen[cell_level]*cuboid_z_dimen[cell_level]); - *cell_y = (local_id - *cell_x*cuboid_y_dimen[cell_level]*cuboid_z_dimen[cell_level])/cuboid_z_dimen[cell_level]; - *cell_z = local_id%cuboid_z_dimen[cell_level]; + *cell_x = local_id / (cuboid_y_dimen[cell_level] * cuboid_z_dimen[cell_level]); + *cell_y = (local_id - *cell_x * cuboid_y_dimen[cell_level] * cuboid_z_dimen[cell_level]) / cuboid_z_dimen[cell_level]; + *cell_z = local_id % cuboid_z_dimen[cell_level]; //printf("Cell level %llu x %llu y %llu z %llu\n",cell_level,*cell_x,*cell_y,*cell_z); - + return (0); +} - return(0); +int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinates, + size_t extent, size_t *return_tree_list_coordinates, size_t nreturn, + long long int *child_pointer, size_t *level_count, + size_t *level_begin, size_t *index_perm) +{ -}; + if (extent == 0) + return (401); + if (nreturn < 2 * extent + 192) + return (402); + for (size_t i = 0; i < 2 * nreturn; i++) + child_pointer[i] = -1; - - -int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinates, - size_t extent, size_t *return_tree_list_coordinates, size_t nreturn, - long long int *child_pointer, size_t *level_count, - size_t *level_begin, size_t *index_perm){ - -if (extent==0) return(401); -if (nreturn<2*extent+192) return(402); - -for (size_t i=0; i<2*nreturn;i++) child_pointer[i]=-1; - -{ size_t stride=1; - for(size_t i=0; i0; level--){ - - offset =level_begin[level]+level_count[level]; - counter = 0; - - abs_coord = return_tree_list_coordinates[level_begin[level]]; - - return_tree_list_coordinates[offset] = abs_coord>>1; - child_pointer[2*offset + 
abs_coord%2] = level_begin[level]; - - for(size_t cell = 1; cell>1 == return_tree_list_coordinates[offset+counter]){ - child_pointer[2*offset + 2*counter + abs_coord%2] = level_begin[level]+cell; - }else{ - counter++; - return_tree_list_coordinates[offset+counter] = abs_coord>>1; - child_pointer[2*offset + 2*counter + abs_coord%2] = level_begin[level]+cell; + { + size_t stride = 1; + for (size_t i = 0; i < extent; i++) + { + index_perm[i] = i; + return_tree_list_coordinates[i] = list_cell_coordinates[i]; }; + gsl_sort2_ulong(return_tree_list_coordinates, stride, index_perm, stride, extent); + } - }; //cell loop + //---------------------------------------------------------------------------- + level_begin[level_max] = 0; + level_count[level_max] = extent; + size_t offset, counter; + size_t abs_coord; - level_count[level-1] = ++counter; - level_begin[level-1] = level_begin[level] + level_count[level]; - }; // level loop + for (size_t level = level_max; level > 0; level--) + { -return(0); + offset = level_begin[level] + level_count[level]; + counter = 0; -}; -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -// -// Test code for checking the appropriate moments are preserved -// between levels in Panphasia -// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// 
-///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////// + abs_coord = return_tree_list_coordinates[level_begin[level]]; -#include + return_tree_list_coordinates[offset] = abs_coord >> 1; + child_pointer[2 * offset + abs_coord % 2] = level_begin[level]; + for (size_t cell = 1; cell < level_count[level]; cell++) + { + abs_coord = return_tree_list_coordinates[level_begin[level] + cell]; -void integrate_cell( int, int, int, size_t, size_t, size_t, FFTW_REAL * , double *); + if (abs_coord >> 1 == return_tree_list_coordinates[offset + counter]) + { + child_pointer[2 * offset + 2 * counter + abs_coord % 2] = level_begin[level] + cell; + } + else + { + counter++; + return_tree_list_coordinates[offset + counter] = abs_coord >> 1; + child_pointer[2 * offset + 2 * counter + abs_coord % 2] = level_begin[level] + cell; + } + + } //cell loop + + level_count[level - 1] = ++counter; + level_begin[level - 1] = level_begin[level] + level_count[level]; + }; // level loop + + return (0); +} + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + // + // Test code for checking the appropriate moments are preserved + // between levels in Panphasia + // + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + 
///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////// + +#include + +void integrate_cell(int, int, int, size_t, size_t, size_t, FFTW_REAL *, double *); int compute_panphasia_(double, double, double, size_t, size_t, size_t, FFTW_REAL *, double *); -void test_cell_moments(char *,size_t, size_t, size_t, size_t, size_t, double *); +void test_cell_moments(char *, size_t, size_t, size_t, size_t, size_t, double *); //////////////////////////////////////////////////////////////////////////////// -void test_moments_(){ - +void test_moments_() +{ int lev = 10; - char descriptor_demo[300]="Hello!"; - printf("Demo string %s\n",descriptor_demo); + char descriptor_demo[300] = "Hello!"; + printf("Demo string %s\n", descriptor_demo); // descriptor_pair_generate_();//, descriptor_demo); - printf("Parameters: %s\n",descriptor_demo); + printf("Parameters: %s\n", descriptor_demo); - - - - size_t nlevel=1; + size_t nlevel = 1; double coefficients1[Nbasis]; double coefficients2[Nbasis]; - double max_diff2=0.0; - double rms_diff2=0.0; - + double max_diff2 = 0.0; + double rms_diff2 = 0.0; char descriptor[200]; @@ -1502,103 +1498,111 @@ void test_moments_(){ size_t const yco_full = 0x67ea73c992a3355c; size_t const zco_full = 0x5ab50a5892e98768; - size_t xco = 0; size_t yco = 0; size_t zco=0; + size_t xco = 0; + size_t yco = 0; + size_t zco = 0; - verbose_warnings_only=1; // Minimize output to screen. + verbose_warnings_only = 1; // Minimize output to screen. 
- for (size_t level=0; level<63; level++){ + for (size_t level = 0; level < 63; level++) + { - xco = (xco_full)>>(63-level); - yco = (yco_full)>>(63-level); - zco = (zco_full)>>(63-level); + xco = (xco_full) >> (63 - level); + yco = (yco_full) >> (63 - level); + zco = (zco_full) >> (63 - level); - sprintf(descriptor,"[Panph6,L%ld,(%llu,%llu,%llu),S1,CH-999,test]",level,xco,yco,zco); - // printf("%s\n",descriptor); + sprintf(descriptor, "[Panph6,L%ld,(%llu,%llu,%llu),S1,CH-999,test]", level, xco, yco, zco); + // printf("%s\n",descriptor); - test_cell_moments(descriptor,0,0,0,0,1,coefficients1); + test_cell_moments(descriptor, 0, 0, 0, 0, 1, coefficients1); - test_cell_moments(descriptor,1,0,0,0,2,coefficients2); + test_cell_moments(descriptor, 1, 0, 0, 0, 2, coefficients2); - for (int i=0; imax_diff2) max_diff2=diff2; - rms_diff2+=diff2; - }; + for (int i = 0; i < Nbasis; i++) + { + double diff2 = pow(coefficients2[i] - coefficients1[i], 2); + if (diff2 > max_diff2) + max_diff2 = diff2; + rms_diff2 += diff2; + } - rms_diff2/=(double)Nbasis; + rms_diff2 /= (double)Nbasis; - // for (int i=0; i 1.e-12) || (rms_diff2 > 1.e-12)) + { + printf("Moments not accurately recovered at single precision\n"); + abort(); + } + } + else + { - if ((max_diff2>1.e-12)||(rms_diff2>1.e-12)){ - printf("Moments not accurately recovered at single precision\n"); abort(); - }; + if ((max_diff2 > 1.e-24) || (rms_diff2 > 1.e-24)) + { + printf("Moments not accurately recovered at double precision\n"); + abort(); + } + } - }else{ - - if ((max_diff2>1.e-24)||(rms_diff2>1.e-24)){ - printf("Moments not accurately recovered at double precision\n"); abort(); - }; - - }; - - //printf("Acceptable differences: %e RMS difference %e\n",sqrt(max_diff2),sqrt(rms_diff2)); - -}; + //printf("Acceptable differences: %e RMS difference %e\n",sqrt(max_diff2),sqrt(rms_diff2)); + } printf("Completed moment test successfully.\n"); - - -}; +} void test_cell_moments(char root_descriptor[200], size_t rel_lev, size_t 
rel_orig_x, - size_t rel_orig_y, size_t rel_orig_z, size_t extent, double *coeff ){ + size_t rel_orig_y, size_t rel_orig_z, size_t extent, double *coeff) +{ -int error_code; -int verbose = 0; - int flag_output_mode=0; + int error_code; + int verbose = 0; + int flag_output_mode = 0; -PANPHASIA_init_descriptor_(root_descriptor,&verbose); + PANPHASIA_init_descriptor_(root_descriptor, &verbose); + verbose = 0; - verbose = 0; - + if (error_code = PANPHASIA_init_level_(&rel_lev, + &rel_orig_x, &rel_orig_y, &rel_orig_z, &verbose)) + { + printf("Error %d in initialing PANPHASIA_init_level_\n", + error_code); + } - if (error_code = PANPHASIA_init_level_(&rel_lev, - &rel_orig_x,&rel_orig_y,&rel_orig_z,&verbose)){ - printf("Error %d in initialing PANPHASIA_init_level_\n", - error_code); - }; + size_t xstart = 0, ystart = 0, zstart = 0; - size_t xstart = 0, ystart = 0, zstart = 0; - - size_t xextent, yextent, zextent; + size_t xextent, yextent, zextent; - xextent = extent; yextent=extent; zextent=extent; - size_t copy_list[Nbasis]; - for (int i=0; i= 10) + { + printf("Higher order Gaussian Quadrature needed!\n"); + abort(); + } - if (p_order>=10){printf("Higher order Gaussian Quadrature needed!\n");abort();}; + double a = 0.0; + double b = 1.0; - double a = 0.0; - double b = 1.0; + double middle = 0.5 * (b + a); + double range = 0.5 * (b - a); - double middle = 0.5*(b+a); - double range = 0.5*(b-a); + double sum[Nbasis]; + for( size_t i=0; i=(double)xextent)) return (1); - if ((y<0)||(y>=(double)yextent)) return (1); - if ((z<0)||(z>=(double)zextent)) return (1); + if ((x < 0) || (x >= (double)xextent)) + return (1); + if ((y < 0) || (y >= (double)yextent)) + return (1); + if ((z < 0) || (z >= (double)zextent)) + return (1); int ix = (int)x; int iy = (int)y; int iz = (int)z; - double up = 2.0*(x-ix)-1.0; - double vp = 2.0*(y-iy)-1.0; - double wp = 2.0*(z-iz)-1.0; + double up = 2.0 * (x - ix) - 1.0; + double vp = 2.0 * (y - iy) - 1.0; + double wp = 2.0 * (z - iz) - 1.0; - 
double lgp_up[p_order+1]; - double lgp_vp[p_order+1]; - double lgp_wp[p_order+1]; + double lgp_up[p_order + 1]; + double lgp_vp[p_order + 1]; + double lgp_wp[p_order + 1]; int p = p_order; - gsl_sf_legendre_Pl_array(p,up,lgp_up); - gsl_sf_legendre_Pl_array(p,vp,lgp_vp); - gsl_sf_legendre_Pl_array(p,wp,lgp_wp); + gsl_sf_legendre_Pl_array(p, up, lgp_up); + gsl_sf_legendre_Pl_array(p, vp, lgp_vp); + gsl_sf_legendre_Pl_array(p, wp, lgp_wp); - for (int i=0; i #include - - #include "PAN_FFTW3.h" #include "panphasia_functions.h" @@ -19,79 +17,114 @@ int threads_ok; int number_omp_threads = 1; #endif +// does the same as the main below, but does not initialise MPI or FFTW (this should be done in MONOFONIC) +int PANPHASIA_HO_main(void) +{ + int verbose = 0; + int error; + size_t x0 = 0, y0 = 0, z0 = 0; + size_t rel_level; + char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; + PANPHASIA_init_descriptor_(descriptor, &verbose); + rel_level = 6; //Set size of test dataset + + if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose)) + { + printf("Abort: PANPHASIA_init_level_ :error code %d\n", error); + abort(); + }; + + //======================= FFTW ============================== + + ptrdiff_t alloc_local, local_n0, local_0_start; + + ptrdiff_t N0 = descriptor_base_size << rel_level; + + alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0, MPI_COMM_WORLD, &local_n0, &local_0_start); + + FFTW_COMPLEX *Panphasia_White_Noise_Field; + + Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local); + + if (error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field)) + { + printf("Error code from PANPHASIA_compute ... 
%d\n", error); + }; + + fftw_free(Panphasia_White_Noise_Field); +} + +#ifdef STANDALONE_PANPHASIA_HO int main(int argc, char **argv) { -int verbose=0; -int error; -size_t x0=0, y0=0, z0=0; -size_t rel_level; -char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; + int verbose = 0; + int error; + size_t x0 = 0, y0 = 0, z0 = 0; + size_t rel_level; + char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; #ifdef USE_OPENMP - omp_set_num_threads(number_omp_threads); - int provided; - MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); - threads_ok = provided >= MPI_THREAD_FUNNELED; - if (threads_ok) threads_ok = fftw_init_threads(); - fftw_mpi_init(); - int num_threads = number_omp_threads ; - if (threads_ok){ + omp_set_num_threads(number_omp_threads); + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); + threads_ok = provided >= MPI_THREAD_FUNNELED; + if (threads_ok) + threads_ok = fftw_init_threads(); + fftw_mpi_init(); + int num_threads = number_omp_threads; + if (threads_ok) + { fftw_plan_with_nthreads(num_threads); - }else{ + } + else + { printf("Failure to initialise threads ...\n"); - MPI_Finalize(); - }; - - printf("OpenMP threads enabled with FFTW. Number of threads %d\n",fftw_planner_nthreads()); + MPI_Finalize(); + }; + + printf("OpenMP threads enabled with FFTW. 
Number of threads %d\n", fftw_planner_nthreads()); #else - MPI_Init(&argc, &argv); + MPI_Init(&argc, &argv); #endif - PANPHASIA_init_descriptor_(descriptor,&verbose); + PANPHASIA_init_descriptor_(descriptor, &verbose); + rel_level = 6; //Set size of test dataset - rel_level = 6; //Set size of test dataset + if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose)) + { + printf("Abort: PANPHASIA_init_level_ :error code %d\n", error); + abort(); + }; + //======================= FFTW ============================== -if (error=PANPHASIA_init_level_(&rel_level,&x0,&y0,&z0,&verbose)){ - printf("Abort: PANPHASIA_init_level_ :error code %d\n",error); - abort(); -}; + fftw_mpi_init(); -//======================= FFTW ============================== + ptrdiff_t alloc_local, local_n0, local_0_start; -fftw_mpi_init(); + ptrdiff_t N0 = descriptor_base_size << rel_level; -ptrdiff_t alloc_local, local_n0, local_0_start; + alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0, MPI_COMM_WORLD, &local_n0, &local_0_start); -ptrdiff_t N0 = descriptor_base_size<. +// +// IMPORTANT NOTICE: +// Note that PANPHASIA itself is not released under the GPL. 
Make sure +// to read and agree to its distinct licensing before you use or modify +// the code below or in the /external/panphasia directory which can be +// found here: http://icc.dur.ac.uk/Panphasia.php +// NOTE THAT PANPHASIA REQUIRES REGISTRATION ON THIS WEBSITE PRIOR TO USE + +#if defined(USE_PANPHASIA_HO) + +#include +#include +#include + +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include + +extern "C"{ + int PANPHASIA_HO_main( void ); +} + + +class RNG_panphasia_ho : public RNG_plugin +{ +private: +protected: + std::string descriptor_string_; + int num_threads_; + + +public: + explicit RNG_panphasia_ho(config_file &cf) : RNG_plugin(cf) + { + descriptor_string_ = pcf_->get_value("random", "descriptor"); + +#ifdef _OPENMP + num_threads_ = omp_get_max_threads(); +#else + num_threads_ = 1; +#endif + + PANPHASIA_HO_main(); + } + + ~RNG_panphasia_ho() { } + + bool isMultiscale() const { return true; } + + void Fill_Grid(Grid_FFT &g) + { + + } +}; + +namespace +{ + RNG_plugin_creator_concrete creator("PANPHASIA_HO"); +} +#endif // defined(USE_PANPHASIA_HO) \ No newline at end of file From 8137262ea7fa3e00a5a2a9ca378f6b24af543346 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Tue, 4 May 2021 22:14:10 +0200 Subject: [PATCH 03/25] bugfixes for RAMSES output with MPI --- src/ic_generator.cc | 4 ++ src/plugins/output_grafic2.cc | 87 +++++++++++++++++++---------------- 2 files changed, 51 insertions(+), 40 deletions(-) diff --git a/src/ic_generator.cc b/src/ic_generator.cc index 5fc59e5..8fd81c0 100644 --- a/src/ic_generator.cc +++ b/src/ic_generator.cc @@ -591,6 +591,10 @@ int run( config_file& the_config ) } } } + #if defined(USE_MPI) + real_t local_maxdphi = maxdphi; + MPI_Allreduce( &local_maxdphi, &maxdphi, 1, MPI::get_datatype(), MPI_MAX, MPI_COMM_WORLD ); + #endif const real_t hbar_safefac = 1.01; const real_t hbar = maxdphi / M_PI / Dplus0 * hbar_safefac; music::ilog << "Semiclassical PT : hbar = " << hbar << " (limited by initial 
potential, safety=" << hbar_safefac << ")." << std::endl; diff --git a/src/plugins/output_grafic2.cc b/src/plugins/output_grafic2.cc index de0a603..03b2841 100644 --- a/src/plugins/output_grafic2.cc +++ b/src/plugins/output_grafic2.cc @@ -51,7 +51,7 @@ protected: real_t lunit_, vunit_, munit_, omegab_; uint32_t levelmin_; bool bhavebaryons_; - std::vector data_buf_; + std::vector data_buf_, data_buf_write_; std::string dirname_; bool bUseSPT_; @@ -207,52 +207,59 @@ void grafic2_output_plugin::write_grid_data(const Grid_FFT &g, const cos std::string file_name = this->get_file_name(s, c); // serialize parallel write - for (int write_rank = 0; write_rank < CONFIG::MPI_task_size; ++write_rank) + if (CONFIG::MPI_task_rank == 0) { - if (write_rank == CONFIG::MPI_task_rank) + unlink(file_name.c_str()); + } + + std::ofstream *pofs; + + // write header or seek to end of file + if (CONFIG::MPI_task_rank == 0) + { + pofs = new std::ofstream(file_name.c_str(), std::ios::binary|std::ios::app); + uint32_t blocksz = sizeof(header); + pofs->write(reinterpret_cast(&blocksz), sizeof(int)); + pofs->write(reinterpret_cast(&header_), blocksz); + pofs->write(reinterpret_cast(&blocksz), sizeof(int)); + } + + // check field size against buffer size... 
+ uint32_t ngrid = cf_.get_value("setup", "GridRes"); + assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid); + assert( g.size(1) == ngrid && g.size(2) == ngrid); + // write actual field slice by slice + // std::cerr << write_rank << ">" << g.size(0) << " " << g.size(1) << " " << g.size(2) << std::endl; + for (size_t i = 0; i < g.size(2); ++i) + { + data_buf_.assign(ngrid * ngrid, 0.0f); + + for (unsigned j = 0; j < g.size(1); ++j) { - if (write_rank == 0) + for (unsigned k = 0; k < g.size(0); ++k) { - unlink(file_name.c_str()); + data_buf_[j * ngrid + (k+g.local_0_start_)] = g.relem(k, j, i); } - std::ofstream ofs(file_name.c_str(), std::ios::binary|std::ios::app); - - // write header or seek to end of file - if (write_rank == 0) - { - uint32_t blocksz = sizeof(header); - ofs.write(reinterpret_cast(&blocksz), sizeof(int)); - ofs.write(reinterpret_cast(&header_), blocksz); - ofs.write(reinterpret_cast(&blocksz), sizeof(int)); - } - - // check field size against buffer size... 
- uint32_t ngrid = cf_.get_value("setup", "GridRes"); - assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid); - assert( g.size(1) == ngrid && g.size(2) == ngrid); - // write actual field slice by slice - for (size_t i = 0; i < g.size(2); ++i) - { - for (unsigned j = 0; j < g.size(1); ++j) - { - for (unsigned k = 0; k < g.size(0); ++k) - { - data_buf_[j * ngrid + k] = g.relem(k, j, i); - } - } - - uint32_t blocksz = ngrid * ngrid * sizeof(float); - ofs.write(reinterpret_cast(&blocksz), sizeof(uint32_t)); - ofs.write(reinterpret_cast(&data_buf_[0]), blocksz); - ofs.write(reinterpret_cast(&blocksz), sizeof(uint32_t)); - } - - ofs.close(); } +#if defined(USE_MPI) + if( CONFIG::MPI_task_rank == 0 ) data_buf_write_.assign(ngrid*ngrid,0.0f); + MPI_Reduce( &data_buf_[0], &data_buf_write_[0], ngrid*ngrid, MPI::get_datatype(), MPI_SUM, 0, MPI_COMM_WORLD ); + if( CONFIG::MPI_task_rank == 0 ) data_buf_.swap(data_buf_write_); +#endif - multitask_sync_barrier(); + if( CONFIG::MPI_task_rank == 0 ) + { + uint32_t blocksz = ngrid * ngrid * sizeof(float); + pofs->write(reinterpret_cast(&blocksz), sizeof(uint32_t)); + pofs->write(reinterpret_cast(&data_buf_[0]), blocksz); + pofs->write(reinterpret_cast(&blocksz), sizeof(uint32_t)); + } + } - } // end loop over write_rank + if( CONFIG::MPI_task_rank == 0 ){ + pofs->close(); + delete pofs; + } music::ilog << interface_name_ << " : Wrote field to file \'" << file_name << "\'" << std::endl; } From 2d5cc0ac502f77a9cbf0c5b2381805c0c7a982d6 Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Fri, 7 May 2021 14:30:28 +0100 Subject: [PATCH 04/25] Updated code with new panphasia_ho versions the allow the Fourier grid to be a multiple of the grid of Panphasia cells. Fixed a few format statements. 
--- .../high_order_panphasia_routines.c | 55 +- external/panphasia_ho/main.c | 13 +- external/panphasia_ho/pan_mpi_routines.c | 469 ++++++++++++------ external/panphasia_ho/panphasia_functions.h | 4 +- 4 files changed, 370 insertions(+), 171 deletions(-) diff --git a/external/panphasia_ho/high_order_panphasia_routines.c b/external/panphasia_ho/high_order_panphasia_routines.c index d1d369b..c2c471a 100644 --- a/external/panphasia_ho/high_order_panphasia_routines.c +++ b/external/panphasia_ho/high_order_panphasia_routines.c @@ -621,10 +621,11 @@ int demo_descriptor_() char desc_name[100]; char desc_iden[8]; int error_code; + int pan_mode; descriptor_read_in = 0; - if (error_code = parse_and_validate_descriptor_(str)) + if (error_code = parse_and_validate_descriptor_(str,&pan_mode)) { printf("Invalid descriptor %s\n", str); @@ -756,11 +757,13 @@ int PANPHASIA_init_descriptor_(char *descriptor, int *verbose) set_panphasia_key_(verb); check_panphasia_key_(verb); - if (error = parse_and_validate_descriptor_(descriptor)) + int pan_mode; + if (error = parse_and_validate_descriptor_(descriptor,&pan_mode)) { printf("-----------------------------------------\n"); printf("Error initating start-up Panphasia routines \n"); printf("Error code %d\n", error); + printf("pan_mode %d\n", pan_mode); printf("-----------------------------------------\n"); abort(); }; @@ -907,12 +910,9 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta if (*zstart >= rel_coord_max) return (203); - if ((*xextent > rel_coord_max) || (*xextent == 0)) - return (204); - if ((*yextent > rel_coord_max) || (*yextent == 0)) - return (205); - if ((*zextent > rel_coord_max) || (*zextent == 0)) - return (206); + if (*xextent > rel_coord_max) return (204); + if (*yextent > rel_coord_max) return (205); + if (*zextent > rel_coord_max) return (206); if ((*ncopy < 0) || (*ncopy > Nbasis)) return (207); @@ -920,6 +920,8 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, 
size_t *zsta if ((copy_list[0] < 0) || (copy_list[*ncopy - 1] >= Nbasis)) return (208); + if ((*xextent==0)||(*yextent==0)||(*zextent==0)) return(0); + for (int i = 1; i < *ncopy; i++) if (copy_list[i] <= copy_list[i - 1]) return (209); @@ -1160,8 +1162,8 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta //======================================================================================== { - FFTW_REAL *ptr_real = output_values; - FFTW_COMPLEX *ptr_cmplx = output_values; + PAN_REAL *ptr_real = output_values; + PAN_COMPLEX *ptr_cmplx = output_values; size_t zdimension = (*flag_output_mode == 2) ? *zextent + 2 : *zextent; // For R2C pad by two in z-dimension //printf("zdimension = %ld\n",zdimension); @@ -1183,7 +1185,7 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta { for (size_t i = 0; i < *ncopy; i++) - ptr_cmplx[out_v_index + i] = (FFTW_COMPLEX)working_space[index + copy_list[i]]; + ptr_cmplx[out_v_index + i] = (PAN_COMPLEX)working_space[index + copy_list[i]]; } else { @@ -1233,7 +1235,7 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta //====================================================================================== //====================================================================================== -int parse_and_validate_descriptor_(char *descriptor) +int parse_and_validate_descriptor_(char *descriptor, int *pan_mode) { char *token; @@ -1302,12 +1304,12 @@ int parse_and_validate_descriptor_(char *descriptor) if (kk_limit_set == 0) { - sprintf(descriptor_as_read, "[Panph%d,L%llu,(%llu,%llu,%llu),S%llu,CH%lld,%s]", + sprintf(descriptor_as_read, "[Panph%llu,L%llu,(%llu,%llu,%llu),S%llu,CH%lld,%s]", desc_order, desc_level, desc_x, desc_y, desc_z, desc_size, desc_ch, desc_name); } else { - sprintf(descriptor_as_read, "[Panph%d,L%llu,(%llu,%llu,%llu),S%llu,KK%lld,CH%lld,%s]", + sprintf(descriptor_as_read, 
"[Panph%llu,L%llu,(%llu,%llu,%llu),S%llu,KK%lld,CH%lld,%s]", desc_order, desc_level, desc_x, desc_y, desc_z, desc_size, desc_kk_limit, desc_ch, desc_name); } @@ -1332,6 +1334,8 @@ int parse_and_validate_descriptor_(char *descriptor) strcpy(full_descriptor, descriptor); descriptor_read_in = 1; + *pan_mode = (desc_order==1)? 0:1; // 0 - Old descriptor: 1 HO descriptor + comp_ch = compute_check_digit_(); // check the check digit if ((desc_ch != -999) && (desc_ch != comp_ch)) @@ -1643,16 +1647,15 @@ void test_cell_moments(char root_descriptor[200], size_t rel_lev, size_t rel_ori void integrate_cell(int ix, int iy, int iz, size_t xextent, size_t yextent, size_t zextent, FFTW_REAL *output_values, double *results) { - /*///////////////////////////////////////////////////////////////////////////// - - This function computes the integral over a cell of the product of the -Panphasia field with an 'analysing' Legendre polynomial. As the -integrand is a polynomial, Gaussian quadrature can be used for -integration as it is exact up to rounding error provide p_order -is less than 10. - -/*/ - ///////////////////////////////////////////////////////////////////////////*/ +///////////////////////////////////////////////////////////////////////////// +// +// This function computes the integral over a cell of the product of the +// Panphasia field with an 'analysing' Legendre polynomial. As the +// integrand is a polynomial, Gaussian quadrature can be used for +// integration as it is exact up to rounding error provide p_order +// is less than 10. 
+// +///////////////////////////////////////////////////////////////////////////// const double GQ_weights[5] = {0.2955242247147529, 0.2692667193099963, 0.2190863625159820, 0.1494513491505806, @@ -1830,14 +1833,14 @@ void compute_sph_bessel_coeffs(int nfft, int pmax, int n4dimen, int fdim, double const double pi = 4.0 * atan(1.0); for (int l = 0; l <= pmax; l++) { - double norm = sqrt((double)(2 * l + 1) * fdim); + double norm = sqrt((double)(2 * l + 1)); double complex phase_shift = cpow(-I, l); for (int i = 0; i < nfft; i++) { int j = (i <= nfft / 2) ? i : i - nfft; int k = abs(j); double sign = (j < 0) ? pow(-1.0, l) : 1.0; - double x = pi * (double)k / (double)(nfft * fdim); + double x = pi*(double)fdim*(double)k/(double)nfft; double result; spherical_bessel_(&l, &x, &result); diff --git a/external/panphasia_ho/main.c b/external/panphasia_ho/main.c index b9aa590..641be67 100644 --- a/external/panphasia_ho/main.c +++ b/external/panphasia_ho/main.c @@ -24,6 +24,8 @@ int PANPHASIA_HO_main(void) int error; size_t x0 = 0, y0 = 0, z0 = 0; size_t rel_level; + int fdim=1; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid + char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; PANPHASIA_init_descriptor_(descriptor, &verbose); @@ -40,9 +42,9 @@ int PANPHASIA_HO_main(void) ptrdiff_t alloc_local, local_n0, local_0_start; - ptrdiff_t N0 = descriptor_base_size << rel_level; + ptrdiff_t N0 = fdim*(descriptor_base_size << rel_level); - alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0, MPI_COMM_WORLD, &local_n0, &local_0_start); + alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 +2 , MPI_COMM_WORLD, &local_n0, &local_0_start); FFTW_COMPLEX *Panphasia_White_Noise_Field; @@ -54,6 +56,8 @@ int PANPHASIA_HO_main(void) }; fftw_free(Panphasia_White_Noise_Field); + + return(0); } #ifdef STANDALONE_PANPHASIA_HO @@ -108,7 +112,7 @@ int main(int argc, char **argv) ptrdiff_t N0 = descriptor_base_size << rel_level; - 
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0, MPI_COMM_WORLD, &local_n0, &local_0_start); + alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0+2, MPI_COMM_WORLD, &local_n0, &local_0_start); FFTW_COMPLEX *Panphasia_White_Noise_Field; @@ -125,6 +129,9 @@ int main(int argc, char **argv) //==================== End FFTW =========================== MPI_Finalize(); + return(0); } + + #endif // STANDALONE_PANPHASIA_HO diff --git a/external/panphasia_ho/pan_mpi_routines.c b/external/panphasia_ho/pan_mpi_routines.c index 1e38c07..ff093d1 100644 --- a/external/panphasia_ho/pan_mpi_routines.c +++ b/external/panphasia_ho/pan_mpi_routines.c @@ -19,75 +19,259 @@ extern const int irank_p[3][84]; extern size_t descriptor_order; extern size_t descriptor_kk_limit; +extern size_t descriptor_base_size; - -int PANPHASIA_compute_kspace_field_(size_t relative_level, ptrdiff_t N0_grid, - ptrdiff_t local_n0_return, ptrdiff_t local_0_start_return, +int PANPHASIA_compute_kspace_field_(size_t relative_level, ptrdiff_t N0_fourier_grid, + ptrdiff_t local_n0_fourier_return, ptrdiff_t local_0_start_fourier_return, FFTW_COMPLEX *return_field) { size_t copy_list[Nbasis]; -int fdim=1; + + + + int pmax = 6; + + int nsubdivide = (pmax%2==0)?pmax+1:pmax+2; + size_t ncopy = (pmax+1)*(pmax+2)*(pmax+3)/6; -size_t xorigin=local_0_start_return, yorigin=0, zorigin=0; -size_t xextent =local_n0_return, yextent = N0_grid, zextent = N0_grid; + +if (ncopy%nsubdivide!=0) return(100010); +int nchunk = ncopy/nsubdivide; + int verbose = 1; int flag_output_mode=2; int error; -ptrdiff_t size_to_alloc; +ptrdiff_t size_to_alloc_fourier; +ptrdiff_t size_to_alloc_pan; +ptrdiff_t local_n0_fourier_xoffset; FFTW_PLAN output_coeff_forward_plan; +ptrdiff_t N0_pan_grid = descriptor_base_size<descriptor_order) return(100000); +if (N0_fourier_grid%N0_pan_grid!=0) return (100015); + +int fdim = N0_fourier_grid/N0_pan_grid; +size_t nfft_dim = N0_fourier_grid; +size_t npan_dim = N0_pan_grid; + +int SHARED_FOUR_PAN_SPACE = 
(nsubdivide==1)&&(fdim==1)&&(sizeof(PAN_REAL)==sizeof(FFTW_REAL)); + + + +//////////////////////////////////////////////////////////////////////////////////// + +if (pmax>descriptor_order) return(100020); for (size_t i=0; infft_dim/2) ? - ix + local_0_start_return - nfft_dim : ix + local_0_start_return; + index1 = ix*N0_fourier_grid*(N0_fourier_grid/2+1) + iy*(N0_fourier_grid/2+1) + iz; + kx = (ix+local_0_start_fourier_return>nfft_dim/2) ? + ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return; ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy; kz = (iz > nfft_dim/2) ? iz-nfft_dim : iz; @@ -224,14 +396,19 @@ int m; // Set Nyquist modes to zero - not used by IC_Gen anyway. phase_shift_and_scale = 0.0; //1.0/pow((double)nfft_dim,1.5); // No phase shift }else{ - phase_shift_and_scale = - cexp( (-I)*pi*(double)(kx + ky + kz)/(double)nfft_dim)/pow((double)nfft_dim,1.5); + phase_shift_and_scale = sqrt( (double)(fdim*fdim*fdim))* + cexp( (double)fdim * (-I)*pi*(double)(kx + ky + kz)/ + (double)nfft_dim)/pow((double)nfft_dim,1.5); }; return_field[index1] *= phase_shift_and_scale; + if (ptr_mode_weightings[index1]nfft_dim/2) ? - ix + local_0_start_return - nfft_dim : ix + local_0_start_return; + kx = (ix+local_0_start_fourier_return>nfft_dim/2) ? + ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return; ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy; kz = (iz > nfft_dim/2) ? 
iz-nfft_dim : iz; ksquared = kx*kx + ky*ky + kz*kz; if (ksquared<=descriptor_kk_limit){ - index1 = ix*N0_grid*(N0_grid/2+1) + iy*(N0_grid/2+1) + iz; + index1 = ix*N0_fourier_grid*(N0_fourier_grid/2+1) + iy*(N0_fourier_grid/2+1) + iz; weight = cabs(return_field[index1]); return_field[index1] /= weight; }; @@ -271,65 +448,77 @@ int m; printf("Reached here 12!\n"); -if (nfft_dim <128){ - - int rank; +int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank); char filename[100]; - sprintf(filename,"output_k_space_field.%d",rank); - - int xuse,yuse,zuse; - FFTW_REAL sign; + sprintf(filename,"output_k_space_alt.%d",rank); + FILE *fp; + + + +if (nfft_dim <128){ + + FILE *fp; fp = fopen(filename,"w"); - for (int ix=0; ixnfft_dim/2){ - xuse = (nfft_dim-ix)%nfft_dim; - yuse = (nfft_dim-iy)%nfft_dim; - zuse = (nfft_dim-iz)%nfft_dim; - sign = -1.0; - }else{ - xuse = ix; - yuse = iy; - zuse = iz; - sign = 1.0; - }; - - int index = xuse*N0_grid*(N0_grid/2+1) + yuse*(N0_grid/2+1) + zuse; - fprintf(fp,"%6d%6d%6d %14.8lf %14.8lf\n",ix+local_0_start_return,iy,iz, - creal(return_field[index]),cimag(sign*return_field[index])); + int index = ix*N0_fourier_grid*(N0_fourier_grid/2+1) + iy*(N0_fourier_grid/2+1) + iz; + fprintf(fp,"%6ld%6d%6d %14.8lf %14.8lf %14.8lf \n",ix+local_0_start_fourier_return,iy,iz, + creal(return_field[index]),cimag(return_field[index]),sqrt(ptr_mode_weightings[index])); + // ptr_mode_weightings[index]); }; fclose(fp); - }; + }else{ + + fp = fopen(filename,"w"); + + for (int ix=0; ix Date: Tue, 11 May 2021 11:53:17 +0200 Subject: [PATCH 05/25] changed a char* to const char* --- external/panphasia_ho/high_order_panphasia_routines.c | 2 +- external/panphasia_ho/panphasia_functions.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/external/panphasia_ho/high_order_panphasia_routines.c b/external/panphasia_ho/high_order_panphasia_routines.c index c2c471a..054194d 100644 --- a/external/panphasia_ho/high_order_panphasia_routines.c +++ 
b/external/panphasia_ho/high_order_panphasia_routines.c @@ -1235,7 +1235,7 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta //====================================================================================== //====================================================================================== -int parse_and_validate_descriptor_(char *descriptor, int *pan_mode) +int parse_and_validate_descriptor_(const char *descriptor, int *pan_mode) { char *token; diff --git a/external/panphasia_ho/panphasia_functions.h b/external/panphasia_ho/panphasia_functions.h index 7826146..e7622a4 100644 --- a/external/panphasia_ho/panphasia_functions.h +++ b/external/panphasia_ho/panphasia_functions.h @@ -49,7 +49,7 @@ void compute_all_properties_of_a_panphasia_cell_(size_t *level, size_t *j1, size void return_root_legendre_coefficients_(PAN_REAL *root); -int parse_and_validate_descriptor_(char *, int *); +int parse_and_validate_descriptor_(const char *, int *); int demo_descriptor_(); long long int compute_check_digit_(); int PANPHASIA_init_descriptor_(char *descriptor, int *verbose); From 69f9772cc51b0b6cd931de3984b1c119076f48ce Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Tue, 11 May 2021 11:53:40 +0200 Subject: [PATCH 06/25] added call to see if old or new descriptor is used --- example.conf | 3 ++- src/plugins/random_panphasia_ho.cc | 30 ++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/example.conf b/example.conf index 00abeeb..0f124a2 100644 --- a/example.conf +++ b/example.conf @@ -97,7 +97,8 @@ ztarget = 2.5 # target redshift for CLASS module, output at ## requirements by registering on the website http://icc.dur.ac.uk/Panphasia.php generator = PANPHASIA_HO -descriptor = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL] +#descriptor = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL] +descriptor = [Panph6,L20,(424060,82570,148256),S1,CH-999,Auriga_100_vol2] # PanphasiaMinRootResolution = 512 # 
requires the white noise reallisation to be made at least at that resolution (default is 512) ##> The MUSIC1 multi-scale random number generator is provided for convenience diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index 99d36a5..350ae2a 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -41,6 +41,7 @@ extern "C"{ int PANPHASIA_HO_main( void ); + int parse_and_validate_descriptor_(const char *, int *); } @@ -50,12 +51,12 @@ private: protected: std::string descriptor_string_; int num_threads_; - + int panphasia_mode_; + size_t grid_res_; public: explicit RNG_panphasia_ho(config_file &cf) : RNG_plugin(cf) { - descriptor_string_ = pcf_->get_value("random", "descriptor"); #ifdef _OPENMP num_threads_ = omp_get_max_threads(); @@ -63,10 +64,31 @@ public: num_threads_ = 1; #endif - PANPHASIA_HO_main(); + descriptor_string_ = pcf_->get_value("random", "descriptor"); + grid_res_ = pcf_->get_value("setup","GridRes"); + + panphasia_mode_ = 0; + parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_); + + if( panphasia_mode_ == 0 ){ + std::cout << "PANPHASIA: Old descriptor" << std::endl; + }else if( panphasia_mode_ == 1 ){ + std::cout << "PANPHASIA: New descriptor" << std::endl; + PANPHASIA_HO_main(); + }else{ + std::cout << "PANPHASIA: Something went wrong with descriptor" << std::endl; + abort(); + } } - ~RNG_panphasia_ho() { } + ~RNG_panphasia_ho() + { + if( panphasia_mode_ == 0) // old + { + } + + + } bool isMultiscale() const { return true; } From 974900dc81d0727def4c1cdd7077286d43056149 Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Tue, 11 May 2021 12:26:18 +0100 Subject: [PATCH 07/25] Added define statements for USE_PRECISION_FLOAT to make both the FFTW and Panphasia routines either both single precision or both double precision. 
:wq --- external/panphasia_ho/PAN_FFTW3.h | 8 ++++++++ external/panphasia_ho/panphasia_functions.h | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/external/panphasia_ho/PAN_FFTW3.h b/external/panphasia_ho/PAN_FFTW3.h index 687d43a..c4d0465 100644 --- a/external/panphasia_ho/PAN_FFTW3.h +++ b/external/panphasia_ho/PAN_FFTW3.h @@ -1,6 +1,14 @@ // Define macros for FFTW3 to allow swapping // between single/double precision FTs +#ifndef USE_PRECISION_FLOAT + +#define FOURIER_DOUBLE + +#endif + + + #define FOURIER_DOUBLE #ifdef FOURIER_DOUBLE diff --git a/external/panphasia_ho/panphasia_functions.h b/external/panphasia_ho/panphasia_functions.h index e7622a4..77327cb 100644 --- a/external/panphasia_ho/panphasia_functions.h +++ b/external/panphasia_ho/panphasia_functions.h @@ -3,8 +3,10 @@ // By default Panphasia is computed at single // precision. To override this define PAN_DOUBLE -//#define PAN_DOUBLE_PRECISION 8 +#ifndef USE_PRECISION_FLOAT +#define PAN_DOUBLE_PRECISION 8 +#endif #ifndef PAN_DOUBLE_PRECISION From 5a7aeffabe531fe146142454b07efcfefe16121e Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Tue, 11 May 2021 14:51:53 +0100 Subject: [PATCH 08/25] Edited main.c to receive the descriptor and particle load grid, and to choose the correct relative level for the Panphasia field. 
--- .../panphasia_ho/high_order_panphasia_routines.c | 2 +- external/panphasia_ho/main.c | 13 ++++++++++--- external/panphasia_ho/panphasia_functions.h | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/external/panphasia_ho/high_order_panphasia_routines.c b/external/panphasia_ho/high_order_panphasia_routines.c index 054194d..672c575 100644 --- a/external/panphasia_ho/high_order_panphasia_routines.c +++ b/external/panphasia_ho/high_order_panphasia_routines.c @@ -729,7 +729,7 @@ int demo_descriptor_() return (0); }; -int PANPHASIA_init_descriptor_(char *descriptor, int *verbose) +int PANPHASIA_init_descriptor_(const char *descriptor, int *verbose) { int error; int verb; diff --git a/external/panphasia_ho/main.c b/external/panphasia_ho/main.c index 641be67..d9c5400 100644 --- a/external/panphasia_ho/main.c +++ b/external/panphasia_ho/main.c @@ -18,7 +18,7 @@ int number_omp_threads = 1; #endif // does the same as the main below, but does not initialise MPI or FFTW (this should be done in MONOFONIC) -int PANPHASIA_HO_main(void) +int PANPHASIA_HO_main(const char *descriptor, size_t *ngrid_load) { int verbose = 0; int error; @@ -26,11 +26,18 @@ int PANPHASIA_HO_main(void) size_t rel_level; int fdim=1; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid - char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; + //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; PANPHASIA_init_descriptor_(descriptor, &verbose); - rel_level = 6; //Set size of test dataset + printf("Descriptor %s\n ngrid_load %llu\n",descriptor,*ngrid_load); + + // Choose smallest value of level to equal of exceed *ngrid_load) + + for (rel_level=0; fdim*(descriptor_base_size<<(rel_level+1))<=*ngrid_load; rel_level++); + + printf("Setting relative level = %llu\n",rel_level); + if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose)) { diff --git 
a/external/panphasia_ho/panphasia_functions.h b/external/panphasia_ho/panphasia_functions.h index 77327cb..01a579e 100644 --- a/external/panphasia_ho/panphasia_functions.h +++ b/external/panphasia_ho/panphasia_functions.h @@ -54,7 +54,7 @@ void return_root_legendre_coefficients_(PAN_REAL *root); int parse_and_validate_descriptor_(const char *, int *); int demo_descriptor_(); long long int compute_check_digit_(); -int PANPHASIA_init_descriptor_(char *descriptor, int *verbose); +int PANPHASIA_init_descriptor_(const char *descriptor, int *verbose); int PANPHASIA_init_level_(size_t *oct_level, size_t *rel_orig_x, size_t *rel_orig_y,size_t *rel_orig_z,int *verbose); From a787014b5fcadf7c276709feba3d051d52d83f63 Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Tue, 11 May 2021 14:55:51 +0100 Subject: [PATCH 09/25] Edit plugin for panphasia_ho to pass descriptor and particle load grid. Please enter the commit message for your changes. Lines starting --- src/plugins/random_panphasia_ho.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index 350ae2a..41fbeb7 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -40,7 +40,7 @@ #include extern "C"{ - int PANPHASIA_HO_main( void ); + int PANPHASIA_HO_main( const char *descriptor, size_t *ngrid); int parse_and_validate_descriptor_(const char *, int *); } @@ -74,7 +74,7 @@ public: std::cout << "PANPHASIA: Old descriptor" << std::endl; }else if( panphasia_mode_ == 1 ){ std::cout << "PANPHASIA: New descriptor" << std::endl; - PANPHASIA_HO_main(); + PANPHASIA_HO_main(descriptor_string_.c_str(),&grid_res_); }else{ std::cout << "PANPHASIA: Something went wrong with descriptor" << std::endl; abort(); @@ -102,4 +102,4 @@ namespace { RNG_plugin_creator_concrete creator("PANPHASIA_HO"); } -#endif // defined(USE_PANPHASIA_HO) \ No newline at end of file +#endif // defined(USE_PANPHASIA_HO) From 
0ffa733344a237b09171ddba961a6db2cf9214f7 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Fri, 14 May 2021 13:57:57 +0200 Subject: [PATCH 10/25] moved all routines from main.c to the plugin, added external as included path --- CMakeLists.txt | 4 +- external/panphasia_ho/panphasia_functions.h | 8 +- src/plugins/random_panphasia_ho.cc | 83 ++++++++++++++++----- 3 files changed, 73 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b6b30f..f0c801f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,7 +153,7 @@ endif(ENABLE_PANPHASIA) option(ENABLE_PANPHASIA_HO "Enable PANPHASIA-HO random number generator" ON) ######################################################################################################################## # INCLUDES -include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/external) # SOURCES # get all the *.cc files in the subfolders @@ -289,4 +289,4 @@ if(ENABLE_GENERICIO) include(${CMAKE_CURRENT_SOURCE_DIR}/external/genericio.cmake) target_link_libraries(${PRGNAME} PRIVATE genericio::genericio_mpi) target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_GENERICIO") -endif() \ No newline at end of file +endif() diff --git a/external/panphasia_ho/panphasia_functions.h b/external/panphasia_ho/panphasia_functions.h index 01a579e..852f5c8 100644 --- a/external/panphasia_ho/panphasia_functions.h +++ b/external/panphasia_ho/panphasia_functions.h @@ -3,6 +3,7 @@ // By default Panphasia is computed at single // precision. 
To override this define PAN_DOUBLE +#pragma once #ifndef USE_PRECISION_FLOAT #define PAN_DOUBLE_PRECISION 8 @@ -21,7 +22,7 @@ #endif - +#include "PAN_FFTW3.h" ///////////////////////////////////////////////////////////////////// @@ -89,7 +90,10 @@ int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinate - +#ifdef __cplusplus +void compute_sph_bessel_coeffs(int, int, int, int, std::complex* *); +#else void compute_sph_bessel_coeffs(int, int, int, int, double complex *); +#endif int PANPHASIA_compute_kspace_field_(size_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, FFTW_COMPLEX *); diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index 41fbeb7..85f7280 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -2,17 +2,17 @@ // A software package to generate ICs for cosmological simulations // Copyright (C) 2021 by Oliver Hahn and Adrian Jenkins (this file) // but see distinct licensing for PANPHASIA below -// +// // monofonIC is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. -// +// // monofonIC is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. -// +// // You should have received a copy of the GNU General Public License // along with this program. If not, see . 
// @@ -39,12 +39,12 @@ #include -extern "C"{ - int PANPHASIA_HO_main( const char *descriptor, size_t *ngrid); - int parse_and_validate_descriptor_(const char *, int *); +extern "C" +{ + #include + extern size_t descriptor_base_size; } - class RNG_panphasia_ho : public RNG_plugin { private: @@ -65,36 +65,83 @@ public: #endif descriptor_string_ = pcf_->get_value("random", "descriptor"); - grid_res_ = pcf_->get_value("setup","GridRes"); + grid_res_ = pcf_->get_value("setup", "GridRes"); panphasia_mode_ = 0; parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_); - if( panphasia_mode_ == 0 ){ + if (panphasia_mode_ == 0) + { std::cout << "PANPHASIA: Old descriptor" << std::endl; - }else if( panphasia_mode_ == 1 ){ + } + else if (panphasia_mode_ == 1) + { std::cout << "PANPHASIA: New descriptor" << std::endl; - PANPHASIA_HO_main(descriptor_string_.c_str(),&grid_res_); - }else{ + + int verbose = 0; + int error; + size_t x0 = 0, y0 = 0, z0 = 0; + size_t rel_level; + int fdim = 1; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid + + //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; + + PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose); + + printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_); + + // Choose smallest value of level to equal of exceed grid_res_) + + for (rel_level = 0; fdim * (descriptor_base_size << (rel_level + 1)) <= grid_res_; rel_level++) + ; + + printf("Setting relative level = %lu\n", rel_level); + + if ((error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))) + { + printf("Abort: PANPHASIA_init_level_ :error code %d\n", error); + abort(); + }; + + //======================= FFTW ============================== + + ptrdiff_t alloc_local, local_n0, local_0_start; + + ptrdiff_t N0 = fdim * (descriptor_base_size << rel_level); + + alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 + 2, MPI_COMM_WORLD, 
&local_n0, &local_0_start); + + FFTW_COMPLEX *Panphasia_White_Noise_Field; + + Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local); + + if ((error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))) + { + printf("Error code from PANPHASIA_compute ... %d\n", error); + }; + + fftw_free(Panphasia_White_Noise_Field); + + // PANPHASIA_HO_main(descriptor_string_.c_str(),&grid_res_); + } + else + { std::cout << "PANPHASIA: Something went wrong with descriptor" << std::endl; abort(); } } - ~RNG_panphasia_ho() - { - if( panphasia_mode_ == 0) // old + ~RNG_panphasia_ho() + { + if (panphasia_mode_ == 0) // old { } - - } bool isMultiscale() const { return true; } void Fill_Grid(Grid_FFT &g) { - } }; From 8aec69634087c1fde7bae5638c00fcd828c00581 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Fri, 14 May 2021 15:42:04 +0200 Subject: [PATCH 11/25] fixed memset out of bounds (missing factor 1/2) --- external/panphasia_ho/pan_mpi_routines.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/external/panphasia_ho/pan_mpi_routines.c b/external/panphasia_ho/pan_mpi_routines.c index ff093d1..88f67de 100644 --- a/external/panphasia_ho/pan_mpi_routines.c +++ b/external/panphasia_ho/pan_mpi_routines.c @@ -286,8 +286,7 @@ printf("Plan completed ... 
\n"); ////////////////////////////////////////////////////////////////////////// //---------------------------------------------------------------------------------- - -memset(return_field, 0, local_n0_fourier_return*N0_fourier_grid *(N0_fourier_grid +2) * sizeof(FFTW_COMPLEX)); +memset(return_field, 0, local_n0_fourier_return*N0_fourier_grid *(N0_fourier_grid +2)/2 * sizeof(FFTW_COMPLEX)); for (int iter = 0; iter < nsubdivide; iter++){ From 6a40a29edae6c845346e8d7fcd24315638b8d388 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Fri, 14 May 2021 15:57:31 +0200 Subject: [PATCH 12/25] PANPHASIA_HO plugin now executes calls formerly in main.c on a Grid_FFT object, result is Fourier interpolated back to size required by monofonic, modes are passed through to LPT module, not tested yet --- src/plugins/random_panphasia_ho.cc | 142 ++++++++++++++++------------- 1 file changed, 78 insertions(+), 64 deletions(-) diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index 85f7280..80a0e6a 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -41,7 +41,7 @@ extern "C" { - #include +#include extern size_t descriptor_base_size; } @@ -53,6 +53,7 @@ protected: int num_threads_; int panphasia_mode_; size_t grid_res_; + real_t boxlength_; public: explicit RNG_panphasia_ho(config_file &cf) : RNG_plugin(cf) @@ -66,85 +67,98 @@ public: descriptor_string_ = pcf_->get_value("random", "descriptor"); grid_res_ = pcf_->get_value("setup", "GridRes"); + boxlength_ = pcf_->get_value("setup", "BoxLength"); panphasia_mode_ = 0; parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_); - - if (panphasia_mode_ == 0) - { - std::cout << "PANPHASIA: Old descriptor" << std::endl; - } - else if (panphasia_mode_ == 1) - { - std::cout << "PANPHASIA: New descriptor" << std::endl; - - int verbose = 0; - int error; - size_t x0 = 0, y0 = 0, z0 = 0; - size_t rel_level; - int fdim = 1; //Option to scale Fourier grid 
dimension relative to Panphasia coefficient grid - - //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; - - PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose); - - printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_); - - // Choose smallest value of level to equal of exceed grid_res_) - - for (rel_level = 0; fdim * (descriptor_base_size << (rel_level + 1)) <= grid_res_; rel_level++) - ; - - printf("Setting relative level = %lu\n", rel_level); - - if ((error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))) - { - printf("Abort: PANPHASIA_init_level_ :error code %d\n", error); - abort(); - }; - - //======================= FFTW ============================== - - ptrdiff_t alloc_local, local_n0, local_0_start; - - ptrdiff_t N0 = fdim * (descriptor_base_size << rel_level); - - alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 + 2, MPI_COMM_WORLD, &local_n0, &local_0_start); - - FFTW_COMPLEX *Panphasia_White_Noise_Field; - - Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local); - - if ((error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))) - { - printf("Error code from PANPHASIA_compute ... 
%d\n", error); - }; - - fftw_free(Panphasia_White_Noise_Field); - - // PANPHASIA_HO_main(descriptor_string_.c_str(),&grid_res_); - } - else - { - std::cout << "PANPHASIA: Something went wrong with descriptor" << std::endl; - abort(); - } } ~RNG_panphasia_ho() { - if (panphasia_mode_ == 0) // old - { - } } bool isMultiscale() const { return true; } + void Run_Panphasia_Highorder(Grid_FFT &g); + void Fill_Grid(Grid_FFT &g) { + switch( panphasia_mode_ ){ + + case 0: // old mode + music::ilog << "PANPHASIA: Old descriptor" << std::endl; + break; + + case 1: // PANPHASIA HO descriptor + music::ilog << "PANPHASIA: New descriptor" << std::endl; + this->Run_Panphasia_Highorder( g ); + break; + + default: // unknown PANPHASIA mode + music::elog << "PANPHASIA: Something went wrong with descriptor" << std::endl; + abort(); + break; + } } }; +void RNG_panphasia_ho::Run_Panphasia_Highorder(Grid_FFT &g) +{ + int verbose = 0; + int error; + size_t x0 = 0, y0 = 0, z0 = 0; + size_t rel_level; + int fdim = 1; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid + + //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; + + PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose); + + printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_); + + // Choose smallest value of level to equal of exceed grid_res_) + + for (rel_level = 0; fdim * (descriptor_base_size << (rel_level + 1)) <= grid_res_; rel_level++) + ; + + printf("Setting relative level = %lu\n", rel_level); + + if ((error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))) + { + printf("Abort: PANPHASIA_init_level_ :error code %d\n", error); + abort(); + }; + + //======================= FFTW ============================== + + ptrdiff_t alloc_local, local_n0, local_0_start; + + size_t N0 = fdim * (descriptor_base_size << rel_level); + + alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 + 2, MPI_COMM_WORLD, &local_n0, 
&local_0_start); + + Grid_FFT pan_grid({{N0, N0, N0}}, {{boxlength_, boxlength_, boxlength_}}); + + assert(pan_grid.n_[0] == N0); + assert(pan_grid.n_[1] == N0); + assert(pan_grid.n_[2] == N0); + assert(pan_grid.local_0_start_ == local_0_start); + assert(pan_grid.local_0_size_ == local_n0); + assert(pan_grid.ntot_ == alloc_local); + + pan_grid.FourierTransformForward(false); + + FFTW_COMPLEX *Panphasia_White_Noise_Field = reinterpret_cast(&pan_grid.data_[0]); + + // Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local); + + if ((error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))) + { + music::elog << "Error code from PANPHASIA_compute ... (ErrCode = " << error << ")" << std::endl; + }; + + pan_grid.FourierInterpolateCopyTo( g ); +} namespace { RNG_plugin_creator_concrete creator("PANPHASIA_HO"); From 316b187166bdad2e404bcd9ed7a37267a09bbe8c Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Fri, 14 May 2021 22:42:04 +0200 Subject: [PATCH 13/25] added old PANPHASIA to the new plugin, which can now deal with both old and new descriptors --- src/plugins/PANPHASIA.hh | 498 +++++++++++++++++++++++++++++ src/plugins/random_panphasia_ho.cc | 62 ++-- 2 files changed, 538 insertions(+), 22 deletions(-) create mode 100644 src/plugins/PANPHASIA.hh diff --git a/src/plugins/PANPHASIA.hh b/src/plugins/PANPHASIA.hh new file mode 100644 index 0000000..8ebe5b0 --- /dev/null +++ b/src/plugins/PANPHASIA.hh @@ -0,0 +1,498 @@ +#pragma once + +namespace PANPHASIA1 +{ + const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim; + typedef int rand_offset_[5]; + typedef struct + { + int state[133]; // Nstore = Nstate (=5) + Nbatch (=128) + int need_fill; + int pos; + } rand_state_; + + typedef struct + { + int base_state[5], base_lev_start[5][maxdim + 1]; + rand_offset_ poweroffset[maxpow + 1], superjump; + rand_state_ current_state[maxpow + 2]; + + int layer_min, layer_max, indep_field; + + long long xorigin_store[2][2][2], 
yorigin_store[2][2][2], zorigin_store[2][2][2]; + int lev_common, layer_min_store, layer_max_store; + long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store, + iy_rel_store, iz_rel_store; + double exp_coeffs[8][8][maxdim + 2]; + long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1]; + int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2]; + + double cell_data[9][8]; + int ixh_last, iyh_last, izh_last; + int init; + + int init_cell_props; + int init_lecuyer_state; + long long p_xcursor[62], p_ycursor[62], p_zcursor[62]; + + } pan_state_; + + extern "C" + { + void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose); + + void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1, + int16_t *side2, int16_t *side3, int32_t *check_int, char *name); + + void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop); + + void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min, + int *layer_max, int *indep_field, double *cell_prop); + + void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel, + long long *iy_rel, long long *iz_rel, int *VERBOSE); + } + + struct RNG + { + + config_file *pcf_; + + struct panphasia_descriptor + { + int16_t wn_level_base; + int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base; + int16_t i_base, i_base_y, i_base_z; + int32_t check_rand; + std::string name; + + explicit panphasia_descriptor(std::string dstring) + { + char tmp[100]; + std::memset(tmp, ' ', 100); + parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, &i_yorigin_base, &i_zorigin_base, &i_base, + &i_base_y, &i_base_z, &check_rand, tmp); + for (int i = 0; i < 100; i++) + if (tmp[i] == ' ') + { + tmp[i] = '\0'; + break; + } + name = tmp; + 
name.erase(std::remove(name.begin(), name.end(), ' '), name.end()); + } + }; + + // greatest common divisor + int gcd(int a, int b) + { + if (b == 0) + return a; + return gcd(b, a % b); + } + + // least common multiple + int lcm(int a, int b) { return abs(a * b) / gcd(a, b); } + + // Two or largest power of 2 less than the argument + int largest_power_two_lte(int b) + { + int a = 1; + if (b <= a) + return a; + while (2 * a < b) + a = 2 * a; + return a; + } + + std::string descriptor_string_; + int num_threads_; + int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_; + bool incongruent_fields_; + double inter_grid_phase_adjustment_; + // double translation_phase_; + pan_state_ *lstate; + int grid_p_; + int coordinate_system_shift_[3]; + int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_; + + void clear_panphasia_thread_states(void) + { + for (int i = 0; i < num_threads_; ++i) + { + lstate[i].init = 0; + lstate[i].init_cell_props = 0; + lstate[i].init_lecuyer_state = 0; + } + } + + void initialize_for_grid_structure(void) + { + // if ngrid is not a multiple of i_base, then we need to enlarge and then sample down + ngrid_ = pcf_->get_value("setup", "GridRes"); + int ngridminsize_panphasia = pcf_->get_value_safe("random", "PanphasiaMinRootResolution", 512); + + grid_p_ = pdescriptor_->i_base; + + lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0); + // lmin + + ngrid_panphasia_ = (1 << lextra_) * grid_p_; + + while (ngrid_panphasia_ < ngridminsize_panphasia) + { + lextra_++; + ngrid_panphasia_ *= 2; + } + assert(ngrid_panphasia_ >= ngridminsize_panphasia); + + clear_panphasia_thread_states(); + + music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)", ngrid_panphasia_, lextra_); + if (ngridminsize_panphasia < 512) + music::ilog.Print("PANPHASIA WARNING: PanphasiaMinRootResolution = %d below minimum recommended of 512", ngridminsize_panphasia); + music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, 
ngrid_panphasia_); + + coordinate_system_shift_[0] = -pcf_->get_value_safe("setup", "shift_x", 0); + coordinate_system_shift_[1] = -pcf_->get_value_safe("setup", "shift_y", 0); + coordinate_system_shift_[2] = -pcf_->get_value_safe("setup", "shift_z", 0); + } + + std::unique_ptr pdescriptor_; + + RNG(config_file *pcf) + : pcf_(pcf) + { + descriptor_string_ = pcf_->get_value("random", "descriptor"); + +#ifdef _OPENMP + num_threads_ = omp_get_max_threads(); +#else + num_threads_ = 1; +#endif + + // create independent state descriptions for each thread + lstate = new pan_state_[num_threads_]; + + // parse the descriptor for its properties + pdescriptor_ = std::make_unique(descriptor_string_); + + music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base); + + // write panphasia base size into config file for the grid construction + // as the gridding unit we use the least common multiple of 2 and i_base + std::stringstream ss; + //ARJ ss << lcm(2, pdescriptor_->i_base); + //ss << two_or_largest_power_two_less_than(pdescriptor_->i_base);//ARJ + ss << 2; //ARJ - set gridding unit to two + pcf_->insert_value("setup", "gridding_unit", ss.str()); + ss.str(std::string()); + ss << pdescriptor_->i_base; + pcf_->insert_value("random", "base_unit", ss.str()); + + this->initialize_for_grid_structure(); + } + + ~RNG() { delete[] lstate; } + + void Fill(Grid_FFT &g) + { + auto sinc = [](real_t x) + { return (std::fabs(x) > 1e-16) ? std::sin(x) / x : 1.0; }; + auto dsinc = [](real_t x) + { return (std::fabs(x) > 1e-16) ? 
(x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; }; + const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)}; + + // we will overwrite 'g', we can deallocate it while we prepare the panphasia field + g.reset(); + + clear_panphasia_thread_states(); + + // temporaries + Grid_FFT g0({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_); + Grid_FFT g1({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_); + Grid_FFT g2({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_); + Grid_FFT g3({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_); + Grid_FFT g4({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_); + + double t1 = get_wtime(); + // double tp = t1; + +#pragma omp parallel + { +#ifdef _OPENMP + const int mythread = omp_get_thread_num(); +#else + const int mythread = 0; +#endif + + //int odd_x, odd_y, odd_z; + //int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level + + int verbosity = (mythread == 0); + char descriptor[100]; + std::memset(descriptor, 0, 100); + std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size()); + + start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity); + + { + panphasia_descriptor d(descriptor_string_); + + int level_p = d.wn_level_base + lextra_; + + lstate[mythread].layer_min = 0; + lstate[mythread].layer_max = level_p; + lstate[mythread].indep_field = 1; + + long long ix_rel[3]; + ix_rel[0] = 0; //ileft_corner_p[0]; + ix_rel[1] = 0; //ileft_corner_p[1]; + ix_rel[2] = 0; //ileft_corner_p[2]; + + set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2], + &verbosity); + } + + if (verbosity) + t1 = get_wtime(); + + std::array cell_prop; + pan_state_ *ps = &lstate[mythread]; + +#pragma omp for //nowait + for (size_t i = 0; i < 
g0.size(0); i += 2) + { + const int ixmax(std::min(2, g0.size(0) - i)); + for (size_t j = 0; j < g0.size(1); j += 2) + { + const int iymax(std::min(2, g0.size(1) - j)); + for (size_t k = 0; k < g0.size(2); k += 2) + { + const int izmax(std::min(2, g0.size(2) - k)); + + // ARJ - added inner set of loops to speed up evaluation of Panphasia + for (int ix = 0; ix < ixmax; ++ix) + { + for (int iy = 0; iy < iymax; ++iy) + { + for (int iz = 0; iz < izmax; ++iz) + { + int ilocal = i + ix; + int jlocal = j + iy; + int klocal = k + iz; + + int iglobal = ilocal + g0.local_0_start_; + int jglobal = jlocal; + int kglobal = klocal; + + adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min, + &ps->layer_max, &ps->indep_field, &cell_prop[0]); + + g0.relem(ilocal, jlocal, klocal) = cell_prop[0]; + g1.relem(ilocal, jlocal, klocal) = cell_prop[4]; + g2.relem(ilocal, jlocal, klocal) = cell_prop[2]; + g3.relem(ilocal, jlocal, klocal) = cell_prop[1]; + g4.relem(ilocal, jlocal, klocal) = cell_prop[8]; + } + } + } + } + } + } + } // end omp parallel region + + g0.FourierTransformForward(); + g1.FourierTransformForward(); + g2.FourierTransformForward(); + g3.FourierTransformForward(); + g4.FourierTransformForward(); + +#pragma omp parallel for + for (size_t i = 0; i < g0.size(0); i++) + { + for (size_t j = 0; j < g0.size(1); j++) + { + for (size_t k = 0; k < g0.size(2); k++) + { + if (!g0.is_nyquist_mode(i, j, k)) + { + auto kvec = g0.get_k(i, j, k); + + auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0]; + auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1]; + auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2]; + + auto fx = real_t(sinc(argx)); + auto gx = ccomplex_t(0.0, dsinc(argx)); + auto fy = real_t(sinc(argy)); + auto gy = ccomplex_t(0.0, dsinc(argy)); + auto fz = real_t(sinc(argz)); + auto gz = ccomplex_t(0.0, dsinc(argz)); + + auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz); + auto magnitude = real_t(std::sqrt(1.0 - std::fabs(temp * temp))); + + 
auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k)); + + g0.kelem(i, j, k) = y0 * fx * fy * fz + sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz) + y4 * magnitude; + } + else + { + g0.kelem(i, j, k) = 0.0; + } + } + } + } + + // music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp); + // tp = get_wtime(); + + g1.FourierTransformBackward(false); + g2.FourierTransformBackward(false); + g3.FourierTransformBackward(false); + g4.FourierTransformBackward(false); + +#pragma omp parallel + { +#ifdef _OPENMP + const int mythread = omp_get_thread_num(); +#else + const int mythread = 0; +#endif + + // int odd_x, odd_y, odd_z; + int verbosity = (mythread == 0); + char descriptor[100]; + std::memset(descriptor, 0, 100); + std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size()); + + start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity); + + { + panphasia_descriptor d(descriptor_string_); + + int level_p = d.wn_level_base + lextra_; + + lstate[mythread].layer_min = 0; + lstate[mythread].layer_max = level_p; + lstate[mythread].indep_field = 1; + + long long ix_rel[3]; + ix_rel[0] = 0; //ileft_corner_p[0]; + ix_rel[1] = 0; //ileft_corner_p[1]; + ix_rel[2] = 0; //ileft_corner_p[2]; + + set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2], + &verbosity); + } + + if (verbosity) + t1 = get_wtime(); + + //*************************************************************** + // Process Panphasia values: p110, p011, p101, p111 + //**************************************************************** + std::array cell_prop; + pan_state_ *ps = &lstate[mythread]; + +#pragma omp for //nowait + for (size_t i = 0; i < g1.size(0); i += 2) + { + const int ixmax(std::min(2, g1.size(0) - i)); + for (size_t j = 0; j < g1.size(1); j += 2) + { + const int iymax(std::min(2, g1.size(1) - j)); + 
for (size_t k = 0; k < g1.size(2); k += 2) + { + const int izmax(std::min(2, g1.size(2) - k)); + + // ARJ - added inner set of loops to speed up evaluation of Panphasia + for (int ix = 0; ix < ixmax; ++ix) + { + for (int iy = 0; iy < iymax; ++iy) + { + for (int iz = 0; iz < izmax; ++iz) + { + int ilocal = i + ix; + int jlocal = j + iy; + int klocal = k + iz; + + int iglobal = ilocal + g1.local_0_start_; + int jglobal = jlocal; + int kglobal = klocal; + + adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min, + &ps->layer_max, &ps->indep_field, &cell_prop[0]); + + g1.relem(ilocal, jlocal, klocal) = cell_prop[6]; + g2.relem(ilocal, jlocal, klocal) = cell_prop[3]; + g3.relem(ilocal, jlocal, klocal) = cell_prop[5]; + g4.relem(ilocal, jlocal, klocal) = cell_prop[7]; + } + } + } + } + } + } + } // end omp parallel region + + // music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp); + // tp = get_wtime(); + + ///////////////////////////////////////////////////////////////////////// + // transform and convolve with Legendres + g1.FourierTransformForward(); + g2.FourierTransformForward(); + g3.FourierTransformForward(); + g4.FourierTransformForward(); + +#pragma omp parallel for + for (size_t i = 0; i < g0.size(0); i++) + { + for (size_t j = 0; j < g0.size(1); j++) + { + for (size_t k = 0; k < g0.size(2); k++) + { + if (!g0.is_nyquist_mode(i, j, k)) + { + auto kvec = g0.get_k(i, j, k); + + auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0]; + auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1]; + auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2]; + + auto fx = real_t(sinc(argx)); + auto gx = ccomplex_t(0.0, dsinc(argx)); + auto fy = real_t(sinc(argy)); + auto gy = ccomplex_t(0.0, dsinc(argy)); + auto fz = real_t(sinc(argz)); + auto gz = ccomplex_t(0.0, dsinc(argz)); + + auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k)); + + g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy 
* fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz; + + // do final phase shift to account for corner centered coordinates vs. cell centers + auto phase_shift = -0.5 * M_PI * (kvec[0] / g0.kny_[0] + kvec[1] / g0.kny_[1] + kvec[2] / g0.kny_[2]); + + g0.kelem(i, j, k) *= std::exp(ccomplex_t(0, phase_shift)); + } + } + } + } + + g1.reset(); + g2.reset(); + g3.reset(); + g4.reset(); + + g.allocate(); + g0.FourierInterpolateCopyTo(g); + + music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1, + 1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2)); + music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std()); + } + }; +}; \ No newline at end of file diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index 80a0e6a..5d9dc0c 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -39,12 +39,17 @@ #include -extern "C" +namespace PANPHASIA2 { + extern "C" + { #include - extern size_t descriptor_base_size; + extern size_t descriptor_base_size; + } } +#include "PANPHASIA.hh" + class RNG_panphasia_ho : public RNG_plugin { private: @@ -55,6 +60,8 @@ protected: size_t grid_res_; real_t boxlength_; + PANPHASIA1::RNG *ppan1_rng_; + public: explicit RNG_panphasia_ho(config_file &cf) : RNG_plugin(cf) { @@ -70,11 +77,20 @@ public: boxlength_ = pcf_->get_value("setup", "BoxLength"); panphasia_mode_ = 0; - parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_); + PANPHASIA2::parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_); + + if (panphasia_mode_ == 0) + { + ppan1_rng_ = new PANPHASIA1::RNG(&cf); + } } ~RNG_panphasia_ho() { + if (panphasia_mode_ == 0) + { + delete ppan1_rng_; + } } bool isMultiscale() const { return true; } @@ -83,21 +99,23 @@ public: void Fill_Grid(Grid_FFT &g) { - switch( panphasia_mode_ ){ + switch (panphasia_mode_) + { - 
case 0: // old mode - music::ilog << "PANPHASIA: Old descriptor" << std::endl; - break; + case 0: // old mode + music::ilog << "PANPHASIA: Old descriptor" << std::endl; + ppan1_rng_->Fill(g); + break; - case 1: // PANPHASIA HO descriptor - music::ilog << "PANPHASIA: New descriptor" << std::endl; - this->Run_Panphasia_Highorder( g ); - break; + case 1: // PANPHASIA HO descriptor + music::ilog << "PANPHASIA: New descriptor" << std::endl; + this->Run_Panphasia_Highorder(g); + break; - default: // unknown PANPHASIA mode - music::elog << "PANPHASIA: Something went wrong with descriptor" << std::endl; - abort(); - break; + default: // unknown PANPHASIA mode + music::elog << "PANPHASIA: Something went wrong with descriptor" << std::endl; + abort(); + break; } } }; @@ -112,18 +130,18 @@ void RNG_panphasia_ho::Run_Panphasia_Highorder(Grid_FFT &g) //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; - PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose); + PANPHASIA2::PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose); printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_); // Choose smallest value of level to equal of exceed grid_res_) - for (rel_level = 0; fdim * (descriptor_base_size << (rel_level + 1)) <= grid_res_; rel_level++) + for (rel_level = 0; fdim * (PANPHASIA2::descriptor_base_size << (rel_level + 1)) <= grid_res_; rel_level++) ; printf("Setting relative level = %lu\n", rel_level); - if ((error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))) + if ((error = PANPHASIA2::PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))) { printf("Abort: PANPHASIA_init_level_ :error code %d\n", error); abort(); @@ -133,7 +151,7 @@ void RNG_panphasia_ho::Run_Panphasia_Highorder(Grid_FFT &g) ptrdiff_t alloc_local, local_n0, local_0_start; - size_t N0 = fdim * (descriptor_base_size << rel_level); + size_t N0 = fdim * (PANPHASIA2::descriptor_base_size << rel_level); 
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 + 2, MPI_COMM_WORLD, &local_n0, &local_0_start); @@ -144,7 +162,7 @@ void RNG_panphasia_ho::Run_Panphasia_Highorder(Grid_FFT &g) assert(pan_grid.n_[2] == N0); assert(pan_grid.local_0_start_ == local_0_start); assert(pan_grid.local_0_size_ == local_n0); - assert(pan_grid.ntot_ == alloc_local); + assert(pan_grid.ntot_ == size_t(alloc_local)); pan_grid.FourierTransformForward(false); @@ -152,12 +170,12 @@ void RNG_panphasia_ho::Run_Panphasia_Highorder(Grid_FFT &g) // Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local); - if ((error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))) + if ((error = PANPHASIA2::PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))) { music::elog << "Error code from PANPHASIA_compute ... (ErrCode = " << error << ")" << std::endl; }; - pan_grid.FourierInterpolateCopyTo( g ); + pan_grid.FourierInterpolateCopyTo(g); } namespace { From c13fdc05726b1e3ed16168e49278710869866bc0 Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Mon, 17 May 2021 14:29:17 +0100 Subject: [PATCH 14/25] Added a transpose step at the bottom of pan_mpi_routines.c. Fixed an error when fdim !=1 - not currently relevant to this version, but will allow the code to use - say 1/8th as much memory for storing the Panphasia coefficients - at the cost of less accurate reproduction of the phases close to the Nyquist frequency of the Fourier grid. 
--- external/panphasia_ho/PAN_FFTW3.h | 9 +- external/panphasia_ho/pan_mpi_routines.c | 135 +++++++++++++++-------- 2 files changed, 95 insertions(+), 49 deletions(-) diff --git a/external/panphasia_ho/PAN_FFTW3.h b/external/panphasia_ho/PAN_FFTW3.h index c4d0465..4777626 100644 --- a/external/panphasia_ho/PAN_FFTW3.h +++ b/external/panphasia_ho/PAN_FFTW3.h @@ -8,9 +8,6 @@ #endif - -#define FOURIER_DOUBLE - #ifdef FOURIER_DOUBLE #define FFTW_REAL double #define FFTW_PLAN fftw_plan @@ -24,6 +21,9 @@ #define FFTW_FREE fftw_free #define FFTW_ALLOC_COMPLEX fftw_alloc_complex #define FFTW_MPI_LOCAL_SIZE_MANY fftw_mpi_local_size_many + #define FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED fftw_mpi_local_size_many_transposed + #define FFTW_MPI_PLAN_MANY_TRANSPOSE fftw_mpi_plan_many_transpose + #define FFTW_MPI_EXECUTE_R2R fftw_mpi_execute_r2r #define FFTW_PLAN_MANY_DFT fftw_plan_many_dft #define FFTW_MPI_LOCAL_SIZE_3D fftw_mpi_local_size_3d #define FFTW_MPI_PLAN_MANY_DTF fftw_mpi_plan_many_dft @@ -43,6 +43,9 @@ #define FFTW_FREE fftwf_free #define FFTW_ALLOC_COMPLEX fftwf_alloc_complex #define FFTW_MPI_LOCAL_SIZE_MANY fftwf_mpi_local_size_many + #define FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED fftwf_mpi_local_size_many_transposed + #define FFTW_MPI_PLAN_MANY_TRANSPOSE fftwf_mpi_plan_many_transpose + #define FFTW_MPI_EXECUTE_R2R fftwf_mpi_execute_r2r #define FFTW_PLAN_MANY_DFT fftwf_plan_many_dft #define FFTW_MPI_LOCAL_SIZE_3D fftwf_mpi_local_size_3d #define FFTW_MPI_PLAN_MANY_DTF fftwf_mpi_plan_many_dft diff --git a/external/panphasia_ho/pan_mpi_routines.c b/external/panphasia_ho/pan_mpi_routines.c index 88f67de..744b484 100644 --- a/external/panphasia_ho/pan_mpi_routines.c +++ b/external/panphasia_ho/pan_mpi_routines.c @@ -45,7 +45,7 @@ int flag_output_mode=2; int error; ptrdiff_t size_to_alloc_fourier; ptrdiff_t size_to_alloc_pan; -ptrdiff_t local_n0_fourier_xoffset; + ptrdiff_t local_fourier_x_start, local_fourier_x_end; FFTW_PLAN output_coeff_forward_plan; ptrdiff_t N0_pan_grid = 
descriptor_base_size<descriptor_order) return(100020); for (size_t i=0; i nfft_dim/2) ? iy-nfft_dim : iy; kz = (iz > nfft_dim/2) ? iz-nfft_dim : iz; ksquared = kx*kx + ky*ky + kz*kz; - if (ksquared<=descriptor_kk_limit){ + if ((ksquared<=descriptor_kk_limit)&&(ksquared!=0)){ index1 = ix*N0_fourier_grid*(N0_fourier_grid/2+1) + iy*(N0_fourier_grid/2+1) + iz; weight = cabs(return_field[index1]); return_field[index1] /= weight; @@ -444,7 +450,7 @@ int m; }; -printf("Reached here 12!\n"); + //printf("Reached here 12!\n"); int rank; @@ -455,7 +461,7 @@ int rank; -if (nfft_dim <128){ +if (nfft_dim <-1){ @@ -468,7 +474,7 @@ if (nfft_dim <128){ for (int iz=0; iz <= nfft_dim/2; iz++){ int index = ix*N0_fourier_grid*(N0_fourier_grid/2+1) + iy*(N0_fourier_grid/2+1) + iz; - fprintf(fp,"%6ld%6d%6d %14.8lf %14.8lf %14.8lf \n",ix+local_0_start_fourier_return,iy,iz, + fprintf(fp,"%6llu%6d%6d %14.8lf %14.8lf %14.8lf \n",ix+local_0_start_fourier_return,iy,iz, creal(return_field[index]),cimag(return_field[index]),sqrt(ptr_mode_weightings[index])); // ptr_mode_weightings[index]); }; @@ -485,7 +491,7 @@ if (nfft_dim <128){ for (int iz=0; iz <= nfft_dim/2; iz++){ if (ix+iy+iz+local_0_start_fourier_return<100){ int index = ix*N0_fourier_grid*(N0_fourier_grid/2+1) + iy*(N0_fourier_grid/2+1) + iz; - fprintf(fp,"%6ld%6d%6d %14.8lf %14.8lf %14.8lf \n",ix+local_0_start_fourier_return,iy,iz, + fprintf(fp,"%6llu%6d%6d %14.8lf %14.8lf %14.8lf \n",ix+local_0_start_fourier_return,iy,iz, creal(return_field[index]),cimag(return_field[index]),sqrt(ptr_mode_weightings[index])); // ptr_mode_weightings[index]); }; @@ -496,9 +502,46 @@ if (nfft_dim <128){ }; +// Transpose output field + + { + + FFTW_PLAN transpose_output_plan; + unsigned flags = FFTW_ESTIMATE; + + void *ptr_inter = return_field; + + FFTW_REAL *ptr_return_as_real_field; + ptr_return_as_real_field = ptr_inter; + + + int rank = 2; + + ptrdiff_t size_to_transpose; + ptrdiff_t howmany = N0_fourier_grid + 2; + ptrdiff_t local_n0, local_0_start; 
+ ptrdiff_t local_n1, local_1_start; + + const ptrdiff_t ndimens[] = {N0_fourier_grid, N0_fourier_grid}; + size_to_transpose = FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED(rank, ndimens, howmany, + FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + MPI_COMM_WORLD,&local_n0, &local_0_start,&local_n1, &local_1_start); + // printf("size_to_transpose = %td\n",size_to_transpose); + + transpose_output_plan = FFTW_MPI_PLAN_MANY_TRANSPOSE(N0_fourier_grid, N0_fourier_grid, + howmany, FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, + ptr_return_as_real_field, ptr_return_as_real_field, + MPI_COMM_WORLD, flags); + + //printf("Transpose plan completed.\n"); + + FFTW_MPI_EXECUTE_R2R(transpose_output_plan,ptr_return_as_real_field,ptr_return_as_real_field); + + //printf("Transpose completed.\n"); + }; @@ -516,7 +559,7 @@ if (!SHARED_FOUR_PAN_SPACE) FFTW_FREE(fourier_grids); FFTW_DESTROY_PLAN(output_coeff_forward_plan); -printf("Reached end of PANPHASIA_compute_kspace_field_\n"); +//printf("Reached end of PANPHASIA_compute_kspace_field_\n"); return(0); From 57e6c6300dad59c4c28a5369cd78822b66645bcc Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Thu, 27 May 2021 14:27:40 +0100 Subject: [PATCH 15/25] Corrected bug that chose the wrong Fourier grid size for the case where the particle grid is not a power of two times the descriptor S-value. 
--- src/plugins/random_panphasia_ho.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index 5d9dc0c..fcb3681 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -136,8 +136,7 @@ void RNG_panphasia_ho::Run_Panphasia_Highorder(Grid_FFT &g) // Choose smallest value of level to equal of exceed grid_res_) - for (rel_level = 0; fdim * (PANPHASIA2::descriptor_base_size << (rel_level + 1)) <= grid_res_; rel_level++) - ; + for (rel_level = 0; fdim * (PANPHASIA2::descriptor_base_size < Date: Tue, 1 Jun 2021 21:41:20 +0200 Subject: [PATCH 16/25] fixed compilation of PANPHASIA_HO for single precision --- external/panphasia_ho/PAN_FFTW3.h | 6 +++++- external/panphasia_ho/main.c | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/external/panphasia_ho/PAN_FFTW3.h b/external/panphasia_ho/PAN_FFTW3.h index 4777626..1f30ddf 100644 --- a/external/panphasia_ho/PAN_FFTW3.h +++ b/external/panphasia_ho/PAN_FFTW3.h @@ -1,10 +1,14 @@ // Define macros for FFTW3 to allow swapping // between single/double precision FTs -#ifndef USE_PRECISION_FLOAT +// include CMake controlled configuration settings +#if defined(USE_PRECISION_DOUBLE) #define FOURIER_DOUBLE +#endif +#if defined(USE_PRECISION_LONGDOUBLE) +#error "PANPHASIA-high-order does not currently support long double precision" #endif diff --git a/external/panphasia_ho/main.c b/external/panphasia_ho/main.c index d9c5400..98deefb 100644 --- a/external/panphasia_ho/main.c +++ b/external/panphasia_ho/main.c @@ -17,6 +17,8 @@ int threads_ok; int number_omp_threads = 1; #endif + +#if 0 // this is now unused since all this has been migrated to plugin! 
// does the same as the main below, but does not initialise MPI or FFTW (this should be done in MONOFONIC) int PANPHASIA_HO_main(const char *descriptor, size_t *ngrid_load) { @@ -66,6 +68,7 @@ int PANPHASIA_HO_main(const char *descriptor, size_t *ngrid_load) return(0); } +#endif #ifdef STANDALONE_PANPHASIA_HO int main(int argc, char **argv) From f3d9c25e1e16b2b576e8afa5db5c80a306092547 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Tue, 1 Jun 2021 21:57:18 +0200 Subject: [PATCH 17/25] more fixes to expose precision selection also to C code. should work now for PANPHASIA_HO --- external/panphasia_ho/PAN_FFTW3.h | 2 ++ include/cmake_config.hh.in | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/external/panphasia_ho/PAN_FFTW3.h b/external/panphasia_ho/PAN_FFTW3.h index 1f30ddf..3f53182 100644 --- a/external/panphasia_ho/PAN_FFTW3.h +++ b/external/panphasia_ho/PAN_FFTW3.h @@ -2,6 +2,8 @@ // between single/double precision FTs // include CMake controlled configuration settings +#pragma once +#include "cmake_config.hh" #if defined(USE_PRECISION_DOUBLE) #define FOURIER_DOUBLE diff --git a/include/cmake_config.hh.in b/include/cmake_config.hh.in index 03768a3..cc0da5e 100644 --- a/include/cmake_config.hh.in +++ b/include/cmake_config.hh.in @@ -1,8 +1,10 @@ #pragma once -constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}"; #define USE_PRECISION_${CODE_PRECISION} + +#ifdef __cplusplus +constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}"; #if defined(USE_PRECISION_FLOAT) constexpr char CMAKE_PRECISION_STR[] = "single"; #elif defined(USE_PRECISION_DOUBLE) @@ -31,4 +33,6 @@ extern "C" extern const char *GIT_TAG; extern const char *GIT_REV; extern const char *GIT_BRANCH; -} \ No newline at end of file + +} +#endif // __cplusplus \ No newline at end of file From f1d628edb622afe3ee9275df7cce96586fe723f3 Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Wed, 2 Jun 2021 09:45:43 +0100 Subject: [PATCH 18/25] Changed value of nsubvision 
from 7 to 21 to save memory --- external/panphasia_ho/pan_mpi_routines.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/panphasia_ho/pan_mpi_routines.c b/external/panphasia_ho/pan_mpi_routines.c index 744b484..7722c9e 100644 --- a/external/panphasia_ho/pan_mpi_routines.c +++ b/external/panphasia_ho/pan_mpi_routines.c @@ -33,7 +33,7 @@ size_t copy_list[Nbasis]; int pmax = 6; - int nsubdivide = (pmax%2==0)?pmax+1:pmax+2; + int nsubdivide = 21; //(pmax%2==0)?pmax+1:pmax+2; size_t ncopy = (pmax+1)*(pmax+2)*(pmax+3)/6; From 1edf50ac53558d41767f4ef507aafbb7402f1302 Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Wed, 2 Jun 2021 09:47:58 +0100 Subject: [PATCH 19/25] Changed value of fdim from 1 to 2 to save memory --- src/plugins/random_panphasia_ho.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index fcb3681..4cc3931 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -126,7 +126,7 @@ void RNG_panphasia_ho::Run_Panphasia_Highorder(Grid_FFT &g) int error; size_t x0 = 0, y0 = 0, z0 = 0; size_t rel_level; - int fdim = 1; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid + int fdim = 2; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; From ef8cc6d8633d193bc46ef66d92a991a7ae9b81c5 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Wed, 2 Jun 2021 15:30:19 +0200 Subject: [PATCH 20/25] added fatal error when panphasia_ho is run and "DoFixing" is enabled --- src/plugins/random_panphasia_ho.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index 4cc3931..c9ca725 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -76,6 +76,12 @@ public: grid_res_ = 
pcf_->get_value("setup", "GridRes"); boxlength_ = pcf_->get_value("setup", "BoxLength"); + if( pcf_->get_value("setup", "DoFixing") ){ + music::flog << "Fixing all the modes to the mean power negates any advantage of using the Panphasia field.\n"; + music::flog << "With the panphasia_ho it is possible by choosing the descriptor to fix the largest modes without losing the ability to resimulate to much higher resolution.\n"; + throw std::runtime_error("PANPHASIA_HO: incompatible parameter."); + } + panphasia_mode_ = 0; PANPHASIA2::parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_); From af2c77fcd81d05fddadc51d66d5be6659cd8bae3 Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Thu, 17 Jun 2021 14:47:31 +0100 Subject: [PATCH 21/25] Bug fix: Nyquist modes set to zero for case where the mode power would otherwise be set to the mean power, instead of NaN --- external/panphasia_ho/pan_mpi_routines.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/external/panphasia_ho/pan_mpi_routines.c b/external/panphasia_ho/pan_mpi_routines.c index 7722c9e..40c3f0c 100644 --- a/external/panphasia_ho/pan_mpi_routines.c +++ b/external/panphasia_ho/pan_mpi_routines.c @@ -441,13 +441,15 @@ int m; ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy; kz = (iz > nfft_dim/2) ? 
iz-nfft_dim : iz; ksquared = kx*kx + ky*ky + kz*kz; + if ( (kx!=nfft_dim/2)&&(ky!=nfft_dim/2)&&(kz!=nfft_dim/2)){ //Omit Nyquist modes + if ((ksquared<=descriptor_kk_limit)&&(ksquared!=0)){ index1 = ix*N0_fourier_grid*(N0_fourier_grid/2+1) + iy*(N0_fourier_grid/2+1) + iz; weight = cabs(return_field[index1]); return_field[index1] /= weight; }; }; - + }; }; //printf("Reached here 12!\n"); From e3f5d65e0aac5e92678d5e3673ea5b74f8a4984a Mon Sep 17 00:00:00 2001 From: Adrian Jenkins Date: Mon, 5 Jul 2021 08:50:17 +0100 Subject: [PATCH 22/25] Bug fix: non-integer box lenghts - thanks Willem Elbers --- src/plugins/random_panphasia_ho.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc index c9ca725..110a460 100644 --- a/src/plugins/random_panphasia_ho.cc +++ b/src/plugins/random_panphasia_ho.cc @@ -74,7 +74,7 @@ public: descriptor_string_ = pcf_->get_value("random", "descriptor"); grid_res_ = pcf_->get_value("setup", "GridRes"); - boxlength_ = pcf_->get_value("setup", "BoxLength"); + boxlength_ = pcf_->get_value("setup", "BoxLength"); if( pcf_->get_value("setup", "DoFixing") ){ music::flog << "Fixing all the modes to the mean power negates any advantage of using the Panphasia field.\n"; From d37cfc79b69db6e58d6ecec8142bc5d081ed01f6 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Sun, 8 Aug 2021 12:03:39 +0200 Subject: [PATCH 23/25] cleaned up PANPHASIA plugin so that now there is only one PANPHASIA and it selects automatically if it is 'ho' or v1 --- CMakeLists.txt | 15 +- example.conf | 2 +- include/HDF_IO.hh | 14 +- src/plugins/random_panphasia.cc | 570 ++++++----------------------- src/plugins/random_panphasia_ho.cc | 189 ---------- 5 files changed, 118 insertions(+), 672 deletions(-) delete mode 100644 src/plugins/random_panphasia_ho.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index f0c801f..849429b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,7 +150,7 @@ endif() 
endif(ENABLE_PANPHASIA) ######################################################################################################################## # PANPHASIA HO (High-Order, new version) -option(ENABLE_PANPHASIA_HO "Enable PANPHASIA-HO random number generator" ON) +# option(ENABLE_PANPHASIA_HO "Enable PANPHASIA-HO random number generator" ON) ######################################################################################################################## # INCLUDES include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/external) @@ -173,12 +173,7 @@ if(ENABLE_PANPHASIA) list (APPEND SOURCES ${PROJECT_SOURCE_DIR}/external/panphasia/panphasia_routines.f ${PROJECT_SOURCE_DIR}/external/panphasia/generic_lecuyer.f90 -) -endif() - -if(ENABLE_PANPHASIA_HO) -list (APPEND SOURCES - ${PROJECT_SOURCE_DIR}/external/panphasia_ho/main.c + #${PROJECT_SOURCE_DIR}/external/panphasia_ho/main.c ${PROJECT_SOURCE_DIR}/external/panphasia_ho/high_order_panphasia_routines.c ${PROJECT_SOURCE_DIR}/external/panphasia_ho/pan_mpi_routines.c ${PROJECT_SOURCE_DIR}/external/panphasia_ho/uniform_rand_threefry4x64.c @@ -259,9 +254,9 @@ if(ENABLE_PANPHASIA) target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA") endif(ENABLE_PANPHASIA) -if(ENABLE_PANPHASIA_HO) - target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA_HO") -endif(ENABLE_PANPHASIA_HO) +# if(ENABLE_PANPHASIA_HO) +# target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA_HO") +# endif(ENABLE_PANPHASIA_HO) if(ENABLE_PLT) target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_PLT") diff --git a/example.conf b/example.conf index 0f124a2..aff2550 100644 --- a/example.conf +++ b/example.conf @@ -16,7 +16,7 @@ LPTorder = 3 # order of the LPT to be used (1,2 or 3) DoBaryons = no # also do baryon ICs? DoBaryonVrel = no # if doing baryons, incl. also relative velocity to linear order? 
-DoFixing = yes # do mode fixing à la Angulo&Pontzen (https://arxiv.org/abs/1603.05253) +DoFixing = no # do mode fixing à la Angulo&Pontzen (https://arxiv.org/abs/1603.05253) DoInversion = no # invert phases (for paired simulations) ParticleLoad = sc # particle load, can be 'sc' (1x), 'bcc' (2x) or 'fcc' (4x) diff --git a/include/HDF_IO.hh b/include/HDF_IO.hh index 1b15b34..5e7215b 100755 --- a/include/HDF_IO.hh +++ b/include/HDF_IO.hh @@ -136,15 +136,13 @@ inline void HDFGetDatasetExtent( const std::string Filename, const std::string O int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID ); - hsize_t *dimsize = new hsize_t[ndims]; + std::vector dimsize(ndims,0); - H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL ); + H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL ); Extent.clear(); for(int i=0; i dimsize(ndims,0); + H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL ); hsize_t block[2]; block[0] = ii.size(); @@ -601,9 +599,9 @@ inline void HDFReadGroupAttribute( const std::string Filename, const std::string int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID ); - hsize_t dimsize[ndims]; + std::vector dimsize(ndims,0); - H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL ); + H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL ); HDF_StorageSize = 1; for(int i=0; i. // @@ -39,108 +39,16 @@ #include -const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim; -typedef int rand_offset_[5]; -typedef struct +namespace PANPHASIA2 { - int state[133]; // Nstore = Nstate (=5) + Nbatch (=128) - int need_fill; - int pos; -} rand_state_; - -/* pan_state_ struct -- corresponds to respective fortran module in panphasia_routines.f - * data structure that contains all panphasia state variables - * it needs to get passed between the fortran routines to enable - * thread-safe execution. 
- */ -typedef struct -{ - int base_state[5], base_lev_start[5][maxdim + 1]; - rand_offset_ poweroffset[maxpow + 1], superjump; - rand_state_ current_state[maxpow + 2]; - - int layer_min, layer_max, indep_field; - - long long xorigin_store[2][2][2], yorigin_store[2][2][2], zorigin_store[2][2][2]; - int lev_common, layer_min_store, layer_max_store; - long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store, - iy_rel_store, iz_rel_store; - double exp_coeffs[8][8][maxdim + 2]; - long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1]; - int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2]; - - double cell_data[9][8]; - int ixh_last, iyh_last, izh_last; - int init; - - int init_cell_props; - int init_lecuyer_state; - long long p_xcursor[62], p_ycursor[62], p_zcursor[62]; - -} pan_state_; - -extern "C" -{ - void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose); - - void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1, - int16_t *side2, int16_t *side3, int32_t *check_int, char *name); - - void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop); - - void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min, - int *layer_max, int *indep_field, double *cell_prop); - - void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel, - long long *iy_rel, long long *iz_rel, int *VERBOSE); -} - -struct panphasia_descriptor -{ - int16_t wn_level_base; - int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base; - int16_t i_base, i_base_y, i_base_z; - int32_t check_rand; - std::string name; - - explicit panphasia_descriptor(std::string dstring) + extern "C" { - char tmp[100]; - std::memset(tmp, ' ', 100); - parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, 
&i_yorigin_base, &i_zorigin_base, &i_base, - &i_base_y, &i_base_z, &check_rand, tmp); - for (int i = 0; i < 100; i++) - if (tmp[i] == ' ') - { - tmp[i] = '\0'; - break; - } - name = tmp; - name.erase(std::remove(name.begin(), name.end(), ' '), name.end()); +#include + extern size_t descriptor_base_size; } -}; - -// greatest common divisor -int gcd(int a, int b) -{ - if (b == 0) - return a; - return gcd(b, a % b); } -// least common multiple -int lcm(int a, int b) { return abs(a * b) / gcd(a, b); } - -// Two or largest power of 2 less than the argument -int largest_power_two_lte(int b) -{ - int a = 1; - if (b <= a) - return a; - while (2 * a < b) - a = 2 * a; - return a; -} +#include "PANPHASIA.hh" class RNG_panphasia : public RNG_plugin { @@ -148,63 +56,15 @@ private: protected: std::string descriptor_string_; int num_threads_; - int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_; - bool incongruent_fields_; - double inter_grid_phase_adjustment_; - // double translation_phase_; - pan_state_ *lstate; - int grid_p_; - int coordinate_system_shift_[3]; - int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_; + int panphasia_mode_; + size_t grid_res_; + real_t boxlength_; - void clear_panphasia_thread_states(void) - { - for (int i = 0; i < num_threads_; ++i) - { - lstate[i].init = 0; - lstate[i].init_cell_props = 0; - lstate[i].init_lecuyer_state = 0; - } - } - - void initialize_for_grid_structure(void) - { - // if ngrid is not a multiple of i_base, then we need to enlarge and then sample down - ngrid_ = pcf_->get_value("setup", "GridRes"); - int ngridminsize_panphasia = pcf_->get_value_safe("random", "PanphasiaMinRootResolution",512); - - grid_p_ = pdescriptor_->i_base; - - lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0); - // lmin - - ngrid_panphasia_ = (1 << lextra_) * grid_p_; - - while( ngrid_panphasia_ < ngridminsize_panphasia ){ - lextra_++; - ngrid_panphasia_*=2; - } - assert( ngrid_panphasia_ >= 
ngridminsize_panphasia); - - - clear_panphasia_thread_states(); - - music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)",ngrid_panphasia_, lextra_); - if (ngridminsize_panphasia<512) - music::ilog.Print("PANPHASIA WARNING: PanphasiaMinRootResolution = %d below minimum recommended of 512",ngridminsize_panphasia); - music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, ngrid_panphasia_ ); - - coordinate_system_shift_[0] = -pcf_->get_value_safe("setup", "shift_x", 0); - coordinate_system_shift_[1] = -pcf_->get_value_safe("setup", "shift_y", 0); - coordinate_system_shift_[2] = -pcf_->get_value_safe("setup", "shift_z", 0); - } - - std::unique_ptr pdescriptor_; + PANPHASIA1::RNG *ppan1_rng_; public: explicit RNG_panphasia(config_file &cf) : RNG_plugin(cf) { - descriptor_string_ = pcf_->get_value("random", "descriptor"); #ifdef _OPENMP num_threads_ = omp_get_max_threads(); @@ -212,336 +72,118 @@ public: num_threads_ = 1; #endif - // create independent state descriptions for each thread - lstate = new pan_state_[num_threads_]; + descriptor_string_ = pcf_->get_value("random", "descriptor"); + grid_res_ = pcf_->get_value("setup", "GridRes"); + boxlength_ = pcf_->get_value("setup", "BoxLength"); - // parse the descriptor for its properties - pdescriptor_ = std::make_unique(descriptor_string_); + if( pcf_->get_value("setup", "DoFixing") ){ + music::flog << "Fixing all the modes to the mean power negates any advantage of using the Panphasia field.\n"; + music::flog << "With the new panphasia ho it is possible by choosing the descriptor to fix the largest modes without losing the ability to resimulate to much higher resolution.\n"; + throw std::runtime_error("PANPHASIA: incompatible parameter."); + } - music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base); + panphasia_mode_ = 0; + PANPHASIA2::parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_); - // write panphasia base 
size into config file for the grid construction - // as the gridding unit we use the least common multiple of 2 and i_base - std::stringstream ss; - //ARJ ss << lcm(2, pdescriptor_->i_base); - //ss << two_or_largest_power_two_less_than(pdescriptor_->i_base);//ARJ - ss << 2; //ARJ - set gridding unit to two - pcf_->insert_value("setup", "gridding_unit", ss.str()); - ss.str(std::string()); - ss << pdescriptor_->i_base; - pcf_->insert_value("random", "base_unit", ss.str()); - - this->initialize_for_grid_structure(); + if (panphasia_mode_ == 0) + { + ppan1_rng_ = new PANPHASIA1::RNG(&cf); + } } - ~RNG_panphasia() { delete[] lstate; } + ~RNG_panphasia() + { + if (panphasia_mode_ == 0) + { + delete ppan1_rng_; + } + } bool isMultiscale() const { return true; } + void Run_Panphasia_Highorder(Grid_FFT &g); + void Fill_Grid(Grid_FFT &g) { - auto sinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? std::sin(x) / x : 1.0; }; - auto dsinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; }; - const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)}; - - // we will overwrite 'g', we can deallocate it while we prepare the panphasia field - g.reset(); - - clear_panphasia_thread_states(); - - // temporaries - Grid_FFT g0({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_); - Grid_FFT g1({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_); - Grid_FFT g2({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_); - Grid_FFT g3({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_); - Grid_FFT g4({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_); - - double t1 = get_wtime(); - // double tp = t1; - -#pragma omp parallel + switch (panphasia_mode_) { -#ifdef _OPENMP - const int mythread = omp_get_thread_num(); -#else - const int mythread = 0; -#endif - //int 
odd_x, odd_y, odd_z; - //int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level + case 0: // old mode + music::ilog << "PANPHASIA: Old descriptor" << std::endl; + ppan1_rng_->Fill(g); + break; - int verbosity = (mythread == 0); - char descriptor[100]; - std::memset(descriptor, 0, 100); - std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size()); + case 1: // PANPHASIA HO descriptor + music::ilog << "PANPHASIA: New descriptor" << std::endl; + this->Run_Panphasia_Highorder(g); + break; - start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity); - - { - panphasia_descriptor d(descriptor_string_); - - int level_p = d.wn_level_base + lextra_; - - lstate[mythread].layer_min = 0; - lstate[mythread].layer_max = level_p; - lstate[mythread].indep_field = 1; - - long long ix_rel[3]; - ix_rel[0] = 0; //ileft_corner_p[0]; - ix_rel[1] = 0; //ileft_corner_p[1]; - ix_rel[2] = 0; //ileft_corner_p[2]; - - set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2], - &verbosity); - } - - if (verbosity) - t1 = get_wtime(); - - std::array cell_prop; - pan_state_ *ps = &lstate[mythread]; - -#pragma omp for //nowait - for (size_t i = 0; i < g0.size(0); i += 2) - { - const int ixmax(std::min(2,g0.size(0)-i)); - for (size_t j = 0; j < g0.size(1); j += 2) - { - const int iymax(std::min(2,g0.size(1)-j)); - for (size_t k = 0; k < g0.size(2); k += 2) - { - const int izmax(std::min(2,g0.size(2)-k)); - - // ARJ - added inner set of loops to speed up evaluation of Panphasia - for (int ix = 0; ix < ixmax; ++ix) - { - for (int iy = 0; iy < iymax; ++iy) - { - for (int iz = 0; iz < izmax; ++iz) - { - int ilocal = i + ix; - int jlocal = j + iy; - int klocal = k + iz; - - int iglobal = ilocal + g0.local_0_start_; - int jglobal = jlocal; - int kglobal = klocal; - - adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min, - &ps->layer_max, &ps->indep_field, 
&cell_prop[0]); - - g0.relem(ilocal, jlocal, klocal) = cell_prop[0]; - g1.relem(ilocal, jlocal, klocal) = cell_prop[4]; - g2.relem(ilocal, jlocal, klocal) = cell_prop[2]; - g3.relem(ilocal, jlocal, klocal) = cell_prop[1]; - g4.relem(ilocal, jlocal, klocal) = cell_prop[8]; - } - } - } - } - } - } - } // end omp parallel region - - g0.FourierTransformForward(); - g1.FourierTransformForward(); - g2.FourierTransformForward(); - g3.FourierTransformForward(); - g4.FourierTransformForward(); - -#pragma omp parallel for - for (size_t i = 0; i < g0.size(0); i++) - { - for (size_t j = 0; j < g0.size(1); j++) - { - for (size_t k = 0; k < g0.size(2); k++) - { - if (!g0.is_nyquist_mode(i, j, k)) - { - auto kvec = g0.get_k(i, j, k); - - auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0]; - auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1]; - auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2]; - - auto fx = real_t(sinc(argx)); - auto gx = ccomplex_t(0.0, dsinc(argx)); - auto fy = real_t(sinc(argy)); - auto gy = ccomplex_t(0.0, dsinc(argy)); - auto fz = real_t(sinc(argz)); - auto gz = ccomplex_t(0.0, dsinc(argz)); - - auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz); - auto magnitude = real_t(std::sqrt(1.0 - std::fabs(temp * temp))); - - auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k)); - - g0.kelem(i, j, k) = y0 * fx * fy * fz - + sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz) - + y4 * magnitude; - } - else - { - g0.kelem(i, j, k) = 0.0; - } - } - } + default: // unknown PANPHASIA mode + music::elog << "PANPHASIA: Something went wrong with descriptor" << std::endl; + abort(); + break; } - - // music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp); - // tp = get_wtime(); - - g1.FourierTransformBackward(false); - g2.FourierTransformBackward(false); - g3.FourierTransformBackward(false); - g4.FourierTransformBackward(false); - -#pragma omp parallel - 
{ -#ifdef _OPENMP - const int mythread = omp_get_thread_num(); -#else - const int mythread = 0; -#endif - - // int odd_x, odd_y, odd_z; - int verbosity = (mythread == 0); - char descriptor[100]; - std::memset(descriptor, 0, 100); - std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size()); - - start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity); - - { - panphasia_descriptor d(descriptor_string_); - - int level_p = d.wn_level_base + lextra_; - - lstate[mythread].layer_min = 0; - lstate[mythread].layer_max = level_p; - lstate[mythread].indep_field = 1; - - long long ix_rel[3]; - ix_rel[0] = 0; //ileft_corner_p[0]; - ix_rel[1] = 0; //ileft_corner_p[1]; - ix_rel[2] = 0; //ileft_corner_p[2]; - - set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2], - &verbosity); - } - - if (verbosity) - t1 = get_wtime(); - - //*************************************************************** - // Process Panphasia values: p110, p011, p101, p111 - //**************************************************************** - std::array cell_prop; - pan_state_ *ps = &lstate[mythread]; - -#pragma omp for //nowait - for (size_t i = 0; i < g1.size(0); i += 2) - { - const int ixmax(std::min(2,g1.size(0)-i)); - for (size_t j = 0; j < g1.size(1); j += 2) - { - const int iymax(std::min(2,g1.size(1)-j)); - for (size_t k = 0; k < g1.size(2); k += 2) - { - const int izmax(std::min(2,g1.size(2)-k)); - - // ARJ - added inner set of loops to speed up evaluation of Panphasia - for (int ix = 0; ix < ixmax; ++ix) - { - for (int iy = 0; iy < iymax; ++iy) - { - for (int iz = 0; iz < izmax; ++iz) - { - int ilocal = i + ix; - int jlocal = j + iy; - int klocal = k + iz; - - int iglobal = ilocal + g1.local_0_start_; - int jglobal = jlocal; - int kglobal = klocal; - - adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min, - &ps->layer_max, &ps->indep_field, &cell_prop[0]); - - g1.relem(ilocal, 
jlocal, klocal) = cell_prop[6]; - g2.relem(ilocal, jlocal, klocal) = cell_prop[3]; - g3.relem(ilocal, jlocal, klocal) = cell_prop[5]; - g4.relem(ilocal, jlocal, klocal) = cell_prop[7]; - } - } - } - } - } - } - } // end omp parallel region - - // music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp); - // tp = get_wtime(); - - ///////////////////////////////////////////////////////////////////////// - // transform and convolve with Legendres - g1.FourierTransformForward(); - g2.FourierTransformForward(); - g3.FourierTransformForward(); - g4.FourierTransformForward(); - - #pragma omp parallel for - for (size_t i = 0; i < g0.size(0); i++) - { - for (size_t j = 0; j < g0.size(1); j++) - { - for (size_t k = 0; k < g0.size(2); k++) - { - if (!g0.is_nyquist_mode(i, j, k)) - { - auto kvec = g0.get_k(i, j, k); - - auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0]; - auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1]; - auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2]; - - auto fx = real_t(sinc(argx)); - auto gx = ccomplex_t(0.0, dsinc(argx)); - auto fy = real_t(sinc(argy)); - auto gy = ccomplex_t(0.0, dsinc(argy)); - auto fz = real_t(sinc(argz)); - auto gz = ccomplex_t(0.0, dsinc(argz)); - - auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k)); - - g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz; - - // do final phase shift to account for corner centered coordinates vs. 
cell centers - auto phase_shift = - 0.5 * M_PI * ( kvec[0] / g0.kny_[0] - + kvec[1] /g0.kny_[1] + kvec[2] / g0.kny_[2]); - - g0.kelem(i, j, k) *= std::exp( ccomplex_t(0,phase_shift) ); - } - } - } - } - - g1.reset(); - g2.reset(); - g3.reset(); - g4.reset(); - - g.allocate(); - g0.FourierInterpolateCopyTo( g ); - - music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1, - 1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2)); - music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std()); } }; +void RNG_panphasia::Run_Panphasia_Highorder(Grid_FFT &g) +{ + int verbose = 0; + int error; + size_t x0 = 0, y0 = 0, z0 = 0; + size_t rel_level; + int fdim = 2; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid + + //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; + + PANPHASIA2::PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose); + + printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_); + + // Choose smallest value of level to equal of exceed grid_res_) + + for (rel_level = 0; fdim * (PANPHASIA2::descriptor_base_size < pan_grid({{N0, N0, N0}}, {{boxlength_, boxlength_, boxlength_}}); + + assert(pan_grid.n_[0] == N0); + assert(pan_grid.n_[1] == N0); + assert(pan_grid.n_[2] == N0); + assert(pan_grid.local_0_start_ == local_0_start); + assert(pan_grid.local_0_size_ == local_n0); + assert(pan_grid.ntot_ == size_t(alloc_local)); + + pan_grid.FourierTransformForward(false); + + FFTW_COMPLEX *Panphasia_White_Noise_Field = reinterpret_cast(&pan_grid.data_[0]); + + // Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local); + + if ((error = PANPHASIA2::PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))) + { + music::elog << "Error code from PANPHASIA_compute ... 
(ErrCode = " << error << ")" << std::endl; + }; + + pan_grid.FourierInterpolateCopyTo(g); +} namespace { RNG_plugin_creator_concrete creator("PANPHASIA"); } -#endif // defined(USE_PANPHASIA) \ No newline at end of file +#endif // defined(USE_PANPHASIA) diff --git a/src/plugins/random_panphasia_ho.cc b/src/plugins/random_panphasia_ho.cc deleted file mode 100644 index 110a460..0000000 --- a/src/plugins/random_panphasia_ho.cc +++ /dev/null @@ -1,189 +0,0 @@ -// This file is part of monofonIC (MUSIC2) -// A software package to generate ICs for cosmological simulations -// Copyright (C) 2021 by Oliver Hahn and Adrian Jenkins (this file) -// but see distinct licensing for PANPHASIA below -// -// monofonIC is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// monofonIC is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// -// IMPORTANT NOTICE: -// Note that PANPHASIA itself is not released under the GPL. 
Make sure -// to read and agree to its distinct licensing before you use or modify -// the code below or in the /external/panphasia directory which can be -// found here: http://icc.dur.ac.uk/Panphasia.php -// NOTE THAT PANPHASIA REQUIRES REGISTRATION ON THIS WEBSITE PRIOR TO USE - -#if defined(USE_PANPHASIA_HO) - -#include -#include -#include - -#include -#include -#include - -#ifdef _OPENMP -#include -#endif - -#include - -namespace PANPHASIA2 -{ - extern "C" - { -#include - extern size_t descriptor_base_size; - } -} - -#include "PANPHASIA.hh" - -class RNG_panphasia_ho : public RNG_plugin -{ -private: -protected: - std::string descriptor_string_; - int num_threads_; - int panphasia_mode_; - size_t grid_res_; - real_t boxlength_; - - PANPHASIA1::RNG *ppan1_rng_; - -public: - explicit RNG_panphasia_ho(config_file &cf) : RNG_plugin(cf) - { - -#ifdef _OPENMP - num_threads_ = omp_get_max_threads(); -#else - num_threads_ = 1; -#endif - - descriptor_string_ = pcf_->get_value("random", "descriptor"); - grid_res_ = pcf_->get_value("setup", "GridRes"); - boxlength_ = pcf_->get_value("setup", "BoxLength"); - - if( pcf_->get_value("setup", "DoFixing") ){ - music::flog << "Fixing all the modes to the mean power negates any advantage of using the Panphasia field.\n"; - music::flog << "With the panphasia_ho it is possible by choosing the descriptor to fix the largest modes without losing the ability to resimulate to much higher resolution.\n"; - throw std::runtime_error("PANPHASIA_HO: incompatible parameter."); - } - - panphasia_mode_ = 0; - PANPHASIA2::parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_); - - if (panphasia_mode_ == 0) - { - ppan1_rng_ = new PANPHASIA1::RNG(&cf); - } - } - - ~RNG_panphasia_ho() - { - if (panphasia_mode_ == 0) - { - delete ppan1_rng_; - } - } - - bool isMultiscale() const { return true; } - - void Run_Panphasia_Highorder(Grid_FFT &g); - - void Fill_Grid(Grid_FFT &g) - { - switch (panphasia_mode_) - { - - case 0: // old 
mode - music::ilog << "PANPHASIA: Old descriptor" << std::endl; - ppan1_rng_->Fill(g); - break; - - case 1: // PANPHASIA HO descriptor - music::ilog << "PANPHASIA: New descriptor" << std::endl; - this->Run_Panphasia_Highorder(g); - break; - - default: // unknown PANPHASIA mode - music::elog << "PANPHASIA: Something went wrong with descriptor" << std::endl; - abort(); - break; - } - } -}; - -void RNG_panphasia_ho::Run_Panphasia_Highorder(Grid_FFT &g) -{ - int verbose = 0; - int error; - size_t x0 = 0, y0 = 0, z0 = 0; - size_t rel_level; - int fdim = 2; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid - - //char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]"; - - PANPHASIA2::PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose); - - printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_); - - // Choose smallest value of level to equal of exceed grid_res_) - - for (rel_level = 0; fdim * (PANPHASIA2::descriptor_base_size < pan_grid({{N0, N0, N0}}, {{boxlength_, boxlength_, boxlength_}}); - - assert(pan_grid.n_[0] == N0); - assert(pan_grid.n_[1] == N0); - assert(pan_grid.n_[2] == N0); - assert(pan_grid.local_0_start_ == local_0_start); - assert(pan_grid.local_0_size_ == local_n0); - assert(pan_grid.ntot_ == size_t(alloc_local)); - - pan_grid.FourierTransformForward(false); - - FFTW_COMPLEX *Panphasia_White_Noise_Field = reinterpret_cast(&pan_grid.data_[0]); - - // Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local); - - if ((error = PANPHASIA2::PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))) - { - music::elog << "Error code from PANPHASIA_compute ... 
(ErrCode = " << error << ")" << std::endl; - }; - - pan_grid.FourierInterpolateCopyTo(g); -} -namespace -{ - RNG_plugin_creator_concrete creator("PANPHASIA_HO"); -} -#endif // defined(USE_PANPHASIA_HO) From ffcc03831f24a6653b590d2d310063c2ddc1a841 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Wed, 1 Sep 2021 21:30:34 +0200 Subject: [PATCH 24/25] cleanup of Adrian's PANPHASIA_ho code to avoid compiler warnings in pedantic mode --- external/panphasia_ho/LICENSE | 35 + .../high_order_panphasia_routines.c | 230 +-- external/panphasia_ho/pan_mpi_routines.c | 806 ++++----- external/panphasia_ho/panphasia_functions.h | 2 +- .../panphasia_ho/uniform_rand_threefry4x64.c | 1586 ++++++++++------- src/plugins/random_panphasia.cc | 2 + 6 files changed, 1466 insertions(+), 1195 deletions(-) create mode 100644 external/panphasia_ho/LICENSE diff --git a/external/panphasia_ho/LICENSE b/external/panphasia_ho/LICENSE new file mode 100644 index 0000000..27a5ff9 --- /dev/null +++ b/external/panphasia_ho/LICENSE @@ -0,0 +1,35 @@ +The code in this subdirectory is part of Adrian Jenkins' PANPHASIA packet, +obtained from here http://icc.dur.ac.uk/Panphasia.php + +PANPHASIA is not published under the GPL but has its own proprietary license, +make sure to visit the website before using the PANPHASIA functionality of +MUSIC2 and register your name. + +We reproduce the licensing requirements for PANPHASIA from the above website +as retrieved on 2020/08/23: + +We make our software available for free but with a licence that includes the +condition that users make sure the phases of any new simulation volumes set up +using Panphasia are published. + +We are happy to collaborate with others on improving the software and providing +support for languages other than fortran. 
Contact: A.R.Jenkins@durham.ac.uk + +LICENCE: + +You are licensed to use this software free of charge on condition that: + +- you will publish the phase descriptors and reference Jenkins (13) for any new + simulations that use Panphasia phases. You will pass on this condition to others + for any software or data you make available publically or privately that makes + use of Panphasia. +- that you will ensure any publications using results derived from Panphasia will + be submitted as a final version to arXiv prior to or coincident with publication + in a journal. +- that you report any bugs in this software as soon as confirmed to + A.R.Jenkins@durham.ac.uk +- that you understand that the software comes with no warranty and that is your + responsibility to ensure that it is suitable for the purpose that you intend. +- that you agree to having your name and email address stored for an indefinite + period in the future electronically in a database as a record that you agreed + the licence conditions. 
diff --git a/external/panphasia_ho/high_order_panphasia_routines.c b/external/panphasia_ho/high_order_panphasia_routines.c index 672c575..aab77bf 100644 --- a/external/panphasia_ho/high_order_panphasia_routines.c +++ b/external/panphasia_ho/high_order_panphasia_routines.c @@ -195,7 +195,7 @@ for (i=0; i<8*Nbasis; i++) output_vec_children[i] = work_vec2[i]; void box_muller_(PAN_REAL *unif_rand, PAN_REAL *gvar) { - int i, j, k, count; + int i, count; const PAN_REAL pi = 4.0 * atan(1.0); const PAN_REAL two_pi = 2.0 * pi; @@ -264,7 +264,7 @@ void speed_test2_() size_t N_cells = 1e6; PAN_REAL parent[Nbasis]; - PAN_REAL child[8 * Nbasis]; + // PAN_REAL child[8 * Nbasis]; PAN_REAL output[8 * Nbasis]; //ticks tick_start = getticks(); @@ -285,7 +285,7 @@ void speed_test2_() //tic_total = getticks() - tick_start; //printf("Computed %ld cells in time %.3f %s\n",N_cells,clocks_from_ticks(tic_total),clocks_getunit()); -}; +} //=================================================================================== void compute_all_properties_of_a_panphasia_cell_(size_t *level, size_t *j1, size_t *j2, size_t *j3, @@ -326,7 +326,7 @@ void compute_all_properties_of_a_panphasia_cell_(size_t *level, size_t *j1, size // // // }; -}; +} //================================================================================== void test_random_dist_(size_t ishift) @@ -369,8 +369,8 @@ void test_random_dist_(size_t ishift) char str1[100], str2[100]; - sprintf(str1, "Gaussian_random_distribution_%llu.dat", ishift); - sprintf(str2, "Log_uniform_random_distribution_%llu.dat", ishift); + sprintf(str1, "Gaussian_random_distribution_%lu.dat", ishift); + sprintf(str2, "Log_uniform_random_distribution_%lu.dat", ishift); FILE *file = fopen(str1, "w"); FILE *file2 = fopen(str2, "w"); @@ -378,7 +378,7 @@ void test_random_dist_(size_t ishift) for (size_t j3 = 0; j3 < NC; j3++) { if (j3 % 10000000 == 0) - printf("Looped over %lld\n", j3); + printf("Looped over %ld\n", j3); 
return_uniform_pseudo_rands_threefry4x64_(l, j1, j2, j3, unif_randoms, seed_value, allow_non_zero_seed_saftey_catch); @@ -402,9 +402,9 @@ void test_random_dist_(size_t ishift) rms_value = sqrt(sum_squares / (double)nrand); - printf("Number of rands %ld RMS = %12.10lg Deviation %lg \n", + printf("Number of rands %lld RMS = %12.10lg Deviation %lg \n", nrand, rms_value, (rms_value - 1.0) * sqrt((double)nrand)); - fprintf(file, "Number of rands %ld RMS = %12.10lg Deviation %lg \n", + fprintf(file, "Number of rands %lld RMS = %12.10lg Deviation %lg \n", nrand, rms_value, (rms_value - 1.0) * sqrt((double)nrand)); for (int i = 0; i < 100; i++) @@ -414,9 +414,9 @@ void test_random_dist_(size_t ishift) { g_expected = 0.5 * (erf(0.2 * sqrt(0.5) * ((PAN_REAL)(i - array_offset) + 0.5)) - erf(0.2 * sqrt(0.5) * ((PAN_REAL)(i - array_offset) - 0.5))) * (PAN_REAL)nrand; - printf("%d %ld %f %f \n", i - array_offset, gauss_dist[i], g_expected, (gauss_dist[i] - g_expected) / sqrt(gauss_dist[i])); + printf("%d %lld %f %f \n", i - array_offset, gauss_dist[i], g_expected, (gauss_dist[i] - g_expected) / sqrt(gauss_dist[i])); - fprintf(file, "%d %ld %f %f \n", i - array_offset, gauss_dist[i], g_expected, (gauss_dist[i] - g_expected) / sqrt(gauss_dist[i])); + fprintf(file, "%d %lld %f %f \n", i - array_offset, gauss_dist[i], g_expected, (gauss_dist[i] - g_expected) / sqrt(gauss_dist[i])); }; if (log_uniform_dist[i] != 0) { @@ -612,20 +612,20 @@ int demo_descriptor_() // char str[200] = "[Panph6,L21,(1136930,890765,1847934),S3,CH2414478110,Auriga_volume2]"; // char str[200] = "[Panph6,L21,(1136930,890765,1847934),S3,CH-999,Auriga_volume2]"; - char copy[200]; - const char s[20] = "[,L,(),S,CH,]"; - char *token; + // char copy[200]; + // const char s[20] = "[,L,(),S,CH,]"; + // char *token; - size_t desc_level, desc_x, desc_y, desc_z, desc_size; - long long int desc_ch; - char desc_name[100]; - char desc_iden[8]; + // size_t desc_level, desc_x, desc_y, desc_z, desc_size; + // long long int 
desc_ch; + // char desc_name[100]; + // char desc_iden[8]; int error_code; int pan_mode; descriptor_read_in = 0; - if (error_code = parse_and_validate_descriptor_(str,&pan_mode)) + if ((error_code = parse_and_validate_descriptor_(str, &pan_mode))) { printf("Invalid descriptor %s\n", str); @@ -639,12 +639,12 @@ int demo_descriptor_() if (descriptor_read_in) { printf("-----------------------------------------\n"); - printf("Descriptor order: %llu\n", descriptor_order); - printf("Descriptor base level: %llu\n", descriptor_base_level); - printf("Descriptor x-origin: %llu\n", descriptor_xorigin); - printf("Descriptor y-origin: %llu\n", descriptor_yorigin); - printf("Descriptor z-origin: %llu\n", descriptor_zorigin); - printf("Descriptor base size: %llu\n", descriptor_base_size); + printf("Descriptor order: %lu\n", descriptor_order); + printf("Descriptor base level: %lu\n", descriptor_base_level); + printf("Descriptor x-origin: %lu\n", descriptor_xorigin); + printf("Descriptor y-origin: %lu\n", descriptor_yorigin); + printf("Descriptor z-origin: %lu\n", descriptor_zorigin); + printf("Descriptor base size: %lu\n", descriptor_base_size); printf("Descriptor check digit:%lld\n", descriptor_check_digit); printf("Descriptor name %s\n", descriptor_name); printf("-----------------------------------------\n"); @@ -674,8 +674,8 @@ int demo_descriptor_() verbose = 0; - if (error_code = PANPHASIA_init_level_(&rel_lev, - &rel_orig_x, &rel_orig_y, &rel_orig_z, &verbose)) + if ((error_code = PANPHASIA_init_level_(&rel_lev, + &rel_orig_x, &rel_orig_y, &rel_orig_z, &verbose))) { printf("Error %d in initialing PANPHASIA_init_level_\n", error_code); @@ -704,9 +704,9 @@ int demo_descriptor_() for (int i = 0; i < Nbasis / 3; i++) copy_list[i] = 3 * i; - if (error_code = PANPHASIA_compute_coefficients_(&xstart, &ystart, &zstart, - &xextent, &yextent, &zextent, copy_list, &ncopy, - output_values, &flag_output_mode, &verbose)) + if ((error_code = PANPHASIA_compute_coefficients_(&xstart, 
&ystart, &zstart, + &xextent, &yextent, &zextent, copy_list, &ncopy, + output_values, &flag_output_mode, &verbose))) { printf("Error %d in PANPHASIA_compute_coefficients \n", error_code); @@ -721,13 +721,13 @@ int demo_descriptor_() for (size_t xco = 0; xco < xextent; xco++) for (size_t yco = 0; yco < yextent; yco++) for (size_t zco = 0; zco < zextent; zco++) - fprintf(file, "%llu %llu %llu %f\n", xco, yco, zco, output_values[ncopy * (xco * yextent * zextent + yco * zextent + zco)]); + fprintf(file, "%lu %lu %lu %f\n", xco, yco, zco, output_values[ncopy * (xco * yextent * zextent + yco * zextent + zco)]); fclose(file); }; return (0); -}; +} int PANPHASIA_init_descriptor_(const char *descriptor, int *verbose) { @@ -758,7 +758,7 @@ int PANPHASIA_init_descriptor_(const char *descriptor, int *verbose) check_panphasia_key_(verb); int pan_mode; - if (error = parse_and_validate_descriptor_(descriptor,&pan_mode)) + if ((error = parse_and_validate_descriptor_(descriptor, &pan_mode))) { printf("-----------------------------------------\n"); printf("Error initating start-up Panphasia routines \n"); @@ -771,7 +771,7 @@ int PANPHASIA_init_descriptor_(const char *descriptor, int *verbose) if (*verbose) printf("Sucessfully started Panphasia with the descriptor:\n%s\n", descriptor); return (0); -}; +} ///////////////////////////////////////////////////////////////////////////////// @@ -838,7 +838,7 @@ void PANPHASIA_init_descriptor_checks() printf("===================================================\n"); panphasia_rel_origin_set = 0; // Force user to set rel origin themselves. 
-}; +} int PANPHASIA_init_level_(size_t *rel_lev, size_t *rel_orig_x, size_t *rel_orig_y, @@ -869,16 +869,16 @@ int PANPHASIA_init_level_(size_t *rel_lev, { printf("-----------------------------------------------------------------\n"); printf("Initialising a Panphasia subgrid\n"); - printf("Relative level %llu\n", rel_level); - printf("Relative origin (%llu,%llu,%llu)\n", rel_origin_x, rel_origin_y, rel_origin_z); - printf("The maximum possible extent of this subgrid is %llu cells\n", rel_coord_max); + printf("Relative level %lu\n", rel_level); + printf("Relative origin (%lu,%lu,%lu)\n", rel_origin_x, rel_origin_y, rel_origin_z); + printf("The maximum possible extent of this subgrid is %lu cells\n", rel_coord_max); printf("-----------------------------------------------------------------\n"); }; panphasia_rel_origin_set = 1; return (0); -}; +} //====================================================================================== //====================================================================================== @@ -910,9 +910,12 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta if (*zstart >= rel_coord_max) return (203); - if (*xextent > rel_coord_max) return (204); - if (*yextent > rel_coord_max) return (205); - if (*zextent > rel_coord_max) return (206); + if (*xextent > rel_coord_max) + return (204); + if (*yextent > rel_coord_max) + return (205); + if (*zextent > rel_coord_max) + return (206); if ((*ncopy < 0) || (*ncopy > Nbasis)) return (207); @@ -920,7 +923,8 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta if ((copy_list[0] < 0) || (copy_list[*ncopy - 1] >= Nbasis)) return (208); - if ((*xextent==0)||(*yextent==0)||(*zextent==0)) return(0); + if ((*xextent == 0) || (*yextent == 0) || (*zextent == 0)) + return (0); for (int i = 1; i < *ncopy; i++) if (copy_list[i] <= copy_list[i - 1]) @@ -1009,17 +1013,17 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t 
*zsta { int error_code; - if (error_code = return_binary_tree_cell_lists(level_max, list_cell_x_coord, - *xextent, ret_x_list_coords, nreturn_x, child_pointer_x, - level_count_x, level_begin_x, index_perm_x)) + if ((error_code = return_binary_tree_cell_lists(level_max, list_cell_x_coord, + *xextent, ret_x_list_coords, nreturn_x, child_pointer_x, + level_count_x, level_begin_x, index_perm_x))) return (error_code); - if (error_code = return_binary_tree_cell_lists(level_max, list_cell_y_coord, - *yextent, ret_y_list_coords, nreturn_y, child_pointer_y, - level_count_y, level_begin_y, index_perm_y)) + if ((error_code = return_binary_tree_cell_lists(level_max, list_cell_y_coord, + *yextent, ret_y_list_coords, nreturn_y, child_pointer_y, + level_count_y, level_begin_y, index_perm_y))) return (error_code); - if (error_code = return_binary_tree_cell_lists(level_max, list_cell_z_coord, - *zextent, ret_z_list_coords, nreturn_z, child_pointer_z, - level_count_z, level_begin_z, index_perm_z)) + if ((error_code = return_binary_tree_cell_lists(level_max, list_cell_z_coord, + *zextent, ret_z_list_coords, nreturn_z, child_pointer_z, + level_count_z, level_begin_z, index_perm_z))) return (error_code); }; //=================================================================== @@ -1035,7 +1039,7 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta }; if (*verbose) - printf("Total number cells: %llu \n", number_of_cells); + printf("Total number cells: %lu \n", number_of_cells); cell_memory_to_allocate = sizeof(PAN_REAL) * number_of_cells * Nbasis; }; @@ -1047,13 +1051,13 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta //======================================================================================== // Loop over octree starting at the root, for all relevant cells at each level //======================================================================================== - size_t total_number_cells = 0; - size_t 
num_cell_compute = 0; - size_t num_level_max_cells = 0; - size_t total_num_children = 0; + // size_t total_number_cells = 0; + // size_t num_cell_compute = 0; + // size_t num_level_max_cells = 0; + // size_t total_num_children = 0; { size_t cell_index, j1, j2, j3; - size_t child_cells[8]; + // size_t child_cells[8]; size_t xoffset, yoffset, zoffset; size_t ix, iy, iz; size_t xco, yco, zco; @@ -1135,7 +1139,7 @@ int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t *zsta }; // end loop over possible children if (*verbose > 1) - printf("Cell: L%llu %llu %llu %llu\n", level, j1, j2, j3); + printf("Cell: L%lu %lu %lu %lu\n", level, j1, j2, j3); }; // z/y/x-coordinate/level @@ -1262,31 +1266,31 @@ int parse_and_validate_descriptor_(const char *descriptor, int *pan_mode) switch (nelement) { case 1: - if (sscanf(token, "Panph%llu", &desc_order) != 1) + if (sscanf(token, "Panph%lu", &desc_order) != 1) return (440001); break; case 2: - if (sscanf(token, "L%llu", &desc_level) != 1) + if (sscanf(token, "L%lu", &desc_level) != 1) return 440002; break; case 3: - if (sscanf(token, "%llu", &desc_x) != 1) + if (sscanf(token, "%lu", &desc_x) != 1) return 440003; break; case 4: - if (sscanf(token, "%llu", &desc_y) != 1) + if (sscanf(token, "%lu", &desc_y) != 1) return 440004; break; case 5: - if (sscanf(token, "%llu", &desc_z) != 1) + if (sscanf(token, "%lu", &desc_z) != 1) return 440005; break; case 6: - if (sscanf(token, "S%llu", &desc_size) != 1) + if (sscanf(token, "S%lu", &desc_size) != 1) return 440005; break; case 7: - if (sscanf(token, "KK%lld", &desc_kk_limit) == 1) + if (sscanf(token, "KK%lu", &desc_kk_limit) == 1) { kk_limit_set = 1; token = strtok(NULL, split); @@ -1295,7 +1299,7 @@ int parse_and_validate_descriptor_(const char *descriptor, int *pan_mode) return 440006; break; case 8: - if (sscanf(token, "%s", &desc_name) != 1) + if (sscanf(token, "%199s", desc_name) != 1) return 440007; break; } @@ -1304,12 +1308,12 @@ int 
parse_and_validate_descriptor_(const char *descriptor, int *pan_mode) if (kk_limit_set == 0) { - sprintf(descriptor_as_read, "[Panph%llu,L%llu,(%llu,%llu,%llu),S%llu,CH%lld,%s]", + sprintf(descriptor_as_read, "[Panph%lu,L%lu,(%lu,%lu,%lu),S%lu,CH%lld,%s]", desc_order, desc_level, desc_x, desc_y, desc_z, desc_size, desc_ch, desc_name); } else { - sprintf(descriptor_as_read, "[Panph%llu,L%llu,(%llu,%llu,%llu),S%llu,KK%lld,CH%lld,%s]", + sprintf(descriptor_as_read, "[Panph%lu,L%lu,(%lu,%lu,%lu),S%lu,KK%ld,CH%lld,%s]", desc_order, desc_level, desc_x, desc_y, desc_z, desc_size, desc_kk_limit, desc_ch, desc_name); } @@ -1334,7 +1338,7 @@ int parse_and_validate_descriptor_(const char *descriptor, int *pan_mode) strcpy(full_descriptor, descriptor); descriptor_read_in = 1; - *pan_mode = (desc_order==1)? 0:1; // 0 - Old descriptor: 1 HO descriptor + *pan_mode = (desc_order == 1) ? 0 : 1; // 0 - Old descriptor: 1 HO descriptor comp_ch = compute_check_digit_(); // check the check digit @@ -1359,7 +1363,7 @@ void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel, size_t *xa // printf("descriptor_zorigin %llu rel_level %llu zrel %llu rel_origin_z %llu rel_coord_max %llu \n descriptor_base_level %llu, zabs %llu\n", //y descriptor_zorigin,rel_level,zrel,rel_origin_z,rel_coord_max,descriptor_base_level,*zabs); -}; +} int cell_information(size_t cell_id, size_t *cumulative_cell_index, size_t *cuboid_x_dimen, size_t *cuboid_y_dimen, size_t *cuboid_z_dimen, size_t *cell_lev, @@ -1452,44 +1456,44 @@ int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinate return (0); } - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - 
///////////////////////////////////////////////////////////////////////// - // - // Test code for checking the appropriate moments are preserved - // between levels in Panphasia - // - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +// +// Test code for checking the appropriate moments are preserved +// between levels in Panphasia +// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////// #include void integrate_cell(int, int, int, size_t, 
size_t, size_t, FFTW_REAL *, double *); int compute_panphasia_(double, double, double, size_t, size_t, size_t, FFTW_REAL *, double *); -void test_cell_moments(char *, size_t, size_t, size_t, size_t, size_t, double *); +void test_cell_moments(const char *, size_t, size_t, size_t, size_t, size_t, double *); //////////////////////////////////////////////////////////////////////////////// void test_moments_() { - int lev = 10; + // int lev = 10; char descriptor_demo[300] = "Hello!"; printf("Demo string %s\n", descriptor_demo); // descriptor_pair_generate_();//, descriptor_demo); printf("Parameters: %s\n", descriptor_demo); - size_t nlevel = 1; + // size_t nlevel = 1; double coefficients1[Nbasis]; double coefficients2[Nbasis]; @@ -1515,7 +1519,7 @@ void test_moments_() yco = (yco_full) >> (63 - level); zco = (zco_full) >> (63 - level); - sprintf(descriptor, "[Panph6,L%ld,(%llu,%llu,%llu),S1,CH-999,test]", level, xco, yco, zco); + sprintf(descriptor, "[Panph6,L%ld,(%lu,%lu,%lu),S1,CH-999,test]", level, xco, yco, zco); // printf("%s\n",descriptor); test_cell_moments(descriptor, 0, 0, 0, 0, 1, coefficients1); @@ -1560,7 +1564,7 @@ void test_moments_() printf("Completed moment test successfully.\n"); } -void test_cell_moments(char root_descriptor[200], size_t rel_lev, size_t rel_orig_x, +void test_cell_moments(const char *root_descriptor, size_t rel_lev, size_t rel_orig_x, size_t rel_orig_y, size_t rel_orig_z, size_t extent, double *coeff) { @@ -1572,8 +1576,8 @@ void test_cell_moments(char root_descriptor[200], size_t rel_lev, size_t rel_ori verbose = 0; - if (error_code = PANPHASIA_init_level_(&rel_lev, - &rel_orig_x, &rel_orig_y, &rel_orig_z, &verbose)) + if ((error_code = PANPHASIA_init_level_(&rel_lev, + &rel_orig_x, &rel_orig_y, &rel_orig_z, &verbose))) { printf("Error %d in initialing PANPHASIA_init_level_\n", error_code); @@ -1598,9 +1602,9 @@ void test_cell_moments(char root_descriptor[200], size_t rel_lev, size_t rel_ori abort(); } - if (error_code = 
PANPHASIA_compute_coefficients_(&xstart, &ystart, &zstart, - &xextent, &yextent, &zextent, copy_list, &ncopy, - output_values, &flag_output_mode, &verbose)) + if ((error_code = PANPHASIA_compute_coefficients_(&xstart, &ystart, &zstart, + &xextent, &yextent, &zextent, copy_list, &ncopy, + output_values, &flag_output_mode, &verbose))) { printf("Error %d in PANPHASIA_compute_coefficients_ \n", error_code); @@ -1622,7 +1626,8 @@ void test_cell_moments(char root_descriptor[200], size_t rel_lev, size_t rel_ori */ double sum_coefficients[Nbasis]; - for( size_t i=0; i #endif - extern const int Nbasis; extern const int irank_p[3][84]; @@ -22,556 +21,513 @@ extern size_t descriptor_kk_limit; extern size_t descriptor_base_size; int PANPHASIA_compute_kspace_field_(size_t relative_level, ptrdiff_t N0_fourier_grid, - ptrdiff_t local_n0_fourier_return, ptrdiff_t local_0_start_fourier_return, - FFTW_COMPLEX *return_field) + ptrdiff_t local_n0_fourier_return, ptrdiff_t local_0_start_fourier_return, + FFTW_COMPLEX *return_field) { -size_t copy_list[Nbasis]; + size_t copy_list[Nbasis]; + int pmax = 6; + int nsubdivide = 21; //(pmax%2==0)?pmax+1:pmax+2; + size_t ncopy = (pmax + 1) * (pmax + 2) * (pmax + 3) / 6; -int pmax = 6; + if (ncopy % nsubdivide != 0) + return (100010); + int nchunk = ncopy / nsubdivide; - int nsubdivide = 21; //(pmax%2==0)?pmax+1:pmax+2; + int verbose = 1; + int flag_output_mode = 2; + int error; + ptrdiff_t size_to_alloc_fourier; + ptrdiff_t size_to_alloc_pan; + ptrdiff_t local_fourier_x_start, local_fourier_x_end; + FFTW_PLAN output_coeff_forward_plan; -size_t ncopy = (pmax+1)*(pmax+2)*(pmax+3)/6; + ptrdiff_t N0_pan_grid = descriptor_base_size << relative_level; -if (ncopy%nsubdivide!=0) return(100010); -int nchunk = ncopy/nsubdivide; + if (N0_fourier_grid % N0_pan_grid != 0) + return (100015); -int verbose = 1; -int flag_output_mode=2; -int error; -ptrdiff_t size_to_alloc_fourier; -ptrdiff_t size_to_alloc_pan; - ptrdiff_t local_fourier_x_start, 
local_fourier_x_end; -FFTW_PLAN output_coeff_forward_plan; + int fdim = N0_fourier_grid / N0_pan_grid; + size_t nfft_dim = N0_fourier_grid; + size_t npan_dim = N0_pan_grid; -ptrdiff_t N0_pan_grid = descriptor_base_size< descriptor_order) + return (100020); -int SHARED_FOUR_PAN_SPACE = (nsubdivide==1)&&(fdim==1)&&(sizeof(PAN_REAL)==sizeof(FFTW_REAL)); + for (size_t i = 0; i < Nbasis; i++) + copy_list[i] = i; + //printf("Dimensions of FT (%td,%td,%td)\n",N0_fourier_grid,N0_fourier_grid,N0_fourier_grid); + //printf("Dimensions of PG (%td,%td,%td)\n",N0_pan_grid,N0_pan_grid,N0_pan_grid); + //printf("local_no %td local_0_start_fourier %td\n",local_n0_fourier_return, local_0_start_fourier_return); + // Compute 1-D Spherical Bessel coefficients for each order ////////////////// + // These are needed for the convolutions below ////////////////// + size_t n4dimen; -//////////////////////////////////////////////////////////////////////////////////// + n4dimen = (nfft_dim % 4 == 0) ? 4 * (nfft_dim / 4) + 4 : 4 * (nfft_dim / 4) + 5; -if (pmax>descriptor_order) return(100020); + double complex *sph_bessel_coeff = FFTW_MALLOC(sizeof(double complex) * n4dimen * (pmax + 1)); -for (size_t i=0; infft_dim/2) ? - ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return; - ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy; - kz = (iz > nfft_dim/2) ? iz-nfft_dim : iz; + for (int ix = 0; ix < local_n0_fourier_return; ix++) + for (int iy = 0; iy < nfft_dim; iy++) + for (int iz = 0; iz <= nfft_dim / 2; iz++) + { + index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz; + kx = (ix + local_0_start_fourier_return > nfft_dim / 2) ? ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return; + ky = (iy > nfft_dim / 2) ? iy - nfft_dim : iy; + kz = (iz > nfft_dim / 2) ? iz - nfft_dim : iz; - if ( (kx==nfft_dim/2)||(ky==nfft_dim/2)||(kz==nfft_dim/2)){ - // Set Nyquist modes to zero - not used by IC_Gen anyway. 
- phase_shift_and_scale = 0.0; //1.0/pow((double)nfft_dim,1.5); // No phase shift - ptr_mode_weightings[index1] = 0.0; // Set squared weight to zero - }else{ - phase_shift_and_scale = sqrt( (double)(fdim*fdim*fdim))* - cexp( (double)fdim * (-I)*pi*(double)(kx + ky + kz)/sqrt(ptr_mode_weightings[index1])/ - (double)nfft_dim)/pow((double)nfft_dim,1.5); + if ((kx == nfft_dim / 2) || (ky == nfft_dim / 2) || (kz == nfft_dim / 2)) + { + // Set Nyquist modes to zero - not used by IC_Gen anyway. + phase_shift_and_scale = 0.0; //1.0/pow((double)nfft_dim,1.5); // No phase shift + ptr_mode_weightings[index1] = 0.0; // Set squared weight to zero + } + else + { + phase_shift_and_scale = sqrt((double)(fdim * fdim * fdim)) * + cexp((double)fdim * (-I) * pi * (double)(kx + ky + kz) / sqrt(ptr_mode_weightings[index1]) / + (double)nfft_dim) / + pow((double)nfft_dim, 1.5); + }; + + return_field[index1] *= phase_shift_and_scale; + if (ptr_mode_weightings[index1] < min_weight) + min_weight = ptr_mode_weightings[index1]; }; - return_field[index1] *= phase_shift_and_scale; - if (ptr_mode_weightings[index1]0){ - size_t index1; - complex weight; - size_t ksquared; - int kx,ky,kz; + if (descriptor_kk_limit > 0) + { + size_t index1; + complex weight; + size_t ksquared; + int kx, ky, kz; #ifdef USE_OPENMP -#pragma omp parallel for collapse(3) \ - private (index1,kx,ky,kz,ksquared,weight) +#pragma omp parallel for collapse(3) private(index1, kx, ky, kz, ksquared, weight) #endif - for(int ix=0;ixnfft_dim/2) ? - ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return; - ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy; - kz = (iz > nfft_dim/2) ? 
iz-nfft_dim : iz; - ksquared = kx*kx + ky*ky + kz*kz; - if ( (kx!=nfft_dim/2)&&(ky!=nfft_dim/2)&&(kz!=nfft_dim/2)){ //Omit Nyquist modes + for (int ix = 0; ix < local_n0_fourier_return; ix++) + for (int iy = 0; iy < nfft_dim; iy++) + for (int iz = 0; iz <= nfft_dim / 2; iz++) + { + kx = (ix + local_0_start_fourier_return > nfft_dim / 2) ? ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return; + ky = (iy > nfft_dim / 2) ? iy - nfft_dim : iy; + kz = (iz > nfft_dim / 2) ? iz - nfft_dim : iz; + ksquared = kx * kx + ky * ky + kz * kz; + if ((kx != nfft_dim / 2) && (ky != nfft_dim / 2) && (kz != nfft_dim / 2)) + { //Omit Nyquist modes - if ((ksquared<=descriptor_kk_limit)&&(ksquared!=0)){ - index1 = ix*N0_fourier_grid*(N0_fourier_grid/2+1) + iy*(N0_fourier_grid/2+1) + iz; - weight = cabs(return_field[index1]); - return_field[index1] /= weight; - }; - }; - }; - }; - - //printf("Reached here 12!\n"); + if ((ksquared <= descriptor_kk_limit) && (ksquared != 0)) + { + index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz; + weight = cabs(return_field[index1]); + return_field[index1] /= weight; + }; + }; + }; + }; + //printf("Reached here 12!\n"); -int rank; - MPI_Comm_rank(MPI_COMM_WORLD,&rank); + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); char filename[100]; - sprintf(filename,"output_k_space_alt.%d",rank); + sprintf(filename, "output_k_space_alt.%d", rank); FILE *fp; + if (nfft_dim < -1) + { + FILE *fp; -if (nfft_dim <-1){ + fp = fopen(filename, "w"); + for (int ix = 0; ix < local_n0_fourier_return; ix++) + for (int iy = 0; iy < nfft_dim; iy++) + for (int iz = 0; iz <= nfft_dim / 2; iz++) + { - - FILE *fp; + int index = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz; + fprintf(fp, "%6lu%6d%6d %14.8lf %14.8lf %14.8lf \n", ix + local_0_start_fourier_return, iy, iz, + creal(return_field[index]), cimag(return_field[index]), sqrt(ptr_mode_weightings[index])); + // 
ptr_mode_weightings[index]); + }; + fclose(fp); + } + else + { - fp = fopen(filename,"w"); - - for (int ix=0; ix63)){ - printf("Level %llu is out of range (0-63)!\n",l); // Not part of Panphasia - abort(); - }; + if ((l < 0) || (l > 63)) + { + printf("Level %lu is out of range (0-63)!\n", l); // Not part of Panphasia + abort(); + }; - if ((j1>>l!=0)||(j2>>l!=0)||(j3>>l!=0)){ // Cell outside of Panphasia - printf("Level %llu: Cell coordinate out of range (%llu,%llu,%llu)\n",l,j1,j2,j3); - abort(); - }; + if ((j1 >> l != 0) || (j2 >> l != 0) || (j3 >> l != 0)) + { // Cell outside of Panphasia + printf("Level %lu: Cell coordinate out of range (%lu,%lu,%lu)\n", l, j1, j2, j3); + abort(); + }; - // Only allow a non-zero value for the seed if the safety catch has a specific value + // Only allow a non-zero value for the seed if the safety catch has a specific value - if (allow_non_zero_seed_saftey_catch != 1002003004005006007){ - seed_value = 0; - }; + if (allow_non_zero_seed_saftey_catch != 1002003004005006007) + { + seed_value = 0; + }; - size_t root_cell_calculation=0; - //============================================================================= - // Exception - for computing the parent properties of the root cell only - //============================================================================= - if ((allow_non_zero_seed_saftey_catch == 1002003004005006007)&&(seed_value == 1000000000999)){ - l=0; - j0 = (p_order<<60); - j1 = 2; - j2 = 2; - j3 = 2; - if (p_order>8){printf("Multipole order too high\n");abort();}; - seed_value = 0; - root_cell_calculation = 1; //Signal root cell properties are being calculated - }; - //=================================================== + size_t root_cell_calculation = 0; + //============================================================================= + // Exception - for computing the parent properties of the root cell only + //============================================================================= + if 
((allow_non_zero_seed_saftey_catch == 1002003004005006007) && (seed_value == 1000000000999)) + { + l = 0; + j0 = (p_order << 60); + j1 = 2; + j2 = 2; + j3 = 2; + if (p_order > 8) + { + printf("Multipole order too high\n"); + abort(); + }; + seed_value = 0; + root_cell_calculation = 1; //Signal root cell properties are being calculated + }; + //=================================================== + if (seed_value >> 32 != 0) + { + printf("Seed value %lu, outside range 0 <= seed <2^32 \n", seed_value); + abort(); + }; - if (seed_value>>32!=0){ - printf("Seed value %llu, outside range 0 <= seed <2^32 \n",seed_value); - abort(); - }; + // END ERROR CHECKING // - // END ERROR CHECKING // + int nloop = Nbasis; // Generate eight uniform randoms per call of Threefry4x64 // + size_t k0, k1, k2, k3; - int nloop = Nbasis; // Generate eight uniform randoms per call of Threefry4x64 // + j0 = (p_order << 60) + ((l << 56) >> 4) + ((seed_value << 32) >> 12); - - size_t k0,k1,k2,k3; - - j0 = (p_order<<60) + ((l<<56)>>4) + ((seed_value<<32)>>12); - k0 = j0; k1 = j1; k2 = j2; k3 = j3; - if ((root_cell_calculation)&&(verbose_warnings_only!=1)){ + if ((root_cell_calculation) && (verbose_warnings_only != 1)) + { printf("============================================================================================\n"); printf("Computing root cell properties\n"); - printf("p_order, l, seed_value: (j0,j1,j2,j3),%llx %llx %llx (%llx,%llx,%llx,%llx)\n", - p_order,l,seed_value,j0,j1,j2,j3); - printf("Encoded root cell values:(k0,k1,k2,k3):\n (%llx,%llx,%llx,%llx)\n",k0,k1,k2,k3); - printf("============================================================================================\n"); - }; - - + printf("p_order, l, seed_value: (j0,j1,j2,j3),%llx %lx %lx (%lx,%lx,%lx,%lx)\n", + p_order, l, seed_value, j0, j1, j2, j3); + printf("Encoded root cell values:(k0,k1,k2,k3):\n (%lx,%lx,%lx,%lx)\n", k0, k1, k2, k3); + 
printf("============================================================================================\n"); + }; ctr.v[0] = k0; ctr.v[1] = k1; ctr.v[2] = k2; ctr.v[3] = k3; - ncount = 0; - + for (i = 0; i < nloop; ++i) + { - for(i=0; i> 32; + out_int[1] = (rand.v[0] << 32) >> 32; - out_int[0] = rand.v[0]>>32; - out_int[1] = (rand.v[0]<<32)>>32; + out_int[2] = rand.v[1] >> 32; + out_int[3] = (rand.v[1] << 32) >> 32; - out_int[2] = rand.v[1]>>32; - out_int[3] = (rand.v[1]<<32)>>32; + out_int[4] = rand.v[2] >> 32; + out_int[5] = (rand.v[2] << 32) >> 32; - out_int[4] = rand.v[2]>>32; - out_int[5] = (rand.v[2]<<32)>>32; + out_int[6] = rand.v[3] >> 32; + out_int[7] = (rand.v[3] << 32) >> 32; - out_int[6] = rand.v[3]>>32; - out_int[7] = (rand.v[3]<<32)>>32; + for (j = 0; j < 8; ++j) + unif_real[ncount++] = (((double)out_int[j] + g_shift) * g_scale); + }; - + for (i = 0; i < 8 * Nbasis; i++) + panphasia_randoms[i] = unif_real[i]; - for (j=0; j<8;++j) unif_real[ncount++] = ( ((double)out_int[j] + g_shift)*g_scale); + // Exceptional branch with the aim ultimately of filling the Gaussian tail. + // Executed rarely so does not need to be particularly efficient. + // For this reason it include an error check. Can the value + // that triggered this loop be reproduced? If it cannot, the code aborts. - }; - + size_t branch_value = 4096; + PAN_REAL branching_ratio = (((double)branch_value) * g_scale); - - for (i=0; i<8*Nbasis; i++) panphasia_randoms[i] = unif_real[i]; + //PAN_REAL branching_ratio = -0.3; - - // Exceptional branch with the aim ultimately of filling the Gaussian tail. - // Executed rarely so does not need to be particularly efficient. - // For this reason it include an error check. Can the value - // that triggered this loop be reproduced? If it cannot, the code aborts. 
- - size_t branch_value = 4096; - PAN_REAL branching_ratio = ( ((double)branch_value)*g_scale); + for (size_t i = 0; i < 8 * Nbasis; i += 2) + if (panphasia_randoms[i] < branching_ratio) + { - //PAN_REAL branching_ratio = -0.3; - - for (size_t i=0; i<8*Nbasis; i+=2) if (panphasia_randoms[i]> 4) + ((seed_value << 32) >> 12); + //code_cell(j0,j1,j2,j3,&k0,&k1,&k2,&k3); - j0 = (p_order<<60) + ((l<<56)>>4) + ((seed_value<<32)>>12); - //code_cell(j0,j1,j2,j3,&k0,&k1,&k2,&k3); + k0 = j0; + k1 = j1; + k2 = j2; + k3 = j3; - k0 = j0; - k1 = j1; - k2 = j2; - k3 = j3; + ctr.v[0] = k0 + iind; + ctr.v[1] = k1; + ctr.v[2] = k2; + ctr.v[3] = k3; - ctr.v[0] = k0+iind; - ctr.v[1] = k1; - ctr.v[2] = k2; - ctr.v[3] = k3; + // ctr.v[0] = k0+iind*increment; + // ctr.v[1] = k1+iind*increment; + //ctr.v[2] = k2+iind*increment; + //ctr.v[3] = k3+iind*increment; - // ctr.v[0] = k0+iind*increment; - // ctr.v[1] = k1+iind*increment; - //ctr.v[2] = k2+iind*increment; - //ctr.v[3] = k3+iind*increment; + rand = threefry4x64_R(NUMBER_THREEFRY_ROUNDS, ctr, panphasia_key); - rand = threefry4x64_R(NUMBER_THREEFRY_ROUNDS ,ctr, panphasia_key); + out_int[0] = rand.v[0] >> 32; + out_int[1] = (rand.v[0] << 32) >> 32; - out_int[0] = rand.v[0]>>32; - out_int[1] = (rand.v[0]<<32)>>32; + out_int[2] = rand.v[1] >> 32; + out_int[3] = (rand.v[1] << 32) >> 32; - out_int[2] = rand.v[1]>>32; - out_int[3] = (rand.v[1]<<32)>>32; + out_int[4] = rand.v[2] >> 32; + out_int[5] = (rand.v[2] << 32) >> 32; - out_int[4] = rand.v[2]>>32; - out_int[5] = (rand.v[2]<<32)>>32; + out_int[6] = rand.v[3] >> 32; + out_int[7] = (rand.v[3] << 32) >> 32; - out_int[6] = rand.v[3]>>32; - out_int[7] = (rand.v[3]<<32)>>32; + new_value = (((double)out_int[jind] + g_shift) * g_scale); - new_value = ( ((double)out_int[jind] + g_shift)*g_scale); + if (loop == 0) + { + if (new_value != panphasia_randoms[i]) + { + printf("Failure to reproduce the initial random that triggered this branch - a serious error!\n"); + abort(); + } + } + else + { + 
if (new_value >= branching_ratio) + { - if (loop==0){ - if (new_value != panphasia_randoms[i]){ - printf("Failure to reproduce the initial random that triggered this branch - a serious error!\n"); - abort(); - }}else{ + replacement_value *= new_value; + } + else + { - if (new_value>=branching_ratio){ + size_t counter = 0; - replacement_value *= new_value; - }else{ + while ((new_value < branching_ratio) && (counter < 7)) + { + replacement_value *= branching_ratio; + counter++; + jind = (++jind) % 8; + new_value = (((double)out_int[jind] + g_shift) * g_scale); + }; + replacement_value *= new_value; + //if (new_value -long long int compute_check_digit_(){ +long long int compute_check_digit_() +{ char str[200]; long long int check_digit; @@ -485,15 +500,15 @@ long long int compute_check_digit_(){ threefry4x64_ctr_t ctr, rand; threefry4x64_key_t key; - if (descriptor_read_in==0){ + if (descriptor_read_in == 0) + { printf("No descriptor has been set\n"); abort(); }; - - sprintf(str, "%llu%llu%llu%llu%llu%llu%llu%s",descriptor_order,descriptor_base_level, - descriptor_xorigin,descriptor_yorigin,descriptor_zorigin, - descriptor_base_size,descriptor_kk_limit,descriptor_name); + sprintf(str, "%lu%lu%lu%lu%lu%lu%lu%s", descriptor_order, descriptor_base_level, + descriptor_xorigin, descriptor_yorigin, descriptor_zorigin, + descriptor_base_size, descriptor_kk_limit, descriptor_name); key = key_constant; @@ -502,19 +517,17 @@ long long int compute_check_digit_(){ ctr.v[2] = 0; ctr.v[3] = 0; - - for (int i =0; i>32); - - return(check_digit); -}; + check_digit = (ctr.v[0] >> 32); + return (check_digit); +} ////////////////////////////////////////////////////////////////////////////// // Construct pairs of overlapping random descriptors for testing @@ -522,19 +535,14 @@ long long int compute_check_digit_(){ // essentially identical. 
////////////////////////////////////////////////////////////////////////////// - void test_propogation_of_moments_(int iterations) { - - - const int level_max=62; - + const int level_max = 62; threefry4x64_ctr_t ctr, rand; threefry4x64_key_t key; - key = key_constant; ctr.v[0] = 0; @@ -542,190 +550,193 @@ void test_propogation_of_moments_(int iterations) ctr.v[2] = 0; ctr.v[3] = 0; - int levplus=1; - if (iterations==0){ + int levplus = 1; + if (iterations == 0) + { iterations = 1; - levplus = 5; - + levplus = 5; }; - for(int it=0; it0){ - level_cell = level_desc1 + ctr.v[0]%(level_max-level_desc1); - } - else{ - level_cell = level_desc1; - }; + if (level_max - level_desc1 > 0) + { + level_cell = level_desc1 + ctr.v[0] % (level_max - level_desc1); + } + else + { + level_cell = level_desc1; + }; - if (level_cell-level_desc1>0){ - level_desc2 = level_desc1 + ctr.v[1]%(level_cell-level_desc1); + if (level_cell - level_desc1 > 0) + { + level_desc2 = level_desc1 + ctr.v[1] % (level_cell - level_desc1); + } + else + { + level_desc2 = level_desc1; + }; - }else{ - level_desc2 = level_desc1; - }; + size_t side_length2 = (size_t)1 << level_desc2; + size_t side_length3 = (size_t)1 << level_cell; - - size_t side_length2 = (size_t)1<> (64 - level_cell); + size_t ycell = ctr.v[1] >> (64 - level_cell); + size_t zcell = ctr.v[2] >> (64 - level_cell); - size_t xcell = ctr.v[0]>>(64-level_cell); - size_t ycell = ctr.v[1]>>(64-level_cell); - size_t zcell = ctr.v[2]>>(64-level_cell); + //size_t cell_level_size = (size_t)1<> (level_cell - level_desc1)) - dx1 + side_length1) % side_length1; + size_t desc1_y = ((ycell >> (level_cell - level_desc1)) - dy1 + side_length1) % side_length1; + size_t desc1_z = ((zcell >> (level_cell - level_desc1)) - dz1 + side_length1) % side_length1; + size_t desc2_x = ((xcell >> (level_cell - level_desc2)) - dx2 + side_length2) % side_length2; + size_t desc2_y = ((ycell >> (level_cell - level_desc2)) - dy2 + side_length2) % side_length2; + size_t desc2_z = 
((zcell >> (level_cell - level_desc2)) - dz2 + side_length2) % side_length2; - size_t desc1_x = ((xcell>>(level_cell-level_desc1))-dx1+side_length1)%side_length1; - size_t desc1_y = ((ycell>>(level_cell-level_desc1))-dy1+side_length1)%side_length1; - size_t desc1_z = ((zcell>>(level_cell-level_desc1))-dz1+side_length1)%side_length1; + char descriptor1[300]; + char descriptor2[300]; - size_t desc2_x = ((xcell>>(level_cell-level_desc2))-dx2+side_length2)%side_length2; - size_t desc2_y = ((ycell>>(level_cell-level_desc2))-dy2+side_length2)%side_length2; - size_t desc2_z = ((zcell>>(level_cell-level_desc2))-dz2+side_length2)%side_length2; + sprintf(descriptor1, "[Panph%lld,L%ld,(%lu,%lu,%lu),S%lu,CH-999,test]", + p_order, level_desc1, desc1_x, desc1_y, desc1_z, desc1_s); + sprintf(descriptor2, "[Panph%lld,L%ld,(%lu,%lu,%lu),S%lu,CH-999,test]", + p_order, level_desc2, desc2_x, desc2_y, desc2_z, desc2_s); + //printf("Descriptor 1: %s\nDescriptor 2: %s\n",descriptor1,descriptor2); -char descriptor1[300]; -char descriptor2[300]; + rand = threefry4x64(ctr, key); + ctr = rand; - sprintf(descriptor1,"[Panph%ld,L%ld,(%llu,%llu,%llu),S%llu,CH-999,test]", - p_order,level_desc1,desc1_x,desc1_y,desc1_z,desc1_s); - - sprintf(descriptor2,"[Panph%ld,L%ld,(%llu,%llu,%llu),S%llu,CH-999,test]", - p_order,level_desc2,desc2_x,desc2_y,desc2_z,desc2_s); + size_t rel_level1 = level_cell - level_desc1; + size_t rel_level2 = level_cell - level_desc2; - //printf("Descriptor 1: %s\nDescriptor 2: %s\n",descriptor1,descriptor2); + size_t xstart1 = (xcell - (desc1_x << rel_level1) + side_length3) % side_length3; + size_t ystart1 = (ycell - (desc1_y << rel_level1) + side_length3) % side_length3; + size_t zstart1 = (zcell - (desc1_z << rel_level1) + side_length3) % side_length3; + size_t xstart2 = (xcell - (desc2_x << rel_level2) + side_length3) % side_length3; + size_t ystart2 = (ycell - (desc2_y << rel_level2) + side_length3) % side_length3; + size_t zstart2 = (zcell - (desc2_z << rel_level2) + 
side_length3) % side_length3; - rand = threefry4x64(ctr, key); ctr = rand; + size_t extent1 = 1; + size_t extent2 = 1; + if (ctr.v[0] % 2 == 0) + { + rel_level1++; + xstart1 *= 2; + ystart1 *= 2; + zstart1 *= 2; + extent1 *= 2; + } + else + { + rel_level2++; + xstart2 *= 2; + ystart2 *= 2; + zstart2 *= 2; + extent2 *= 2; + }; - size_t rel_level1 = level_cell - level_desc1; - size_t rel_level2 = level_cell - level_desc2; + //printf("1: relative level %llu Rel offset: (%llu %llu %llu) Extent %llu\n",rel_level1,xstart1, + // ystart1,zstart1,extent1); - size_t xstart1 = (xcell-(desc1_x< max_diff2) + max_diff2 = diff2; + rms_diff2 += diff2; + }; + // printf("%s\n%s\n",descriptor1,descriptor2); + //printf("Example coeff %18.12lf %18.12lf \n",coefficients1[0],coefficients2[0]); - //printf("1: relative level %llu Rel offset: (%llu %llu %llu) Extent %llu\n",rel_level1,xstart1, - // ystart1,zstart1,extent1); + rms_diff2 /= (double)Nbasis; - //printf("2: relative level %llu Rel offset: (%llu %llu %llu) Extent %llu\n",rel_level2,xstart2, - // ystart2,zstart2,extent2); + if ((sizeof(PAN_REAL) == 4) || (sizeof(FFTW_REAL) == 4)) + { + if ((max_diff2 > 1.e-12) || (rms_diff2 > 1.e-12)) + { + printf("Moments not accurately recovered at single precision\n"); + abort(); + }; + } + else + { -verbose_warnings_only=1; // Minimize output to screen. 
+ if ((max_diff2 > 1.e-24) || (rms_diff2 > 1.e-24)) + { + printf("Moments not accurately recovered at double precision\n"); + abort(); + }; + }; - - - double max_diff2=0.0; - double rms_diff2=0.0; - - double coefficients1[Nbasis]; - double coefficients2[Nbasis]; - - - - - test_cell_moments(descriptor1,rel_level1,xstart1,ystart1,zstart1,extent1,coefficients1); - test_cell_moments(descriptor2,rel_level2,xstart2,ystart2,zstart2,extent2,coefficients2); - - - for (int i=0; imax_diff2) max_diff2=diff2; - rms_diff2+=diff2; - }; - - // printf("%s\n%s\n",descriptor1,descriptor2); - - //printf("Example coeff %18.12lf %18.12lf \n",coefficients1[0],coefficients2[0]); - - rms_diff2/=(double)Nbasis; - - - if ((sizeof(PAN_REAL)==4)||(sizeof(FFTW_REAL)==4)){ - - if ((max_diff2>1.e-12)||(rms_diff2>1.e-12)){ - printf("Moments not accurately recovered at single precision\n"); abort(); + // printf("lev %d Acceptable differences: %e RMS difference %e\n",lev,sqrt(max_diff2),sqrt(rms_diff2)); }; - }else{ - - if ((max_diff2>1.e-24)||(rms_diff2>1.e-24)){ - printf("Moments not accurately recovered at double precision\n"); abort(); - }; - - }; - - // printf("lev %d Acceptable differences: %e RMS difference %e\n",lev,sqrt(max_diff2),sqrt(rms_diff2)); - + // printf("Test of descriptors/relative coordinates and moments PASSED.\n"); }; - - // printf("Test of descriptors/relative coordinates and moments PASSED.\n"); - }; -}; - +} ///////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// @@ -734,299 +745,560 @@ verbose_warnings_only=1; // Minimize output to screen. 
///////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// - void inverse_threefry4x64_test_(int verbose) { -threefry4x64_ctr_t ctr = {{0x243f6a8885a308d3 , 0x13198a2e03707344 , 0xa4093822299f31d0 , 0x082efa98ec4e6c89 } }; ; -threefry4x64_key_t key = {{0x452821e638d01377 , 0xbe5466cf34e90c6c , 0xbe5466cf34e90c6c , 0xc0ac29b7c97c50dd} }; ; -threefry4x64_ctr_t rand1,rand2; + threefry4x64_ctr_t ctr = {{0x243f6a8885a308d3, 0x13198a2e03707344, 0xa4093822299f31d0, 0x082efa98ec4e6c89}}; + ; + threefry4x64_key_t key = {{0x452821e638d01377, 0xbe5466cf34e90c6c, 0xbe5466cf34e90c6c, 0xc0ac29b7c97c50dd}}; + ; + threefry4x64_ctr_t rand1, rand2; - for (size_t ROUNDS = 0; ROUNDS<21; ROUNDS++){ + for (size_t ROUNDS = 0; ROUNDS < 21; ROUNDS++) + { - rand1 = threefry4x64_R(ROUNDS, ctr, key); + rand1 = threefry4x64_R(ROUNDS, ctr, key); - rand2 = arj_threefry4x64(ROUNDS, ctr, key); + rand2 = arj_threefry4x64(ROUNDS, ctr, key); - if( (rand1.v[0]!=rand2.v[0])||(rand1.v[1]!=rand2.v[1])||(rand1.v[2]!=rand2.v[2])||(rand1.v[3]!=rand2.v[3])){ - printf("Error in arj_threefry4x64 - failing to reproduce Threefry4x64 generator!!!\n"); - abort(); + if ((rand1.v[0] != rand2.v[0]) || (rand1.v[1] != rand2.v[1]) || (rand1.v[2] != rand2.v[2]) || (rand1.v[3] != rand2.v[3])) + { + printf("Error in arj_threefry4x64 - failing to reproduce Threefry4x64 generator!!!\n"); + abort(); + }; + + rand2 = inverse_arj_threefry4x64(ROUNDS, rand1, key); + + if ((ctr.v[0] != rand2.v[0]) || (ctr.v[1] != rand2.v[1]) || (ctr.v[2] != rand2.v[2]) || (ctr.v[3] != rand2.v[3])) + { + printf("Error in arj_threefry4x64 - failing to reproduce INVERSE Threefry4x64 generator!!!\n"); + abort(); + }; }; - rand2 = inverse_arj_threefry4x64(ROUNDS, rand1, key); - - if( (ctr.v[0]!=rand2.v[0])||(ctr.v[1]!=rand2.v[1])||(ctr.v[2]!=rand2.v[2])||(ctr.v[3]!=rand2.v[3])){ - printf("Error in arj_threefry4x64 - failing to reproduce INVERSE Threefry4x64 
generator!!!\n"); - abort(); - }; - - - - }; - return; } - -threefry4x64_ctr_t arj_threefry4x64(size_t R,threefry4x64_ctr_t ctr, - threefry4x64_key_t key){ - size_t x0 = ctr.v[0]; size_t x1 = ctr.v[1]; size_t x2 = ctr.v[2]; size_t x3 = ctr.v[3]; - size_t k0 = key.v[0]; size_t k1 = key.v[1]; size_t k2 = key.v[2]; size_t k3 = key.v[3]; +threefry4x64_ctr_t arj_threefry4x64(size_t R, threefry4x64_ctr_t ctr, + threefry4x64_key_t key) +{ + size_t x0 = ctr.v[0]; + size_t x1 = ctr.v[1]; + size_t x2 = ctr.v[2]; + size_t x3 = ctr.v[3]; + size_t k0 = key.v[0]; + size_t k1 = key.v[1]; + size_t k2 = key.v[2]; + size_t k3 = key.v[3]; size_t k4 = 0x1bd11bdaa9fc1a22; -//--------------------------------------- + //--------------------------------------- + if (R > 20) + abort(); - if (R>20) abort(); + k4 ^= k0; + k4 ^= k1; + k4 ^= k2; + k4 ^= k3; + x0 += k0; + x1 += k1; + x2 += k2; + x3 += k3; -k4^=k0; k4^=k1; k4^=k2; k4^=k3; -x0+=k0;x1+=k1;x2+=k2;x3+=k3; + if (R > 0) + { + x0 += x1; + x1 = (x1 << 14) | (x1 >> 50); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 16) | (x3 >> 48); + x3 ^= x2; + }; + if (R > 1) + { + x0 += x3; + x3 = (x3 << 52) | (x3 >> 12); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 57) | (x1 >> 7); + x1 ^= x2; + }; + if (R > 2) + { + x0 += x1; + x1 = (x1 << 23) | (x1 >> 41); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 40) | (x3 >> 24); + x3 ^= x2; + }; -if (R>0){ - x0+=x1; x1 = (x1<<14)|(x1>>50); x1^=x0; - x2+=x3; x3 = (x3<<16)|(x3>>48); x3^=x2; - }; -if (R>1){ - x0+=x3; x3 = (x3<<52)|(x3>>12); x3^=x0; - x2+=x1; x1 = (x1<<57)|(x1>>7); x1^=x2; -}; + if (R > 3) + { + x0 += x3; + x3 = (x3 << 5) | (x3 >> 59); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 37) | (x1 >> 27); + x1 ^= x2; + //Inject key 1 + x0 += k1; + x1 += k2; + x2 += k3; + x3 += k4; + x3 += 1; + }; -if (R>2){ - x0+=x1; x1 = (x1<<23)|(x1>>41); x1^=x0; - x2+=x3; x3 = (x3<<40)|(x3>>24); x3^=x2; -}; + if (R > 4) + { + x0 += x1; + x1 = (x1 << 25) | (x1 >> 39); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 33) | (x3 >> 31); + x3 ^= x2; + }; -if 
(R>3){ - x0+=x3; x3 = (x3<<5)|(x3>>59); x3^=x0; - x2+=x1; x1 = (x1<<37)|(x1>>27); x1^=x2; - //Inject key 1 - x0+=k1; x1+=k2; x2+=k3; x3+=k4; x3+=1; -}; + if (R > 5) + { + x0 += x3; + x3 = (x3 << 46) | (x3 >> 18); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 12) | (x1 >> 52); + x1 ^= x2; + }; -if (R>4){ - x0+=x1; x1 = (x1<<25)|(x1>>39); x1^=x0; - x2+=x3; x3 = (x3<<33)|(x3>>31); x3^=x2; -}; + if (R > 6) + { + x0 += x1; + x1 = (x1 << 58) | (x1 >> 6); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 22) | (x3 >> 42); + x3 ^= x2; + }; -if (R>5){ - x0+=x3; x3 = (x3<<46)|(x3>>18); x3^=x0; - x2+=x1; x1 = (x1<<12)|(x1>>52); x1^=x2; -}; + if (R > 7) + { + x0 += x3; + x3 = (x3 << 32) | (x3 >> 32); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 32) | (x1 >> 32); + x1 ^= x2; + //Inject key 2 + x0 += k2; + x1 += k3; + x2 += k4; + x3 += k0; + x3 += 2; + }; -if (R>6){ - x0+=x1; x1 = (x1<<58)|(x1>> 6); x1^=x0; - x2+=x3; x3 = (x3<<22)|(x3>>42); x3^=x2; -}; + if (R > 8) + { + x0 += x1; + x1 = (x1 << 14) | (x1 >> 50); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 16) | (x3 >> 48); + x3 ^= x2; + }; -if (R>7){ - x0+=x3; x3 = (x3<<32)|(x3>>32); x3^=x0; - x2+=x1; x1 = (x1<<32)|(x1>>32); x1^=x2; - //Inject key 2 - x0+=k2; x1+=k3; x2+=k4; x3+=k0; x3+=2; -}; + if (R > 9) + { + x0 += x3; + x3 = (x3 << 52) | (x3 >> 12); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 57) | (x1 >> 7); + x1 ^= x2; + }; -if (R>8){ - x0+=x1; x1 = (x1<<14)|(x1>>50); x1^=x0; - x2+=x3; x3 = (x3<<16)|(x3>>48); x3^=x2; -}; + if (R > 10) + { + x0 += x1; + x1 = (x1 << 23) | (x1 >> 41); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 40) | (x3 >> 24); + x3 ^= x2; + }; -if (R>9){ - x0+=x3; x3 = (x3<<52)|(x3>>12); x3^=x0; - x2+=x1; x1 = (x1<<57)|(x1>>7); x1^=x2; -}; + if (R > 11) + { + x0 += x3; + x3 = (x3 << 5) | (x3 >> 59); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 37) | (x1 >> 27); + x1 ^= x2; + //Inject key 3 + x0 += k3; + x1 += k4; + x2 += k0; + x3 += k1; + x3 += 3; + }; -if (R>10){ - x0+=x1; x1 = (x1<<23)|(x1>>41); x1^=x0; - x2+=x3; x3 = (x3<<40)|(x3>>24); x3^=x2; -}; + if (R 
> 12) + { + x0 += x1; + x1 = (x1 << 25) | (x1 >> 39); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 33) | (x3 >> 31); + x3 ^= x2; + }; -if (R>11){ - x0+=x3; x3 = (x3<<5)|(x3>>59); x3^=x0; - x2+=x1; x1 = (x1<<37)|(x1>>27); x1^=x2; - //Inject key 3 - x0+=k3; x1+=k4; x2+=k0; x3+=k1; x3+=3; -}; + if (R > 13) + { + x0 += x3; + x3 = (x3 << 46) | (x3 >> 18); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 12) | (x1 >> 52); + x1 ^= x2; + }; -if (R>12){ - x0+=x1; x1 = (x1<<25)|(x1>>39); x1^=x0; - x2+=x3; x3 = (x3<<33)|(x3>>31); x3^=x2; -}; + if (R > 14) + { + x0 += x1; + x1 = (x1 << 58) | (x1 >> 6); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 22) | (x3 >> 42); + x3 ^= x2; + }; -if (R>13){ - x0+=x3; x3 = (x3<<46)|(x3>>18); x3^=x0; - x2+=x1; x1 = (x1<<12)|(x1>>52); x1^=x2; -}; + if (R > 15) + { + x0 += x3; + x3 = (x3 << 32) | (x3 >> 32); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 32) | (x1 >> 32); + x1 ^= x2; + //Inject key 4 + x0 += k4; + x1 += k0; + x2 += k1; + x3 += k2; + x3 += 4; + }; -if (R>14){ - x0+=x1; x1 = (x1<<58)|(x1>> 6); x1^=x0; - x2+=x3; x3 = (x3<<22)|(x3>>42); x3^=x2; -}; + if (R > 16) + { + x0 += x1; + x1 = (x1 << 14) | (x1 >> 50); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 16) | (x3 >> 48); + x3 ^= x2; + }; -if (R>15){ - x0+=x3; x3 = (x3<<32)|(x3>>32); x3^=x0; - x2+=x1; x1 = (x1<<32)|(x1>>32); x1^=x2; - //Inject key 4 - x0+=k4; x1+=k0; x2+=k1; x3+=k2; x3+=4; -}; + if (R > 17) + { + x0 += x3; + x3 = (x3 << 52) | (x3 >> 12); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 57) | (x1 >> 7); + x1 ^= x2; + }; + if (R > 18) + { + x0 += x1; + x1 = (x1 << 23) | (x1 >> 41); + x1 ^= x0; + x2 += x3; + x3 = (x3 << 40) | (x3 >> 24); + x3 ^= x2; + }; + if (R > 19) + { + x0 += x3; + x3 = (x3 << 5) | (x3 >> 59); + x3 ^= x0; + x2 += x1; + x1 = (x1 << 37) | (x1 >> 27); + x1 ^= x2; + //Inject key 5 + x0 += k0; + x1 += k1; + x2 += k2; + x3 += k3; + x3 += 5; + }; + //--------------------------------------- + threefry4x64_ctr_t result = {{x0, x1, x2, x3}}; + return (result); +} -if (R>16){ - x0+=x1; x1 = (x1<<14)|(x1>>50); 
x1^=x0; - x2+=x3; x3 = (x3<<16)|(x3>>48); x3^=x2; -}; +threefry4x64_ctr_t inverse_arj_threefry4x64(size_t R, threefry4x64_ctr_t ctr, + threefry4x64_key_t key) +{ -if (R>17){ - x0+=x3; x3 = (x3<<52)|(x3>>12); x3^=x0; - x2+=x1; x1 = (x1<<57)|(x1>>7); x1^=x2; -}; -if (R>18){ - x0+=x1; x1 = (x1<<23)|(x1>>41); x1^=x0; - x2+=x3; x3 = (x3<<40)|(x3>>24); x3^=x2; -}; -if (R>19){ - x0+=x3; x3 = (x3<<5)|(x3>>59); x3^=x0; - x2+=x1; x1 = (x1<<37)|(x1>>27); x1^=x2; - //Inject key 5 - x0+=k0; x1+=k1; x2+=k2; x3+=k3; x3+=5; -}; -//--------------------------------------- - threefry4x64_ctr_t result = {{x0,x1,x2,x3}}; - return(result); - -}; - -threefry4x64_ctr_t inverse_arj_threefry4x64(size_t R,threefry4x64_ctr_t ctr, - threefry4x64_key_t key){ - - size_t x0 = ctr.v[0]; size_t x1 = ctr.v[1]; size_t x2 = ctr.v[2]; size_t x3 = ctr.v[3]; - size_t k0 = key.v[0]; size_t k1 = key.v[1]; size_t k2 = key.v[2]; size_t k3 = key.v[3]; + size_t x0 = ctr.v[0]; + size_t x1 = ctr.v[1]; + size_t x2 = ctr.v[2]; + size_t x3 = ctr.v[3]; + size_t k0 = key.v[0]; + size_t k1 = key.v[1]; + size_t k2 = key.v[2]; + size_t k3 = key.v[3]; size_t k4 = 0x1bd11bdaa9fc1a22; -//--------------------------------------- + //--------------------------------------- -if (R>20) abort(); + if (R > 20) + abort(); - k4^=k0; k4^=k1; k4^=k2; k4^=k3; + k4 ^= k0; + k4 ^= k1; + k4 ^= k2; + k4 ^= k3; -if (R>19){ - //Anti-inject key 5 - x0-=k0; x1-=k1; x2-=k2; x3-=k3; x3-=5; - x3^=x0; x3 = (x3<<59)|(x3>>5); x0-=x3; - x1^=x2; x1 = (x1<<27)|(x1>>37); x2-=x1; -}; + if (R > 19) + { + //Anti-inject key 5 + x0 -= k0; + x1 -= k1; + x2 -= k2; + x3 -= k3; + x3 -= 5; + x3 ^= x0; + x3 = (x3 << 59) | (x3 >> 5); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 27) | (x1 >> 37); + x2 -= x1; + }; -if (R>18){ - x3^=x2; x3 = (x3<<24)|(x3>>40); x2-=x3; - x1^=x0; x1 = (x1<<41)|(x1>>23); x0-=x1; -}; + if (R > 18) + { + x3 ^= x2; + x3 = (x3 << 24) | (x3 >> 40); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 41) | (x1 >> 23); + x0 -= x1; + }; -if (R>17){ - x3^=x0; x3 = 
(x3<<12)|(x3>>52); x0-=x3; - x1^=x2; x1 = (x1<< 7)|(x1>>57); x2-=x1; -}; + if (R > 17) + { + x3 ^= x0; + x3 = (x3 << 12) | (x3 >> 52); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 7) | (x1 >> 57); + x2 -= x1; + }; -if (R>16){ - x3^=x2; x3 = (x3<<48)|(x3>>16); x2-=x3; - x1^=x0; x1 = (x1<<50)|(x1>>14); x0-=x1; -}; + if (R > 16) + { + x3 ^= x2; + x3 = (x3 << 48) | (x3 >> 16); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 50) | (x1 >> 14); + x0 -= x1; + }; -if (R>15){ - //Anti-inject key 4 - x0-=k4; x1-=k0; x2-=k1; x3-=k2; x3-=4; - x3^=x0; x3 = (x3<<32)|(x3>>32); x0-=x3; - x1^=x2; x1 = (x1<<32)|(x1>>32); x2-=x1; -}; + if (R > 15) + { + //Anti-inject key 4 + x0 -= k4; + x1 -= k0; + x2 -= k1; + x3 -= k2; + x3 -= 4; + x3 ^= x0; + x3 = (x3 << 32) | (x3 >> 32); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 32) | (x1 >> 32); + x2 -= x1; + }; -if (R>14){ - x3^=x2; x3 = (x3<<42)|(x3>>22); x2-=x3; - x1^=x0; x1 = (x1<< 6)|(x1>>58); x0-=x1; -}; + if (R > 14) + { + x3 ^= x2; + x3 = (x3 << 42) | (x3 >> 22); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 6) | (x1 >> 58); + x0 -= x1; + }; -if (R>13){ - x3^=x0; x3 = (x3<<18)|(x3>>46); x0-=x3; - x1^=x2; x1 = (x1<<52)|(x1>>12); x2-=x1; -}; + if (R > 13) + { + x3 ^= x0; + x3 = (x3 << 18) | (x3 >> 46); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 52) | (x1 >> 12); + x2 -= x1; + }; -if (R>12){ - x3^=x2; x3 = (x3<<31)|(x3>>33); x2-=x3; - x1^=x0; x1 = (x1<<39)|(x1>>25); x0-=x1; -}; + if (R > 12) + { + x3 ^= x2; + x3 = (x3 << 31) | (x3 >> 33); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 39) | (x1 >> 25); + x0 -= x1; + }; -if (R>11){ - //Anti-inject key 3 - x0-=k3; x1-=k4; x2-=k0; x3-=k1; x3-=3; - x3^=x0; x3 = (x3<<59)|(x3>>5); x0-=x3; - x1^=x2; x1 = (x1<<27)|(x1>>37); x2-=x1; + if (R > 11) + { + //Anti-inject key 3 + x0 -= k3; + x1 -= k4; + x2 -= k0; + x3 -= k1; + x3 -= 3; + x3 ^= x0; + x3 = (x3 << 59) | (x3 >> 5); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 27) | (x1 >> 37); + x2 -= x1; + }; -}; + if (R > 10) + { + x3 ^= x2; + x3 = (x3 << 24) | (x3 >> 40); + x2 -= x3; + x1 ^= x0; + x1 = (x1 
<< 41) | (x1 >> 23); + x0 -= x1; + }; -if (R>10){ - x3^=x2; x3 = (x3<<24)|(x3>>40); x2-=x3; - x1^=x0; x1 = (x1<<41)|(x1>>23); x0-=x1; -}; + if (R > 9) + { + x3 ^= x0; + x3 = (x3 << 12) | (x3 >> 52); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 7) | (x1 >> 57); + x2 -= x1; + }; -if (R>9){ - x3^=x0; x3 = (x3<<12)|(x3>>52); x0-=x3; - x1^=x2; x1 = (x1<< 7)|(x1>>57); x2-=x1; -}; + if (R > 8) + { + x3 ^= x2; + x3 = (x3 << 48) | (x3 >> 16); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 50) | (x1 >> 14); + x0 -= x1; + }; -if (R>8){ - x3^=x2; x3 = (x3<<48)|(x3>>16); x2-=x3; - x1^=x0; x1 = (x1<<50)|(x1>>14); x0-=x1; -}; + if (R > 7) + { + //Anti-inject key 2 + x0 -= k2; + x1 -= k3; + x2 -= k4; + x3 -= k0; + x3 -= 2; + x3 ^= x0; + x3 = (x3 << 32) | (x3 >> 32); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 32) | (x1 >> 32); + x2 -= x1; + }; -if (R>7){ - //Anti-inject key 2 - x0-=k2; x1-=k3; x2-=k4; x3-=k0; x3-=2; - x3^=x0; x3 = (x3<<32)|(x3>>32); x0-=x3; - x1^=x2; x1 = (x1<<32)|(x1>>32); x2-=x1; - }; + if (R > 6) + { + x3 ^= x2; + x3 = (x3 << 42) | (x3 >> 22); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 6) | (x1 >> 58); + x0 -= x1; + }; -if (R>6){ - x3^=x2; x3 = (x3<<42)|(x3>>22); x2-=x3; - x1^=x0; x1 = (x1<< 6)|(x1>>58); x0-=x1; -}; + if (R > 5) + { + x3 ^= x0; + x3 = (x3 << 18) | (x3 >> 46); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 52) | (x1 >> 12); + x2 -= x1; + }; -if (R>5){ - x3^=x0; x3 = (x3<<18)|(x3>>46); x0-=x3; - x1^=x2; x1 = (x1<<52)|(x1>>12); x2-=x1; -}; + if (R > 4) + { + x3 ^= x2; + x3 = (x3 << 31) | (x3 >> 33); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 39) | (x1 >> 25); + x0 -= x1; + }; -if (R>4){ - x3^=x2; x3 = (x3<<31)|(x3>>33); x2-=x3; - x1^=x0; x1 = (x1<<39)|(x1>>25); x0-=x1; -}; + if (R > 3) + { + //Anti-inject key 1 + x0 -= k1; + x1 -= k2; + x2 -= k3; + x3 -= k4; + x3 -= 1; + x3 ^= x0; + x3 = (x3 << 59) | (x3 >> 5); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 27) | (x1 >> 37); + x2 -= x1; + }; - if (R>3){ - //Anti-inject key 1 - x0-=k1; x1-=k2; x2-=k3; x3-=k4; x3-=1; - x3^=x0; x3 = 
(x3<<59)|(x3>>5); x0-=x3; - x1^=x2; x1 = (x1<<27)|(x1>>37); x2-=x1; -}; + if (R > 2) + { + x3 ^= x2; + x3 = (x3 << 24) | (x3 >> 40); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 41) | (x1 >> 23); + x0 -= x1; + }; -if (R>2){ - x3^=x2; x3 = (x3<<24)|(x3>>40); x2-=x3; - x1^=x0; x1 = (x1<<41)|(x1>>23); x0-=x1; -}; + if (R > 1) + { + x3 ^= x0; + x3 = (x3 << 12) | (x3 >> 52); + x0 -= x3; + x1 ^= x2; + x1 = (x1 << 7) | (x1 >> 57); + x2 -= x1; + }; + if (R > 0) + { + x3 ^= x2; + x3 = (x3 << 48) | (x3 >> 16); + x2 -= x3; + x1 ^= x0; + x1 = (x1 << 50) | (x1 >> 14); + x0 -= x1; + }; -if (R>1){ - x3^=x0; x3 = (x3<<12)|(x3>>52); x0-=x3; - x1^=x2; x1 = (x1<< 7)|(x1>>57); x2-=x1; -}; - -if (R>0){ - x3^=x2; x3 = (x3<<48)|(x3>>16); x2-=x3; - x1^=x0; x1 = (x1<<50)|(x1>>14); x0-=x1; - }; - - // Anti-start - x0-=k0; x1-=k1; x2-=k2; x3-=k3; - - -//--------------------------------------- - threefry4x64_ctr_t result = {{x0,x1,x2,x3}}; - return(result); -}; + // Anti-start + x0 -= k0; + x1 -= k1; + x2 -= k2; + x3 -= k3; + //--------------------------------------- + threefry4x64_ctr_t result = {{x0, x1, x2, x3}}; + return (result); +} //////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/random_panphasia.cc b/src/plugins/random_panphasia.cc index e68e90e..0dfa0d3 100644 --- a/src/plugins/random_panphasia.cc +++ b/src/plugins/random_panphasia.cc @@ -162,6 +162,8 @@ void RNG_panphasia::Run_Panphasia_Highorder(Grid_FFT &g) Grid_FFT pan_grid({{N0, N0, N0}}, {{boxlength_, boxlength_, boxlength_}}); + _unused(alloc_local); + assert(pan_grid.n_[0] == N0); assert(pan_grid.n_[1] == N0); assert(pan_grid.n_[2] == N0); From 3607afb244e0fafe37b4d69dc569f4bd54be9731 Mon Sep 17 00:00:00 2001 From: Oliver Hahn Date: Tue, 5 Oct 2021 23:00:26 +0200 Subject: [PATCH 25/25] check for negative masses with mass perturbations --- include/particle_generator.hh | 27 +++++++++++++++++++-------- 
src/ic_generator.cc | 3 +++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/particle_generator.hh b/include/particle_generator.hh index 3ff4379..53520b7 100644 --- a/include/particle_generator.hh +++ b/include/particle_generator.hh @@ -216,6 +216,10 @@ namespace particle const size_t num_p_in_load = field.local_size(); const real_t pmeanmass = munit / real_t(field.global_size()* overload); + bool bmass_negative = false; + auto mean_pm = field.mean() * pmeanmass; + auto std_pm = field.std() * pmeanmass; + for (int ishift = 0; ishift < (1 << lattice_type); ++ishift) { // if we are dealing with the secondary lattice, apply a global shift @@ -237,18 +241,25 @@ namespace particle { for (size_t k = 0; k < field.size(2); ++k) { - if (b64reals) - { - particles_.set_mass64(ipcount++, pmeanmass * field.relem(i, j, k)); - } - else - { - particles_.set_mass32(ipcount++, pmeanmass * field.relem(i, j, k)); - } + // get + const auto pmass = pmeanmass * field.relem(i, j, k); + + // check for negative mass + bmass_negative |= pmass<0.0; + + // set + if (b64reals) particles_.set_mass64(ipcount++, pmass); + else particles_.set_mass32(ipcount++, pmass); } } } } + + // diagnostics + music::ilog << "Particle Mass : mean/munit = " << mean_pm/munit << " ; fractional RMS = " << std_pm / mean_pm * 100.0 << "%" << std::endl; + if(std_pm / mean_pm > 0.1 ) music::wlog << "Particle mass perturbation larger than 10%, consider decreasing \n\t the starting redshift or disabling baryon decaying modes." << std::endl; + if(bmass_negative) music::elog << "Negative particle mass produced! Decrease the starting \n\t redshift or disable baryon decaying modes!" << std::endl; + }else{ // should not happen music::elog << "Cannot have individual particle masses for glasses!" 
<< std::endl; diff --git a/src/ic_generator.cc b/src/ic_generator.cc index 8fd81c0..f419f81 100644 --- a/src/ic_generator.cc +++ b/src/ic_generator.cc @@ -836,6 +836,9 @@ int run( config_file& the_config ) } } + + music::ilog << "-------------------------------------------------------------------------------" << std::endl; + } return 0; }