1
0
Fork 0
mirror of https://github.com/cosmo-sims/monofonIC.git synced 2024-09-19 17:03:45 +02:00

Merged in develop (pull request #24)

Merge Panphasia development into master branch
This commit is contained in:
Oliver Hahn 2021-11-11 20:16:13 +00:00
commit 114b81051e
35 changed files with 10250 additions and 529 deletions

View file

@ -149,8 +149,11 @@ elseif("${CMAKE_Fortran_COMPILER_ID}" MATCHES "GNU")
endif()
endif(ENABLE_PANPHASIA)
########################################################################################################################
# PANPHASIA HO (High-Order, new version)
# option(ENABLE_PANPHASIA_HO "Enable PANPHASIA-HO random number generator" ON)
########################################################################################################################
# INCLUDES
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/external)
# SOURCES
# get all the *.cc files in the subfolders
@ -170,7 +173,12 @@ if(ENABLE_PANPHASIA)
list (APPEND SOURCES
${PROJECT_SOURCE_DIR}/external/panphasia/panphasia_routines.f
${PROJECT_SOURCE_DIR}/external/panphasia/generic_lecuyer.f90
#${PROJECT_SOURCE_DIR}/external/panphasia_ho/main.c
${PROJECT_SOURCE_DIR}/external/panphasia_ho/high_order_panphasia_routines.c
${PROJECT_SOURCE_DIR}/external/panphasia_ho/pan_mpi_routines.c
${PROJECT_SOURCE_DIR}/external/panphasia_ho/uniform_rand_threefry4x64.c
)
# target_include_directories(${PRGNAME} PRIVATE ${PROJECT_SOURCE_DIR}/external/panphasia_ho)
endif()
# project configuration header
@ -246,6 +254,10 @@ if(ENABLE_PANPHASIA)
target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA")
endif(ENABLE_PANPHASIA)
# if(ENABLE_PANPHASIA_HO)
# target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA_HO")
# endif(ENABLE_PANPHASIA_HO)
if(ENABLE_PLT)
target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_PLT")
endif(ENABLE_PLT)
@ -272,4 +284,4 @@ if(ENABLE_GENERICIO)
include(${CMAKE_CURRENT_SOURCE_DIR}/external/genericio.cmake)
target_link_libraries(${PRGNAME} PRIVATE genericio::genericio_mpi)
target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_GENERICIO")
endif()
endif()

View file

@ -16,7 +16,7 @@ LPTorder = 3 # order of the LPT to be used (1,2 or 3)
DoBaryons = no # also do baryon ICs?
DoBaryonVrel = no # if doing baryons, incl. also relative velocity to linear order?
DoFixing = yes # do mode fixing à la Angulo&Pontzen (https://arxiv.org/abs/1603.05253)
DoFixing = no # do mode fixing à la Angulo&Pontzen (https://arxiv.org/abs/1603.05253)
DoInversion = no # invert phases (for paired simulations)
ParticleLoad = sc # particle load, can be 'sc' (1x), 'bcc' (2x) or 'fcc' (4x)
@ -89,15 +89,16 @@ ztarget = 2.5 # target redshift for CLASS module, output at
##> NGenIC compatible random number generator module compatible with V. Springel's original code
## (https://www.h-its.org/2014/11/05/ngenic-code/) as well as the 2LPT code by Pueblas&Scoccmiarro
## (https://cosmo.nyu.edu/roman/2LPT/)
generator = NGENIC
seed = 12345
# generator = NGENIC
# seed = 12345
##> The PANPHASIA generator uses a plugin based on original code by A. Jenkins
## Warning: Before using this module, please make sure you read and agree to the distinct license
## requirements by registering on the website http://icc.dur.ac.uk/Panphasia.php
# generator = PANPHASIA
# descriptor = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL]
generator = PANPHASIA_HO
#descriptor = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL]
descriptor = [Panph6,L20,(424060,82570,148256),S1,CH-999,Auriga_100_vol2]
# PanphasiaMinRootResolution = 512 # requires the white noise reallisation to be made at least at that resolution (default is 512)
##> The MUSIC1 multi-scale random number generator is provided for convenience

35
external/panphasia_ho/LICENSE vendored Normal file
View file

@ -0,0 +1,35 @@
The code in this subdirectory is part of Adrian Jenkins' PANPHASIA packet,
obtained from here http://icc.dur.ac.uk/Panphasia.php
PANPHASIA is not published under the GPL but has its own proprietary license,
make sure to visit the website before using the PANPHASIA functionality of
MUSIC2 and register your name.
We reproduce the licensing requirements for PANPHASIA from the above website
as retrieved on 2020/08/23:
We make our software available for free but with a licence that includes the
condition that users make sure the phases of any new simulation volumes set up
using Panphasia are published.
We are happy to collaborate with others on improving the software and providing
support for languages other than fortran. Contact: A.R.Jenkins@durham.ac.uk
LICENCE:
You are licensed to use this software free of charge on condition that:
- you will publish the phase descriptors and reference Jenkins (13) for any new
simulations that use Panphasia phases. You will pass on this condition to others
for any software or data you make available publically or privately that makes
use of Panphasia.
- that you will ensure any publications using results derived from Panphasia will
be submitted as a final version to arXiv prior to or coincident with publication
in a journal.
- that you report any bugs in this software as soon as confirmed to
A.R.Jenkins@durham.ac.uk
- that you understand that the software comes with no warranty and that is your
responsibility to ensure that it is suitable for the purpose that you intend.
- that you agree to having your name and email address stored for an indefinite
period in the future electronically in a database as a record that you agreed
the licence conditions.

61
external/panphasia_ho/PAN_FFTW3.h vendored Normal file
View file

@ -0,0 +1,61 @@
// Define macros for FFTW3 to allow swapping
// between single/double precision FTs
// include CMake controlled configuration settings
#pragma once
#include "cmake_config.hh"
#if defined(USE_PRECISION_DOUBLE)
#define FOURIER_DOUBLE
#endif
#if defined(USE_PRECISION_LONGDOUBLE)
#error "PANPHASIA-high-order does not currently support long double precision"
#endif
#ifdef FOURIER_DOUBLE
#define FFTW_REAL double
#define FFTW_PLAN fftw_plan
#define FFTW_DESTROY_PLAN fftw_destroy_plan
#define FFTW_COMPLEX fftw_complex
#define FFTW_MALLOC fftw_malloc
#define FFTW_PLAN_DFT_1D fftw_plan_dft_1d
#define FFTW_PLAN_dft_3D fftw_plan_dft_3d
#define FFTW_EXECUTE fftw_execute
#define FFTW_DESTROY_PLAN fftw_destroy_plan
#define FFTW_FREE fftw_free
#define FFTW_ALLOC_COMPLEX fftw_alloc_complex
#define FFTW_MPI_LOCAL_SIZE_MANY fftw_mpi_local_size_many
#define FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED fftw_mpi_local_size_many_transposed
#define FFTW_MPI_PLAN_MANY_TRANSPOSE fftw_mpi_plan_many_transpose
#define FFTW_MPI_EXECUTE_R2R fftw_mpi_execute_r2r
#define FFTW_PLAN_MANY_DFT fftw_plan_many_dft
#define FFTW_MPI_LOCAL_SIZE_3D fftw_mpi_local_size_3d
#define FFTW_MPI_PLAN_MANY_DTF fftw_mpi_plan_many_dft
#define FFTW_MPI_PLAN_MANY_DTF_R2C fftw_mpi_plan_many_dft_r2c
#define FFTW_MPI_EXECUTE_DFT fftw_mpi_execute_dft
#define FFTW_MPI_EXECUTE_DFT_R2C fftw_mpi_execute_dft_r2c
#else
#define FFTW_REAL float
#define FFTW_PLAN fftwf_plan
#define FFTW_DESTROY_PLAN fftwf_destroy_plan
#define FFTW_COMPLEX fftwf_complex
#define FFTW_MALLOC fftwf_malloc
#define FFTW_PLAN_DFT_1D fftwf_plan_dft_1d
#define FFTW_PLAN_dft_3D fftwf_plan_dft_3d
#define FFTW_EXECUTE fftwf_execute
#define FFTW_DESTROY_PLAN fftwf_destroy_plan
#define FFTW_FREE fftwf_free
#define FFTW_ALLOC_COMPLEX fftwf_alloc_complex
#define FFTW_MPI_LOCAL_SIZE_MANY fftwf_mpi_local_size_many
#define FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED fftwf_mpi_local_size_many_transposed
#define FFTW_MPI_PLAN_MANY_TRANSPOSE fftwf_mpi_plan_many_transpose
#define FFTW_MPI_EXECUTE_R2R fftwf_mpi_execute_r2r
#define FFTW_PLAN_MANY_DFT fftwf_plan_many_dft
#define FFTW_MPI_LOCAL_SIZE_3D fftwf_mpi_local_size_3d
#define FFTW_MPI_PLAN_MANY_DTF fftwf_mpi_plan_many_dft
#define FFTW_MPI_PLAN_MANY_DTF_R2C fftwf_mpi_plan_many_dft_r2c
#define FFTW_MPI_EXECUTE_DFT fftwf_mpi_execute_dft
#define FFTW_MPI_EXECUTE_DFT_R2C fftwf_mpi_execute_dft_r2c
#endif

143
external/panphasia_ho/README vendored Normal file
View file

@ -0,0 +1,143 @@
modules on COSMA7
intel_comp/2018 fftw/3.3.9cosma7
intel_mpi/2018 gsl/2.5
The code calls a function to generate the k-space modes for
a portion of the Panphasia field given an input descriptor.
Should be called early before significant memory is allocated. It
uses quite a bit of memory itself, but tidies up afterwards.
Has OpenMP - -DUSE_OPENMP in the makefile
The routines support both single and double precision
calculations in two senses.
The Fourier computations can be single or double precision
MACROs FFTW_REAL/FFTW_COMPLEX used to define 'Fourier' precision types
float or double.
The Panphasia coefficients can be single or double precision
MACROs PAN_REAL/PAN_COMPLEX define the Panphasia precision - either
float or double.
To change the Fourier precision edit PAN_FFTW3.h - by default
single precision unless 'FOURIER_DOUBLE' is defined.
To change the Panphasia precision edit panphasia_functions.h and
single precision unless 'PAN_DOUBLE_PRECISION' is defined.
Code description
-------------------
makefile
CODE
----
main.c - demo program only
pan_mpi_routines.c - contains MPI calls
high_order_panphasia_routines.c - serial - contains some OpenMP
uniform_rand_threefry4x64.c - serial - random generator and tests
Include files
--------------
panphasia_functions.h
PAN_FFTW3.h - MACROS for single/double precision FTs
pan_matrices_order6.h - matrix coefficients for 6th order scheme.
threefry.h - Random generator
array.h + features array .h files
Development notes:
----------------------------------------------------
14th April 2021
Found a bug in the OpenMP version. Different numbers
of threads led to a subset of Fourier modes having
different values. The precise differences changed
each time the code was run.
Debugged by turning of OpenMP section by section.
The section which uses the spherical bessel functions
turned out to be responsible.
The faulty version collapsed for 4 loops over
multipole,x,y,z. Changing this to a loop
over multipoles, and collapsing 3 coordinate
loops solved the problem.
The variable index1 of the return field
does not depend on the multipole, while
index2 does. Both index1 and index2 are
private. This means the return array
(index 1) is updated several times.
Presumably as these updated occur
in parallel with the 4 loop collapsed
version the return array was being
corrupted sometimes.
15th April
------------
This version supercedes version given to
Oliver to add to MonofonIC clone.
Main difference is additional OpenMP
statements and the ability to specify
in the descriptor that modes less
than of equal to some dimensionless
integer wavenumber squared are set
to the mean power.
Tested output on 1 core - with/without
OpenMP. Not tested with more than
1 MPI rank.

326
external/panphasia_ho/array.h vendored Normal file
View file

@ -0,0 +1,326 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _r123array_dot_h__
#define _r123array_dot_h__
#include "features/compilerfeatures.h"
#include "features/sse.h"
#ifndef __cplusplus
#define CXXMETHODS(_N, W, T)
#define CXXOVERLOADS(_N, W, T)
#else
#include <stddef.h>
#include <algorithm>
#include <stdexcept>
#include <iterator>
#include <limits>
#include <iostream>
/** @defgroup arrayNxW The r123arrayNxW classes
Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers.
It is functionally equivalent to the C++0x std::array<N, uintW_t>,
but does not require C++0x features or libraries.
In addition to meeting most of the requirements of a Container,
it also has a member function, incr(), which increments the zero-th
element and carrys overflows into higher indexed elements. Thus,
by using incr(), sequences of up to 2^(N*W) distinct values
can be produced.
If SSE is supported by the compiler, then the class
r123array1xm128i is also defined, in which the data member is an
array of one r123128i object.
@cond HIDDEN_FROM_DOXYGEN
*/
template <typename value_type>
inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
value_type v=0;
for(size_t i=0; i<(3+sizeof(value_type))/4; ++i)
v |= ((value_type)(*p32++)) << (32*i);
return v;
}
// Work-alike methods and typedefs modeled on std::array:
#define CXXMETHODS(_N, W, T) \
typedef T value_type; \
typedef T* iterator; \
typedef const T* const_iterator; \
typedef value_type& reference; \
typedef const value_type& const_reference; \
typedef size_t size_type; \
typedef ptrdiff_t difference_type; \
typedef T* pointer; \
typedef const T* const_pointer; \
typedef std::reverse_iterator<iterator> reverse_iterator; \
typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \
/* Boost.array has static_size. C++11 specializes tuple_size */ \
enum {static_size = _N}; \
R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \
R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \
R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
R123_CUDA_DEVICE size_type size() const { return _N; } \
R123_CUDA_DEVICE size_type max_size() const { return _N; } \
R123_CUDA_DEVICE bool empty() const { return _N==0; }; \
R123_CUDA_DEVICE iterator begin() { return &v[0]; } \
R123_CUDA_DEVICE iterator end() { return &v[_N]; } \
R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \
R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \
R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \
R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \
R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \
R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \
R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \
R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \
R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \
R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \
R123_CUDA_DEVICE pointer data(){ return &v[0]; } \
R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \
R123_CUDA_DEVICE reference front(){ return v[0]; } \
R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \
R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \
R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \
R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \
/* CUDA3 does not have std::equal */ \
for (size_t i = 0; i < _N; ++i) \
if (v[i] != rhs.v[i]) return false; \
return true; \
} \
R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \
/* CUDA3 does not have std::fill_n */ \
R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \
R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \
/* CUDA3 does not have std::swap_ranges */ \
for (size_t i = 0; i < _N; ++i) { \
T tmp = v[i]; \
v[i] = rhs.v[i]; \
rhs.v[i] = tmp; \
} \
} \
R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \
/* This test is tricky because we're trying to avoid spurious \
complaints about illegal shifts, yet still be compile-time \
evaulated. */ \
if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \
return incr_carefully(n); \
if(n==1){ \
++v[0]; \
if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this; \
}else{ \
v[0] += n; \
if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this; \
} \
/* We expect that the N==?? tests will be \
constant-folded/optimized away by the compiler, so only the \
overflow tests (!!v[i]) remain to be done at runtime. For \
small values of N, it would be better to do this as an \
uncondtional sequence of adc. An experiment/optimization \
for another day... \
N.B. The weird subscripting: v[_N>3?3:0] is to silence \
a spurious error from icpc \
*/ \
++v[_N>1?1:0]; \
if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \
++v[_N>2?2:0]; \
if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \
++v[_N>3?3:0]; \
for(size_t i=4; i<_N; ++i){ \
if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \
++v[i]; \
} \
return *this; \
} \
/* seed(SeedSeq) would be a constructor if having a constructor */ \
/* didn't cause headaches with defaults */ \
template <typename SeedSeq> \
R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \
r123array##_N##x##W ret; \
const size_t Ngen = _N*((3+sizeof(value_type))/4); \
uint32_t u32[Ngen]; \
uint32_t *p32 = &u32[0]; \
ss.generate(&u32[0], &u32[Ngen]); \
for(size_t i=0; i<_N; ++i){ \
ret.v[i] = assemble_from_u32<value_type>(p32); \
p32 += (3+sizeof(value_type))/4; \
} \
return ret; \
} \
protected: \
R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \
/* n may be greater than the maximum value of a single value_type */ \
value_type vtn; \
vtn = n; \
v[0] += n; \
const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \
for(size_t i=1; i<_N; ++i){ \
if(rshift){ \
n >>= rshift; \
}else{ \
n=0; \
} \
if( v[i-1] < vtn ) \
++n; \
if( n==0 ) break; \
vtn = n; \
v[i] += n; \
} \
return *this; \
} \
// There are several tricky considerations for the insertion and extraction
// operators:
// - we would like to be able to print r123array16x8 as a sequence of 16 integers,
// not as 16 bytes.
// - we would like to be able to print r123array1xm128i.
// - we do not want an int conversion operator in r123m128i because it causes
// lots of ambiguity problems with automatic promotions.
// Solution: r123arrayinsertable and r123arrayextractable
template<typename T>
struct r123arrayinsertable{
const T& v;
r123arrayinsertable(const T& t_) : v(t_) {}
friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<T>& t){
return os << t.v;
}
};
template<>
struct r123arrayinsertable<uint8_t>{
const uint8_t& v;
r123arrayinsertable(const uint8_t& t_) : v(t_) {}
friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<uint8_t>& t){
return os << (int)t.v;
}
};
template<typename T>
struct r123arrayextractable{
T& v;
r123arrayextractable(T& t_) : v(t_) {}
friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){
return is >> t.v;
}
};
template<>
struct r123arrayextractable<uint8_t>{
uint8_t& v;
r123arrayextractable(uint8_t& t_) : v(t_) {}
friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){
int i;
is >> i;
t.v = i;
return is;
}
};
#define CXXOVERLOADS(_N, W, T) \
\
inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \
os << r123arrayinsertable<T>(a.v[0]); \
for(size_t i=1; i<_N; ++i) \
os << " " << r123arrayinsertable<T>(a.v[i]); \
return os; \
} \
\
inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \
for(size_t i=0; i<_N; ++i){ \
r123arrayextractable<T> x(a.v[i]); \
is >> x; \
} \
return is; \
} \
\
namespace r123{ \
typedef r123array##_N##x##W Array##_N##x##W; \
}
#endif /* __cplusplus */
/* _r123array_tpl expands to a declaration of struct r123arrayNxW.
In C, it's nothing more than a struct containing an array of N
objects of type T.
In C++ it's the same, but endowed with an assortment of member
functions, typedefs and friends. In C++, r123arrayNxW looks a lot
like std::array<T,N>, has most of the capabilities of a container,
and satisfies the requirements outlined in compat/Engine.hpp for
counter and key types. ArrayNxW, in the r123 namespace is
a typedef equivalent to r123arrayNxW.
*/
#define _r123array_tpl(_N, W, T) \
/** @ingroup arrayNxW */ \
/** @see arrayNxW */ \
struct r123array##_N##x##W{ \
T v[_N]; \
CXXMETHODS(_N, W, T) \
}; \
\
CXXOVERLOADS(_N, W, T)
/** @endcond */
_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */
_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */
_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */
_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */
_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */
_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */
_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */
_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */
#if R123_USE_SSE
_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */
#endif
/* In C++, it's natural to use sizeof(a::value_type), but in C it's
pretty convoluted to figure out the width of the value_type of an
r123arrayNxW:
*/
#define R123_W(a) (8*sizeof(((a *)0)->v[0]))
/** @namespace r123
Most of the Random123 C++ API is contained in the r123 namespace.
*/
#endif

33
external/panphasia_ho/c7_script_threaded vendored Executable file
View file

@ -0,0 +1,33 @@
#!/bin/bash -l
#SBATCH --ntasks 5
#SBATCH -J Test_MPI_FFTW
#SBATCH -o standard_output_file.%J.out
#SBATCH -e standard_error_file.%J.err
#SBATCH -p cosma7
#SBATCH -A dp004
#SBATCH --exclusive
#SBATCH -t 00:05:00
#SBATCH --mail-type=END # notifications for job
#SBATCH --mail-user=a.r.jenkins@durham.ac.uk
module purge
module load intel_comp/2018 intel_mpi/2018 fftw/3.3.9cosma7 gsl/2.5 hdf5/1.8.20
# Run the program
mpirun -l -env I_MPI_PIN=1 -env I_MPI_PIN_PROCESSOR_LIST=allcores -n $SLURM_NTASKS ./pan_fftw3_test_code.x

View file

@ -0,0 +1,93 @@
/*
Copyright 2010-2016, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __clangfeatures_dot_hpp
#define __clangfeatures_dot_hpp
#ifndef R123_USE_X86INTRIN_H
#if (defined(__x86_64__)||defined(__i386__))
#define R123_USE_X86INTRIN_H 1
#else
#define R123_USE_X86INTRIN_H 0
#endif
#endif
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions)
#endif
#ifndef R123_USE_CXX11_STATIC_ASSERT
#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert)
#endif
// With clang-3.6, -Wall warns about unused-local-typedefs.
// The "obvious" thing to do is to ignore -Wunused-local-typedefs,
// but that doesn't work because earlier versions of clang blow
// up on an 'unknown warning group'. So we briefly ignore -Wall...
// It's tempting to just give up on static assertions in pre-c++11 code.
#if !R123_USE_CXX11_STATIC_ASSERT && !defined(R123_STATIC_ASSERT)
#define R123_STATIC_ASSERT(expr, msg) \
_Pragma("clang diagnostic push") \
_Pragma("clang diagnostic ignored \"-Wall\"") \
typedef char static_assertion[(!!(expr))*2-1] \
_Pragma("clang diagnostic pop")
#endif
#ifndef R123_USE_CXX11_CONSTEXPR
#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr)
#endif
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions)
#endif
// With clang-3.0, the apparently simpler:
// #define R123_USE_CXX11_RANDOM __has_include(<random>)
// dumps core.
#ifndef R123_USE_CXX11_RANDOM
#if __cplusplus>=201103L && __has_include(<random>)
#define R123_USE_CXX11_RANDOM 1
#else
#define R123_USE_CXX11_RANDOM 0
#endif
#endif
#ifndef R123_USE_CXX11_TYPE_TRAITS
#if __cplusplus>=201103L && __has_include(<type_traits>)
#define R123_USE_CXX11_TYPE_TRAITS 1
#else
#define R123_USE_CXX11_TYPE_TRAITS 0
#endif
#endif
#include "gccfeatures.h"
#endif

View file

@ -0,0 +1,343 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
@page porting Preprocessor symbols for porting Random123 to different platforms.
The Random123 library is portable across C, C++, CUDA, OpenCL environments,
and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris).
This level of portability requires the abstraction of some features
and idioms that are either not standardized (e.g., asm statments), or for which
different vendors have their own standards (e.g., SSE intrinsics) or for
which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>).
Random123/features/compilerfeatures.h
conditionally includes a compiler-or-OS-specific Random123/featires/XXXfeatures.h file which
defines appropriate values for the preprocessor symbols which can be used with
a specific compiler or OS. Those symbols will then
be used by other header files and source files in the Random123
library (and may be used by applications) to control what actually
gets presented to the compiler.
Most of the symbols are boolean valued. In general, they will
\b always be defined with value either 1 or 0, so do
\b NOT use \#ifdef. Use \#if R123_USE_SOMETHING instead.
Library users can override any value by defining the pp-symbol with a compiler option,
e.g.,
cc -DR123_USE_MULHILO64_C99
will use a strictly c99 version of the full-width 64x64->128-bit multiplication
function, even if it would be disabled by default.
All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_
@verbatim
AES_NI
AES_OPENSSL
SSE4_2
SSE4_1
SSE
STD_RANDOM
GNU_UINT128
ASM_GNU
ASM_MSASM
CPUID_MSVC
CXX11_RANDOM
CXX11_TYPE_TRAITS
CXX11_STATIC_ASSERT
CXX11_CONSTEXPR
CXX11_UNRESTRICTED_UNIONS
CXX11_EXPLICIT_CONVERSIONS
CXX11_LONG_LONG
CXX11_STD_ARRAY
CXX11
X86INTRIN_H
IA32INTRIN_H
XMMINTRIN_H
EMMINTRIN_H
SMMINTRIN_H
WMMINTRIN_H
INTRIN_H
MULHILO32_ASM
MULHILO64_ASM
MULHILO64_MSVC_INTRIN
MULHILO64_CUDA_INTRIN
MULHILO64_OPENCL_INTRIN
MULHILO64_C99
U01_DOUBLE
@endverbatim
Most have obvious meanings. Some non-obvious ones:
AES_NI and AES_OPENSSL are not mutually exclusive. You can have one,
both or neither.
GNU_UINT128 says that it's safe to use __uint128_t, but it
does not require its use. In particular, it should be
used in mulhilo<uint64_t> only if MULHILO64_ASM is unset.
If the XXXINTRIN_H macros are true, then one should
@code
#include <xxxintrin.h>
@endcode
to gain accesss to compiler intrinsics.
The CXX11_SOME_FEATURE macros allow the code to use specific
features of the C++11 language and library. The catchall
In the absence of a specific CXX11_SOME_FEATURE, the feature
is controlled by the catch-all R123_USE_CXX11 macro.
U01_DOUBLE defaults on, and can be turned off (set to 0)
if one does not want the utility functions that convert to double
(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension.
There are a number of invariants that are always true. Application code may
choose to rely on these:
<ul>
<li>ASM_GNU and ASM_MASM are mutually exclusive
<li>The "higher" SSE values imply the lower ones.
</ul>
There are also non-boolean valued symbols:
<ul>
<li>R123_STATIC_INLINE -
According to both C99 and GNU99, the 'static inline' declaration allows
the compiler to not emit code if the function is not used.
Note that the semantics of 'inline', 'static' and 'extern' in
gcc have changed over time and are subject to modification by
command line options, e.g., -std=gnu89, -fgnu-inline.
Nevertheless, it appears that the meaning of 'static inline'
has not changed over time and (with a little luck) the use of 'static inline'
here will be portable between versions of gcc and to other C99
compilers.
See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html
http://www.greenend.org.uk/rjk/2003/03/inline.html
<li>R123_FORCE_INLINE(decl) -
which expands to 'decl', adorned with the compiler-specific
embellishments to strongly encourage that the declared function be
inlined. If there is no such compiler-specific magic, it should
expand to decl, unadorned.
<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with
sufficiently similar semantics) when CUDA is in use, and expands
to nothing in other cases.
<li>R123_METAL_THREAD_ADDRESS_SPACE - which expands to 'thread' (or
something else with sufficiently similar semantics) when compiling a
Metal kernel, and expands to nothing in other cases.
<li>R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at
all if we're in an environment so feature-poor that you can't even
call assert (I'm looking at you, CUDA and OpenCL), or even include
assert.h safely (OpenCL).
<li>R123_STATIC_ASSERT(expr,msg) - which expands to
static_assert(expr,msg), or to an expression that
will raise a compile-time exception if expr is not true.
<li>R123_ULONG_LONG - which expands to a declaration of the longest available
unsigned integer.
<li>R123_64BIT(x) - expands to something equivalent to
UINT64_C(x) from <stdint.h>, even in environments where <stdint.h>
is not available, e.g., MSVC and OpenCL.
<li>R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with
the semantics of gcc's __builtin_expect(expr,likely_value). If
the environment has nothing like __builtin_expect, it should expand
to just expr.
</ul>
\cond HIDDEN_FROM_DOXYGEN
*/
/*
N.B. When something is added to the list of features, it should be
added to each of the *features.h files, AND to examples/ut_features.cpp.
*/
/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */
#if defined(__METAL_MACOS__)
#include "metalfeatures.h"
#elif defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
#include "openclfeatures.h"
#elif defined(__CUDACC__)
#include "nvccfeatures.h"
#elif defined(__ICC)
#include "iccfeatures.h"
#elif defined(__xlC__) || defined(__ibmxl__)
#include "xlcfeatures.h"
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
#include "sunprofeatures.h"
#elif defined(__OPEN64__)
#include "open64features.h"
#elif defined(__clang__)
#include "clangfeatures.h"
#elif defined(__GNUC__)
#include "gccfeatures.h"
#elif defined(__PGI)
#include "pgccfeatures.h"
#elif defined(_MSC_FULL_VER)
#include "msvcfeatures.h"
#else
#error "Can't identify compiler. You'll need to add a new xxfeatures.hpp"
{ /* maybe an unbalanced brace will terminate the compilation */
#endif
#ifndef R123_USE_CXX11
#define R123_USE_CXX11 (__cplusplus >= 201103L)
#endif
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_STATIC_ASSERT
#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_CONSTEXPR
#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_RANDOM
#define R123_USE_CXX11_RANDOM R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_TYPE_TRAITS
#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_LONG_LONG
#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11
#endif
#ifndef R123_USE_CXX11_STD_ARRAY
#define R123_USE_CXX11_STD_ARRAY R123_USE_CXX11
#endif
#ifndef R123_USE_MULHILO64_C99
#define R123_USE_MULHILO64_C99 0
#endif
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
#define R123_USE_MULHILO32_MULHI_INTRIN 0
#endif
#ifndef R123_STATIC_ASSERT
#if R123_USE_CXX11_STATIC_ASSERT
#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg)
#else
/* if msg always_looked_like_this, we could paste it into the name. Worth it? */
#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1]
#endif
#endif
#ifndef R123_CONSTEXPR
#if R123_USE_CXX11_CONSTEXPR
#define R123_CONSTEXPR constexpr
#else
#define R123_CONSTEXPR
#endif
#endif
#ifndef R123_USE_64BIT
#define R123_USE_64BIT 1
#endif
#ifndef R123_USE_PHILOX_64BIT
#define R123_USE_PHILOX_64BIT (R123_USE_64BIT && (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN))
#endif
#ifndef R123_ULONG_LONG
#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG
/* C++98 doesn't have long long. It doesn't have uint64_t either, but
we will have typedef'ed uint64_t to something in the xxxfeatures.h.
With luck, it won't elicit complaints from -pedantic. Cross your
fingers... */
#define R123_ULONG_LONG uint64_t
#else
#define R123_ULONG_LONG unsigned long long
#endif
#endif
/* UINT64_C should have been #defined by XXXfeatures.h, either by
#include <stdint.h> or through compiler-dependent hacks */
#ifndef R123_64BIT
#define R123_64BIT(x) UINT64_C(x)
#endif
#ifndef R123_THROW
#define R123_THROW(x) throw (x)
#endif
#ifndef R123_METAL_THREAD_ADDRESS_SPACE
#define R123_METAL_THREAD_ADDRESS_SPACE
#endif
#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
#define R123_METAL_CONSTANT_ADDRESS_SPACE
#endif
/*
* Windows.h (and perhaps other "well-meaning" code define min and
* max, so there's a high chance that our definition of min, max
* methods or use of std::numeric_limits min and max will cause
* complaints in any program that happened to include Windows.h or
* suchlike first. We use the null macro below in our own header
* files definition or use of min, max to defensively preclude
* this problem. It may not be enough; one might need to #define
* NOMINMAX before including Windows.h or compile with -DNOMINMAX.
*/
#define R123_NO_MACRO_SUBST
/** \endcond */

View file

@ -0,0 +1,263 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __gccfeatures_dot_hpp
#define __gccfeatures_dot_hpp
#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390x__)
# error "This code has only been tested on x86, powerpc and a few arm platforms."
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
the conditions that reach this error, but you should consider it a
porting exercise and expect to encounter bugs and deficiencies.
Please let the authors know of any successes (or failures). */
#endif
#ifdef __powerpc__
#include <ppu_intrinsics.h>
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static __inline__
#endif
#ifndef R123_FORCE_INLINE
#if R123_GNUC_VERSION >= 40000
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#else
#define R123_FORCE_INLINE(decl) decl
#endif
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif
/* According to the C++0x standard, we should be able to test the numeric
value of __cplusplus == 199701L for C++98, __cplusplus == 201103L for C++11
But gcc has had an open bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773
since early 2001, which was finally fixed in 4.7 (early 2012). For
earlier versions, the only way to detect whether --std=c++0x was requested
on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.
*/
#if defined(__GCC_EXPERIMENTAL_CXX0X__)
#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 1/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
#else
#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 0/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
#endif
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_STATIC_ASSERT
#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_CONSTEXPR
#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_RANDOM
#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11)
#endif
#ifndef R123_USE_CXX11_TYPE_TRAITS
#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11)
#endif
#ifndef R123_USE_AES_NI
#ifdef __AES__
#define R123_USE_AES_NI 1
#else
#define R123_USE_AES_NI 0
#endif
#endif
#ifndef R123_USE_SSE4_2
#ifdef __SSE4_2__
#define R123_USE_SSE4_2 1
#else
#define R123_USE_SSE4_2 0
#endif
#endif
#ifndef R123_USE_SSE4_1
#ifdef __SSE4_1__
#define R123_USE_SSE4_1 1
#else
#define R123_USE_SSE4_1 0
#endif
#endif
#ifndef R123_USE_SSE
/* There's no point in trying to compile SSE code in Random123
unless SSE2 is available. */
#ifdef __SSE2__
#define R123_USE_SSE 1
#else
#define R123_USE_SSE 0
#endif
#endif
#ifndef R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#if defined(__x86_64__) || defined(__aarch64__)
#define R123_USE_GNU_UINT128 1
#else
#define R123_USE_GNU_UINT128 0
#endif
#endif
#ifndef R123_USE_ASM_GNU
#if (defined(__x86_64__)||defined(__i386__))
#define R123_USE_ASM_GNU 1
#else
#define R123_USE_ASM_GNU 1
#endif
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#if (defined(__x86_64__)||defined(__i386__))
#define R123_USE_X86INTRIN_H (1/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402)
#else
#define R123_USE_X86INTRIN_H (0/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402)
#endif
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have
emmintrin.h in the include search path. This is
so broken that I refuse to try to work around it. If this
affects you, figure out where your emmintrin.h lives and
add an appropriate -I to your CPPFLAGS. Or add -DR123_USE_SSE=0. */
#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402))
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402))
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 0
#endif
#ifndef R123_USE_INTRIN_H
#define R123_USE_INTRIN_H 0
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
#if (defined(__powerpc64__))
#define R123_USE_MULHILO64_MULHI_INTRIN 1
#else
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#endif
#ifndef R123_MULHILO64_MULHI_INTRIN
#define R123_MULHILO64_MULHI_INTRIN __mulhdu
#endif
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
#define R123_USE_MULHILO32_MULHI_INTRIN 0
#endif
#ifndef R123_MULHILO32_MULHI_INTRIN
#define R123_MULHILO32_MULHI_INTRIN __mulhwu
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
/* If you add something, it must go in all the other XXfeatures.hpp
and in ../ut_features.cpp */
#endif

View file

@ -0,0 +1,212 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __icpcfeatures_dot_hpp
#define __icpcfeatures_dot_hpp
// icc relies on gcc libraries and other toolchain components.
#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
#if !defined(__x86_64__) && !defined(__i386__)
# error "This code has only been tested on x86 platforms."
{ // maybe an unbalanced brace will terminate the compilation
// You are invited to try Easy123 on other architectures, by changing
// the conditions that reach this error, but you should consider it a
// porting exercise and expect to encounter bugs and deficiencies.
// Please let the authors know of any successes (or failures).
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif
// The basic idiom is:
// #ifndef R123_SOMETHING
// #if some condition
// #define R123_SOMETHING 1
// #else
// #define R123_SOMETHING 0
// #endif
// #endif
// This idiom allows an external user to override any decision
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
// An alternative idiom is:
// #ifndef R123_SOMETHING
// #define R123_SOMETHING (some boolean expression)
// #endif
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
// pp-symbols.
#ifndef R123_USE_SSE4_2
#ifdef __SSE4_2__
#define R123_USE_SSE4_2 1
#else
#define R123_USE_SSE4_2 0
#endif
#endif
#ifndef R123_USE_SSE4_1
#ifdef __SSE4_1__
#define R123_USE_SSE4_1 1
#else
#define R123_USE_SSE4_1 0
#endif
#endif
#ifndef R123_USE_SSE
#ifdef __SSE2__
#define R123_USE_SSE 1
#else
#define R123_USE_SSE 0
#endif
#endif
#ifndef R123_USE_AES_NI
// Unlike gcc, icc (version 12) does not pre-define an __AES__
// pp-symbol when -maes or -xHost is on the command line. This feels
// like a defect in icc (it defines __SSE4_2__ in analogous
// circumstances), but until Intel fixes it, we're better off erring
// on the side of caution and not generating instructions that are
// going to raise SIGILL when executed. To get the AES-NI
// instructions with icc, the caller must puts something like
// -DR123_USE_AES_NI=1 or -D__AES__ on the command line. FWIW, the
// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from
// 11.1 onwards.
//
#if defined(__AES__)
#define R123_USE_AES_NI ((__ICC>=1101) && 1/*defined(__AES__)*/)
#else
#define R123_USE_AES_NI ((__ICC>=1101) && 0/*defined(__AES__)*/)
#endif
#endif
#ifndef R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 1
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 1
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 1
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 1
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 1
#endif
#ifndef R123_USE_INTRIN_H
#define R123_USE_INTRIN_H 0
#endif
#ifndef R123_USE_MULHILO16_ASM
#define R123_USE_MULHILO16_ASM 0
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 1
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
// If you add something, it must go in all the other XXfeatures.hpp
// and in ../ut_features.cpp
#endif

View file

@ -0,0 +1,111 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Written by Tom Schoonjans <Tom.Schoonjans@me.com>
*/
#ifndef __metalfeatures_dot_hpp
#define __metalfeatures_dot_hpp
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_METAL_THREAD_ADDRESS_SPACE
#define R123_METAL_THREAD_ADDRESS_SPACE thread
#endif
#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
#define R123_METAL_CONSTANT_ADDRESS_SPACE constant
#endif
#ifndef R123_ASSERT
#define R123_ASSERT(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
#define R123_USE_MULHILO32_MULHI_INTRIN 1
#endif
#if R123_USE_MULHILO32_MULHI_INTRIN
#include <metal_integer>
#define R123_MULHILO32_MULHI_INTRIN metal::mulhi
#endif
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
#ifndef R123_USE_64BIT
#define R123_USE_64BIT 0 /* Metal currently (Feb 2019, Specification-2) does not support 64-bit variable types */
#endif
#ifndef R123_ULONG_LONG
/* the longest integer type in Metal (Feb 2019, Specification-2) is a
* 32-bit unsigned int. Let's hope for the best... */
#define R123_ULONG_LONG unsigned int
#endif
#endif

View file

@ -0,0 +1,200 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __msvcfeatures_dot_hpp
#define __msvcfeatures_dot_hpp
//#if _MSVC_FULL_VER <= 15
//#error "We've only tested MSVC_FULL_VER==15."
//#endif
#if !defined(_M_IX86) && !defined(_M_X64)
# error "This code has only been tested on x86 platforms."
{ // maybe an unbalanced brace will terminate the compilation
// You are invited to try Random123 on other architectures, by changing
// the conditions that reach this error, but you should consider it a
// porting exercise and expect to encounter bugs and deficiencies.
// Please let the authors know of any successes (or failures).
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static __inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) _forceinline decl
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
// The basic idiom is:
// #ifndef R123_SOMETHING
// #if some condition
// #define R123_SOMETHING 1
// #else
// #define R123_SOMETHING 0
// #endif
// #endif
// This idiom allows an external user to override any decision
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
// An alternative idiom is:
// #ifndef R123_SOMETHING
// #define R123_SOMETHING (some boolean expression)
// #endif
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
// pp-symbols.
#ifndef R123_USE_AES_NI
#if defined(_M_X64)
#define R123_USE_AES_NI 1
#else
#define R123_USE_AES_NI 0
#endif
#endif
#ifndef R123_USE_SSE4_2
#if defined(_M_X64)
#define R123_USE_SSE4_2 1
#else
#define R123_USE_SSE4_2 0
#endif
#endif
#ifndef R123_USE_SSE4_1
#if defined(_M_X64)
#define R123_USE_SSE4_1 1
#else
#define R123_USE_SSE4_1 0
#endif
#endif
#ifndef R123_USE_SSE
#define R123_USE_SSE 1
#endif
#ifndef R123_USE_AES_OPENSSL
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 0
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 1
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 1
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 1
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 1
#endif
#ifndef R123_USE_INTRIN_H
#define R123_USE_INTRIN_H 1
#endif
#ifndef R123_USE_MULHILO16_ASM
#define R123_USE_MULHILO16_ASM 0
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#if defined(_M_X64)
#define R123_USE_MULHILO64_MSVC_INTRIN 1
#else
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
#pragma warning(disable:4244)
#pragma warning(disable:4996)
// If you add something, it must go in all the other XXfeatures.hpp
// and in ../ut_features.cpp
#endif

View file

@ -0,0 +1,125 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __r123_nvcc_features_dot_h__
#define __r123_nvcc_features_dot_h__
#if !defined(CUDART_VERSION)
#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined"
#endif
#if CUDART_VERSION < 4010
#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h"
// This test was added in Random123-1.08 (August, 2013) because we
// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
// earlier. We can't be sure this bug doesn't also affect invocations
// of other templated functions, e.g., essentially all of Random123.
// Thus, we no longer trust CUDA versions earlier than 4.1 even though
// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
// If you feel lucky or desperate, you can change #error to #warning, but
// please take extra care to be sure that you are getting correct
// results.
#endif
// nvcc falls through to gcc or msvc. So first define
// a couple of things and then include either gccfeatures.h
// or msvcfeatures.h
//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled
//for both device and host functions in CUDA by setting compiler flags
//for the device function
#ifdef __CUDA_ARCH__
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE __device__
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 1
#endif
#ifndef R123_THROW
// No exceptions in CUDA, at least upto 4.0
#define R123_THROW(x) R123_ASSERT(0)
#endif
#ifndef R123_ASSERT
#define R123_ASSERT(x) if((x)) ; else asm("trap;")
#endif
#else // ! __CUDA_ARCH__
// If we're using nvcc not compiling for the CUDA architecture,
// then we must be compiling for the host. In that case,
// tell the philox code to use the mulhilo64 asm because
// nvcc doesn't grok uint128_t.
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 1
#endif
#endif // __CUDA_ARCH__
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
#ifndef R123_USE_SSE4_2
#define R123_USE_SSE4_2 0
#endif
#ifndef R123_USE_SSE4_1
#define R123_USE_SSE4_1 0
#endif
#ifndef R123_USE_SSE
#define R123_USE_SSE 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_ULONG_LONG
// uint64_t, which is what we'd get without this, is
// not the same as unsigned long long
#define R123_ULONG_LONG unsigned long long
#endif
#if defined(__GNUC__)
#include "gccfeatures.h"
#elif defined(_MSC_FULL_VER)
#include "msvcfeatures.h"
#endif
#endif

View file

@ -0,0 +1,50 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __open64features_dot_hpp
#define __open64features_dot_hpp
/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */
/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint
but produce incorrect code for 64-bit philox. The MULHILO64_ASM
seems to work fine */
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 1
#endif
#include "gccfeatures.h"
#endif

View file

@ -0,0 +1,89 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __openclfeatures_dot_hpp
#define __openclfeatures_dot_hpp
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#define R123_ASSERT(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 1
#endif
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of
// ulong to mul_hi. And gets lots of complaints from stdint.h
// on some machines.
// But these typedefs mean we cannot include stdint.h with
// these headers? Do we need R123_64T, R123_32T, R123_8T?
typedef ulong uint64_t;
typedef uint uint32_t;
typedef uchar uint8_t;
#define UINT64_C(x) ((ulong)(x##UL))
#endif

View file

@ -0,0 +1,194 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Copyright (c) 2013, Los Alamos National Security, LLC
All rights reserved.
Copyright 2013. Los Alamos National Security, LLC. This software was produced
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
the U.S. Department of Energy. The U.S. Government has rights to use,
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
to produce derivative works, such modified software should be clearly marked,
so as not to confuse it with the version available from LANL.
*/
#ifndef __pgccfeatures_dot_hpp
#define __pgccfeatures_dot_hpp
#if !defined(__x86_64__) && !defined(__i386__)
# error "This code has only been tested on x86 platforms."
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
the conditions that reach this error, but you should consider it a
porting exercise and expect to encounter bugs and deficiencies.
Please let the authors know of any successes (or failures). */
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static inline
#endif
/* Found this example in PGI's emmintrin.h. */
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) (expr)
#endif
/* PGI through 13.2 doesn't appear to support AES-NI. */
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
/* PGI through 13.2 appears to support MMX, SSE, SSE3, SSE3, SSSE3, SSE4a, and
ABM, but not SSE4.1 or SSE4.2. */
#ifndef R123_USE_SSE4_2
#define R123_USE_SSE4_2 0
#endif
#ifndef R123_USE_SSE4_1
#define R123_USE_SSE4_1 0
#endif
#ifndef R123_USE_SSE
/* There's no point in trying to compile SSE code in Random123
unless SSE2 is available. */
#ifdef __SSE2__
#define R123_USE_SSE 1
#else
#define R123_USE_SSE 0
#endif
#endif
#ifndef R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 1
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time
about undefined references to _mm_castsi128_ps(__m128i). Why? */
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 1
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 1
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 0
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 0
#endif
#ifndef R123_USE_INTRIN_H
#ifdef __ABM__
#define R123_USE_INTRIN_H 1
#else
#define R123_USE_INTRIN_H 0
#endif
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 1
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
/* If you add something, it must go in all the other XXfeatures.hpp
and in ../ut_features.cpp */
#endif

280
external/panphasia_ho/features/sse.h vendored Normal file
View file

@ -0,0 +1,280 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _Random123_sse_dot_h__
#define _Random123_sse_dot_h__
#if R123_USE_SSE
#if R123_USE_X86INTRIN_H
#include <x86intrin.h>
#endif
#if R123_USE_IA32INTRIN_H
#include <ia32intrin.h>
#endif
#if R123_USE_XMMINTRIN_H
#include <xmmintrin.h>
#endif
#if R123_USE_EMMINTRIN_H
#include <emmintrin.h>
#endif
#if R123_USE_SMMINTRIN_H
#include <smmintrin.h>
#endif
#if R123_USE_WMMINTRIN_H
#include <wmmintrin.h>
#endif
#if R123_USE_INTRIN_H
#include <intrin.h>
#endif
#ifdef __cplusplus
#include <iostream>
#include <limits>
#include <stdexcept>
#endif
#if R123_USE_ASM_GNU
/* bit25 of CX tells us whether AES is enabled. */
R123_STATIC_INLINE int haveAESNI(){
unsigned int eax, ebx, ecx, edx;
__asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
"a" (1));
return (ecx>>25) & 1;
}
#elif R123_USE_CPUID_MSVC
R123_STATIC_INLINE int haveAESNI(){
int CPUInfo[4];
__cpuid(CPUInfo, 1);
return (CPUInfo[2]>>25)&1;
}
#else /* R123_USE_CPUID_??? */
#warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
R123_STATIC_INLINE int haveAESNI(){
return 0;
}
#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
// There is a lot of annoying and inexplicable variation in the
// SSE intrinsics available in different compilation environments.
// The details seem to depend on the compiler, the version and
// the target architecture. Rather than insisting on
// R123_USE_feature tests for each of these in each of the
// compilerfeatures.h files we just keep the complexity localized
// to here...
#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
/* Is there an intrinsic to assemble an __m128i from two 64-bit words?
If not, use the 4x32-bit intrisic instead. N.B. It looks like Intel
added _mm_set_epi64x to icc version 12.1 in Jan 2012.
*/
R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
union{
uint64_t u64;
uint32_t u32[2];
} u1, u0;
u1.u64 = v1;
u0.u64 = v0;
return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
}
#endif
/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job
on 64-bit platforms. Unfortunately, both MSVC and Open64 fail
assertions in ut_M128.cpp and ut_carray.cpp when we use the
_mm_cvtsi128_si64 intrinsic. (See
https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
Finally, even if the intrinsic exists, it may be spelled with or
without the 'x'.
*/
#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
union{
uint64_t u64[2];
__m128i m;
}u;
_mm_store_si128(&u.m, si);
return u.u64[0];
}
#elif defined(__llvm__) || defined(__ICC)
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
return (uint64_t)_mm_cvtsi128_si64(si);
}
#else /* GNUC, others */
/* FWIW, gcc's emmintrin.h has had the 'x' spelling
since at least gcc-3.4.4. The no-'x' spelling showed up
around 4.2. */
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
return (uint64_t)_mm_cvtsi128_si64x(si);
}
#endif
#if defined(__GNUC__) && __GNUC__ < 4
/* the cast builtins showed up in gcc4. */
R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
return (__m128)si;
}
#endif
#ifdef __cplusplus
struct r123m128i{
__m128i m;
#if R123_USE_CXX11_UNRESTRICTED_UNIONS
// C++98 forbids a union member from having *any* constructors.
// C++11 relaxes this, and allows union members to have constructors
// as long as there is a "trivial" default construtor. So in C++11
// we can provide a r123m128i constructor with an __m128i argument, and still
// have the default (and hence trivial) default constructor.
r123m128i() = default;
r123m128i(__m128i _m): m(_m){}
#endif
r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
#if R123_USE_CXX11_EXPLICIT_CONVERSIONS
// With C++11 we can attach explicit to the bool conversion operator
// to disambiguate undesired promotions. For g++, this works
// only in 4.5 and above.
explicit operator bool() const {return _bool();}
#else
// Pre-C++11, we have to do something else. Google for the "safe bool"
// idiom for other ideas...
operator const void*() const{return _bool()?this:0;}
#endif
operator __m128i() const {return m;}
private:
#if R123_USE_SSE4_1
bool _bool() const{ return !_mm_testz_si128(m,m); }
#else
bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
#endif
};
R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
__m128i& c = v.m;
__m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
c = _mm_add_epi64(c, zeroone);
//return c;
#if R123_USE_SSE4_1
__m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
__m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
c = _mm_add_epi64(c, onezero);
}
#else
unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
// The low two bits of mask are 11 iff the low 64 bits of
// c are zero.
if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
__m128i onezero = _mm_set_epi64x(1,0);
c = _mm_add_epi64(c, onezero);
}
#endif
return v;
}
R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
__m128i c = lhs.m;
__m128i incr128 = _mm_set_epi64x(0, n);
c = _mm_add_epi64(c, incr128);
// return c; // NO CARRY!
int64_t lo64 = _mm_extract_lo64(c);
if((uint64_t)lo64 < n)
c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
lhs.m = c;
return lhs;
}
// We need this one because it's present, but never used in r123array1xm128i::incr
R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
// The comparisons aren't implemented, but if we leave them out, and
// somebody writes, e.g., M1 < M2, the compiler will do an implicit
// conversion through void*. Sigh...
R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
return !(lhs==rhs);}
R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
return !(lhs==rhs);}
R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
union{
uint64_t u64[2];
__m128i m;
}u;
_mm_storeu_si128(&u.m, m.m);
return os << u.u64[0] << " " << u.u64[1];
}
R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
uint64_t u64[2];
is >> u64[0] >> u64[1];
m.m = _mm_set_epi64x(u64[1], u64[0]);
return is;
}
template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
template <>
inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
r123m128i ret;
ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
return ret;
}
#else
typedef struct {
__m128i m;
} r123m128i;
#endif /* __cplusplus */
#else /* !R123_USE_SSE */
R123_STATIC_INLINE int haveAESNI(){
return 0;
}
#endif /* R123_USE_SSE */
#endif /* _Random123_sse_dot_h__ */

View file

@ -0,0 +1,172 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __sunprofeatures_dot_hpp
#define __sunprofeatures_dot_hpp
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) expr
#endif
// The basic idiom is:
// #ifndef R123_SOMETHING
// #if some condition
// #define R123_SOMETHING 1
// #else
// #define R123_SOMETHING 0
// #endif
// #endif
// This idiom allows an external user to override any decision
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
// An alternative idiom is:
// #ifndef R123_SOMETHING
// #define R123_SOMETHING (some boolean expression)
// #endif
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
// pp-symbols.
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
#ifndef R123_USE_SSE4_2
#define R123_USE_SSE4_2 0
#endif
#ifndef R123_USE_SSE4_1
#define R123_USE_SSE4_1 0
#endif
#ifndef R123_USE_SSE
#define R123_USE_SSE 0
#endif
#ifndef R123_USE_AES_OPENSSL
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 0
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 0
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 0
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 0
#endif
#ifndef R123_USE_INTRIN_H
#define R123_USE_INTRIN_H 0
#endif
#ifndef R123_USE_MULHILO16_ASM
#define R123_USE_MULHILO16_ASM 0
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_ASM
#define R123_USE_MULHILO64_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef R123_USE_PHILOX_64BIT
#define R123_USE_PHILOX_64BIT 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
// If you add something, it must go in all the other XXfeatures.hpp
// and in ../ut_features.cpp
#endif

View file

@ -0,0 +1,210 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Copyright (c) 2013, Los Alamos National Security, LLC
All rights reserved.
Copyright 2013. Los Alamos National Security, LLC. This software was produced
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
the U.S. Department of Energy. The U.S. Government has rights to use,
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
to produce derivative works, such modified software should be clearly marked,
so as not to confuse it with the version available from LANL.
*/
#ifndef __xlcfeatures_dot_hpp
#define __xlcfeatures_dot_hpp
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
# error "This code has only been tested on x86 and PowerPC platforms."
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
the conditions that reach this error, but you should consider it a
porting exercise and expect to encounter bugs and deficiencies.
Please let the authors know of any successes (or failures). */
#endif
#ifdef __cplusplus
/* builtins are automatically available to xlc. To use them with xlc++,
one must include builtins.h. c.f
http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html
*/
#include <builtins.h>
#endif
#ifndef R123_STATIC_INLINE
#define R123_STATIC_INLINE static inline
#endif
#ifndef R123_FORCE_INLINE
#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
#endif
#ifndef R123_CUDA_DEVICE
#define R123_CUDA_DEVICE
#endif
#ifndef R123_ASSERT
#include <assert.h>
#define R123_ASSERT(x) assert(x)
#endif
#ifndef R123_BUILTIN_EXPECT
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif
#ifndef R123_USE_AES_NI
#define R123_USE_AES_NI 0
#endif
#ifndef R123_USE_SSE4_2
#define R123_USE_SSE4_2 0
#endif
#ifndef R123_USE_SSE4_1
#define R123_USE_SSE4_1 0
#endif
#ifndef R123_USE_SSE
#define R123_USE_SSE 0
#endif
#ifndef R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define R123_USE_AES_OPENSSL 0
#endif
#ifndef R123_USE_GNU_UINT128
#define R123_USE_GNU_UINT128 0
#endif
#ifndef R123_USE_ASM_GNU
#define R123_USE_ASM_GNU 1
#endif
#ifndef R123_USE_CPUID_MSVC
#define R123_USE_CPUID_MSVC 0
#endif
#ifndef R123_USE_X86INTRIN_H
#define R123_USE_X86INTRIN_H 0
#endif
#ifndef R123_USE_IA32INTRIN_H
#define R123_USE_IA32INTRIN_H 0
#endif
#ifndef R123_USE_XMMINTRIN_H
#define R123_USE_XMMINTRIN_H 0
#endif
#ifndef R123_USE_EMMINTRIN_H
#define R123_USE_EMMINTRIN_H 0
#endif
#ifndef R123_USE_SMMINTRIN_H
#define R123_USE_SMMINTRIN_H 0
#endif
#ifndef R123_USE_WMMINTRIN_H
#define R123_USE_WMMINTRIN_H 0
#endif
#ifndef R123_USE_INTRIN_H
#ifdef __ABM__
#define R123_USE_INTRIN_H 1
#else
#define R123_USE_INTRIN_H 0
#endif
#endif
#ifndef R123_USE_MULHILO32_ASM
#define R123_USE_MULHILO32_ASM 0
#endif
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
#if (defined(__powerpc64__))
#define R123_USE_MULHILO64_MULHI_INTRIN 1
#else
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#endif
#ifndef R123_MULHILO64_MULHI_INTRIN
#define R123_MULHILO64_MULHI_INTRIN __mulhdu
#endif
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
#define R123_USE_MULHILO32_MULHI_INTRIN 0
#endif
#ifndef R123_MULHILO32_MULHI_INTRIN
#define R123_MULHILO32_MULHI_INTRIN __mulhwu
#endif
#ifndef R123_USE_MULHILO64_ASM
#if defined(__powerpc64__)
#define R123_USE_MULHILO64_ASM (1 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN))
#else
#define R123_USE_MULHILO64_ASM (0 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN))
#endif
#endif
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
#define R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
#define R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
/* If you add something, it must go in all the other XXfeatures.hpp
and in ../ut_features.cpp */
#endif

File diff suppressed because it is too large Load diff

147
external/panphasia_ho/main.c vendored Normal file
View file

@ -0,0 +1,147 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <complex.h>
#include <fftw3-mpi.h>
#include "PAN_FFTW3.h"
#include "panphasia_functions.h"
extern size_t descriptor_base_size;
#ifdef USE_OPENMP
#include <omp.h>
int threads_ok;
int number_omp_threads = 1;
#endif
#if 0 // this is now unused since all this has been migrated to plugin!
// does the same as the main below, but does not initialise MPI or FFTW (this should be done in MONOFONIC)
int PANPHASIA_HO_main(const char *descriptor, size_t *ngrid_load)
{
int verbose = 0;
int error;
size_t x0 = 0, y0 = 0, z0 = 0;
size_t rel_level;
int fdim=1; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid
//char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
PANPHASIA_init_descriptor_(descriptor, &verbose);
printf("Descriptor %s\n ngrid_load %llu\n",descriptor,*ngrid_load);
// Choose smallest value of level to equal of exceed *ngrid_load)
for (rel_level=0; fdim*(descriptor_base_size<<(rel_level+1))<=*ngrid_load; rel_level++);
printf("Setting relative level = %llu\n",rel_level);
if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))
{
printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
abort();
};
//======================= FFTW ==============================
ptrdiff_t alloc_local, local_n0, local_0_start;
ptrdiff_t N0 = fdim*(descriptor_base_size << rel_level);
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 +2 , MPI_COMM_WORLD, &local_n0, &local_0_start);
FFTW_COMPLEX *Panphasia_White_Noise_Field;
Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
if (error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))
{
printf("Error code from PANPHASIA_compute ... %d\n", error);
};
fftw_free(Panphasia_White_Noise_Field);
return(0);
}
#endif
#ifdef STANDALONE_PANPHASIA_HO
int main(int argc, char **argv)
{
int verbose = 0;
int error;
size_t x0 = 0, y0 = 0, z0 = 0;
size_t rel_level;
char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
#ifdef USE_OPENMP
omp_set_num_threads(number_omp_threads);
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
threads_ok = provided >= MPI_THREAD_FUNNELED;
if (threads_ok)
threads_ok = fftw_init_threads();
fftw_mpi_init();
int num_threads = number_omp_threads;
if (threads_ok)
{
fftw_plan_with_nthreads(num_threads);
}
else
{
printf("Failure to initialise threads ...\n");
MPI_Finalize();
};
printf("OpenMP threads enabled with FFTW. Number of threads %d\n", fftw_planner_nthreads());
#else
MPI_Init(&argc, &argv);
#endif
PANPHASIA_init_descriptor_(descriptor, &verbose);
rel_level = 6; //Set size of test dataset
if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))
{
printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
abort();
};
//======================= FFTW ==============================
fftw_mpi_init();
ptrdiff_t alloc_local, local_n0, local_0_start;
ptrdiff_t N0 = descriptor_base_size << rel_level;
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0+2, MPI_COMM_WORLD, &local_n0, &local_0_start);
FFTW_COMPLEX *Panphasia_White_Noise_Field;
Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
if (error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))
{
printf("Error code from PANPHASIA_compute ... %d\n", error);
};
fftw_free(Panphasia_White_Noise_Field);
fftw_mpi_cleanup();
//==================== End FFTW ===========================
MPI_Finalize();
return(0);
}
#endif // STANDALONE_PANPHASIA_HO

22
external/panphasia_ho/makefile vendored Normal file
View file

@ -0,0 +1,22 @@
CC = mpicc -qopenmp
G99 = gcc -fopenmp
GSL_LIBS = -lgsl -lgslcblas
#CCFLAGS = $(CFLAGS) $(GSL_LIBS) -O3 -qopt-zmm-usage=high -vec-threshold0 -lfftw3 -lfftw3f -lfftw3_omp -lfftw3_mpi -lfftw3f_mpi
CCFLAGS = $(CFLAGS) $(GSL_LIBS) -O3 -qopt-zmm-usage=high -vec-threshold0 -lfftw3 -lfftw3f -lfftw3_omp -lfftw3_mpi -lfftw3f_mpi -DUSE_OPENMP
pan_fftw3_test_code.x: main.o high_order_panphasia_routines.o pan_mpi_routines.o uniform_rand_threefry4x64.o
$(CC) $(CCFLAGS) *.o -o pan_fftw3_test_code.x
main.o: main.c
$(CC) $(CCFLAGS) -c main.c
high_order_panphasia_routines.o: high_order_panphasia_routines.c
$(CC) $(CCFLAGS) -c high_order_panphasia_routines.c
pan_mpi_routines.o: pan_mpi_routines.c
$(CC) $(CCFLAGS) -c pan_mpi_routines.c
uniform_rand_threefry4x64.o: uniform_rand_threefry4x64.c
$(CC) $(CCFLAGS) -c uniform_rand_threefry4x64.c
clean:
rm *.o

File diff suppressed because it is too large Load diff

533
external/panphasia_ho/pan_mpi_routines.c vendored Normal file
View file

@ -0,0 +1,533 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <complex.h>
#include <fftw3-mpi.h>
#include "PAN_FFTW3.h"
#include "panphasia_functions.h"
#ifdef USE_OPENMP
#include <omp.h>
#endif
extern const int Nbasis;
extern const int irank_p[3][84];
extern size_t descriptor_order;
extern size_t descriptor_kk_limit;
extern size_t descriptor_base_size;
int PANPHASIA_compute_kspace_field_(size_t relative_level, ptrdiff_t N0_fourier_grid,
ptrdiff_t local_n0_fourier_return, ptrdiff_t local_0_start_fourier_return,
FFTW_COMPLEX *return_field)
{
size_t copy_list[Nbasis];
int pmax = 6;
int nsubdivide = 21; //(pmax%2==0)?pmax+1:pmax+2;
size_t ncopy = (pmax + 1) * (pmax + 2) * (pmax + 3) / 6;
if (ncopy % nsubdivide != 0)
return (100010);
int nchunk = ncopy / nsubdivide;
int verbose = 1;
int flag_output_mode = 2;
int error;
ptrdiff_t size_to_alloc_fourier;
ptrdiff_t size_to_alloc_pan;
ptrdiff_t local_fourier_x_start, local_fourier_x_end;
FFTW_PLAN output_coeff_forward_plan;
ptrdiff_t N0_pan_grid = descriptor_base_size << relative_level;
if (N0_fourier_grid % N0_pan_grid != 0)
return (100015);
int fdim = N0_fourier_grid / N0_pan_grid;
size_t nfft_dim = N0_fourier_grid;
size_t npan_dim = N0_pan_grid;
int SHARED_FOUR_PAN_SPACE = (nsubdivide == 1) && (fdim == 1) && (sizeof(PAN_REAL) == sizeof(FFTW_REAL));
////////////////////////////////////////////////////////////////////////////////////
if (pmax > descriptor_order)
return (100020);
for (size_t i = 0; i < Nbasis; i++)
copy_list[i] = i;
//printf("Dimensions of FT (%td,%td,%td)\n",N0_fourier_grid,N0_fourier_grid,N0_fourier_grid);
//printf("Dimensions of PG (%td,%td,%td)\n",N0_pan_grid,N0_pan_grid,N0_pan_grid);
//printf("local_no %td local_0_start_fourier %td\n",local_n0_fourier_return, local_0_start_fourier_return);
// Compute 1-D Spherical Bessel coefficients for each order //////////////////
// These are needed for the convolutions below //////////////////
size_t n4dimen;
n4dimen = (nfft_dim % 4 == 0) ? 4 * (nfft_dim / 4) + 4 : 4 * (nfft_dim / 4) + 5;
double complex *sph_bessel_coeff = FFTW_MALLOC(sizeof(double complex) * n4dimen * (pmax + 1));
if (sph_bessel_coeff == NULL)
return (100030);
compute_sph_bessel_coeffs(nfft_dim, pmax, n4dimen, fdim, sph_bessel_coeff);
//printf("Reached here! ndimen4 %ld\n",n4dimen);
///////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////
// Determine sizes of Fourier and Panphasia coefficient 3-D arrays //
ptrdiff_t local_n0_pan;
ptrdiff_t local_0_start_pan;
{
int rank = 3;
ptrdiff_t local_n0_fourier;
ptrdiff_t local_0_start_fourier;
const ptrdiff_t ndimens_alloc_fourier[] = {N0_fourier_grid, N0_fourier_grid, N0_fourier_grid + 2}; // Allocated for r2c
ptrdiff_t howmany = nchunk;
size_to_alloc_fourier = FFTW_MPI_LOCAL_SIZE_MANY(rank, ndimens_alloc_fourier, howmany,
FFTW_MPI_DEFAULT_BLOCK, MPI_COMM_WORLD,
&local_n0_fourier, &local_0_start_fourier);
if (local_0_start_fourier != local_0_start_fourier_return)
{
printf("Error local_0_start_fourier!=local_0_start_fourier_return\n");
return (100032);
};
if (local_n0_fourier != local_n0_fourier_return)
{
printf("Error local_n0_fourier!=local_n0_fourier_return\n");
return (100033);
};
ptrdiff_t abs_fourier_x_start, abs_fourier_x_end;
if (local_0_start_fourier_return % fdim == 0)
{
abs_fourier_x_start = local_0_start_fourier_return;
}
else
{
abs_fourier_x_start = local_0_start_fourier_return + fdim - (local_0_start_fourier_return % fdim);
};
if ((local_0_start_fourier_return + local_n0_fourier_return - 1) % fdim == 0)
{
abs_fourier_x_end = local_0_start_fourier_return + local_n0_fourier_return - 1;
}
else
{
abs_fourier_x_end = (local_0_start_fourier_return + local_n0_fourier_return - 1) -
((local_0_start_fourier_return + local_n0_fourier_return - 1) % fdim);
};
if ((abs_fourier_x_end - abs_fourier_x_start) % fdim != 0)
return (100036);
local_0_start_pan = abs_fourier_x_start / fdim;
local_n0_pan = 1 + (abs_fourier_x_end - abs_fourier_x_start) / fdim;
local_fourier_x_start = abs_fourier_x_start - local_0_start_fourier_return;
local_fourier_x_end = abs_fourier_x_end - local_0_start_fourier_return;
// const ptrdiff_t ndimens_alloc_pan[] = {N0_pan_grid, N0_pan_grid, N0_pan_grid+2};
howmany = ncopy;
size_to_alloc_pan = howmany * local_n0_pan * N0_pan_grid * (N0_pan_grid + 2);
//printf("size_to_alloc_fourier = %td\n",size_to_alloc_fourier);
//printf("size_to_alloc_pan = %td\n",size_to_alloc_pan);
};
/////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////
// Allocate arrays to store Panphasia coefficients and Fourier information
// If nsubdivide==1 then use the same structure to store both.
void *panphasia_coefficients = FFTW_MALLOC(sizeof(PAN_REAL) * size_to_alloc_pan);
void *fourier_grids;
if (panphasia_coefficients == NULL)
return (100040);
void *mode_weightings = FFTW_MALLOC(sizeof(FFTW_REAL) * size_to_alloc_fourier / nchunk);
FFTW_REAL *ptr_mode_weightings;
ptr_mode_weightings = mode_weightings;
memset(ptr_mode_weightings, 0, sizeof(FFTW_REAL) * size_to_alloc_fourier / nchunk);
FFTW_REAL *ptr_real_fourier_grid;
FFTW_REAL *ptr_panphasia_coefficients = panphasia_coefficients;
FFTW_COMPLEX *ptr_cmplx_fourier_grid;
if (SHARED_FOUR_PAN_SPACE)
{
ptr_real_fourier_grid = panphasia_coefficients;
ptr_cmplx_fourier_grid = panphasia_coefficients;
}
else
{
fourier_grids = FFTW_MALLOC(sizeof(FFTW_REAL) * size_to_alloc_fourier);
if (fourier_grids == NULL)
return (100041);
ptr_real_fourier_grid = fourier_grids;
ptr_cmplx_fourier_grid = fourier_grids;
};
///////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////
//////////// Compute the Panphasia coefficients ////////////////////////////////
{
size_t xorigin = local_0_start_pan, yorigin = 0, zorigin = 0;
size_t xextent = local_n0_pan, yextent = N0_pan_grid, zextent = N0_pan_grid;
if ((error = PANPHASIA_compute_coefficients_(&xorigin, &yorigin, &zorigin, &xextent, &yextent,
&zextent, copy_list, &ncopy, panphasia_coefficients,
&flag_output_mode, &verbose)))
return (100100 + error);
};
///////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////
////////// Output diagnostics for small grids only
{
if (N0_pan_grid < -1)
{
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
char filename[100];
sprintf(filename, "output_real_space_field.%d", rank);
FILE *fp;
printf("local_n0_pan %ld npan_dim %ld N0_pan_grid %ld\n", local_n0_pan,
npan_dim, N0_pan_grid);
fp = fopen(filename, "w");
for (int ix = 0; ix < local_n0_pan; ix++)
for (int iy = 0; iy < npan_dim; iy++)
for (int iz = 0; iz < npan_dim; iz++)
{
int index = ix * N0_pan_grid * (N0_pan_grid + 2) + iy * (N0_pan_grid + 2) + iz;
fprintf(fp, "%6lu%6d%6d %14.8lf %d\n", ix + local_0_start_pan, iy, iz,
ptr_panphasia_coefficients[index], index);
};
fclose(fp);
};
};
//----------------------------------------------------------------------------------
////////////// Set up FTTW plan //////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// printf("Making the plan ... \n");
//////////////////// Make plan for ncopy interleaved FTs ///////////////////////////
//////////////////////////////////////////////////////////////////////////
{
int rank = 3;
const ptrdiff_t ndimens[3] = {N0_fourier_grid, N0_fourier_grid, N0_fourier_grid};
ptrdiff_t howmany = nchunk;
ptrdiff_t block = FFTW_MPI_DEFAULT_BLOCK;
ptrdiff_t tblock = FFTW_MPI_DEFAULT_BLOCK;
unsigned flags = FFTW_ESTIMATE;
output_coeff_forward_plan = FFTW_MPI_PLAN_MANY_DTF_R2C(rank, ndimens,
howmany, block, tblock,
ptr_real_fourier_grid, ptr_cmplx_fourier_grid,
MPI_COMM_WORLD, flags);
if (output_coeff_forward_plan == NULL)
{
printf("Null plan\n");
return (100051);
};
};
//printf("Plan completed ... \n");
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
//----------------------------------------------------------------------------------
memset(return_field, 0, local_n0_fourier_return * N0_fourier_grid * (N0_fourier_grid + 2) * sizeof(FFTW_REAL));
for (int iter = 0; iter < nsubdivide; iter++)
{
int moffset = iter * nchunk;
if (!SHARED_FOUR_PAN_SPACE)
{
memset(ptr_real_fourier_grid, 0, sizeof(FFTW_REAL) * size_to_alloc_fourier);
// Copy Panphasia coefficients to Fourier grid with appropriate stride - fdim
size_t index_p, index_f;
int m;
for (int ix_f = local_fourier_x_start, ix_p = 0; ix_f <= local_fourier_x_end; ix_f += fdim, ix_p++)
#ifdef USE_OPENMP
#pragma omp parallel for collapse(2) private(index_p, index_f, m)
#endif
for (int iy_f = 0, iy_p = 0; iy_f < nfft_dim; iy_f += fdim, iy_p++)
for (int iz_f = 0, iz_p = 0; iz_f < nfft_dim; iz_f += fdim, iz_p++)
{
index_p = ix_p * N0_pan_grid * (N0_pan_grid + 2) + iy_p * (N0_pan_grid + 2) + iz_p;
index_f = ix_f * N0_fourier_grid * (N0_fourier_grid + 2) + iy_f * (N0_fourier_grid + 2) + iz_f;
for (m = 0; m < nchunk; m++)
{
ptr_real_fourier_grid[nchunk * index_f + m] =
ptr_panphasia_coefficients[ncopy * index_p + moffset + m];
};
};
};
FFTW_MPI_EXECUTE_DFT_R2C(output_coeff_forward_plan,
ptr_real_fourier_grid, ptr_cmplx_fourier_grid);
{
// Convolve and combine the FT of the Panphasia coefficient field
size_t index1, index2;
complex weight;
//int m;
#ifdef USE_OPENMP
#pragma omp parallel for collapse(3) private(index1, index2, weight, m)
#endif
for (int ix = 0; ix < local_n0_fourier_return; ix++)
for (int iy = 0; iy < nfft_dim; iy++)
for (int iz = 0; iz <= nfft_dim / 2; iz++)
{
index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
for (int m = 0; m < nchunk; m++)
{
index2 = nchunk * index1 + m;
weight = sph_bessel_coeff[n4dimen * irank_p[0][m + moffset] + ix + local_0_start_fourier_return] *
sph_bessel_coeff[n4dimen * irank_p[1][m + moffset] + iy] *
sph_bessel_coeff[n4dimen * irank_p[2][m + moffset] + iz];
return_field[index1] += weight * ptr_cmplx_fourier_grid[index2];
ptr_mode_weightings[index1] += cabs(weight) * cabs(weight);
};
};
};
}; // End loop over iter
// printf("Reached here 10!\n");
//Add phase shift and normalise field
{
double complex phase_shift_and_scale;
int kx, ky, kz;
const double pi = 4.0 * atan(1.0);
size_t index1;
double min_weight = 100.0;
#ifdef USE_OPENMP
#pragma omp parallel for collapse(3) private(index1, kx, ky, kz, phase_shift_and_scale)
#endif
for (int ix = 0; ix < local_n0_fourier_return; ix++)
for (int iy = 0; iy < nfft_dim; iy++)
for (int iz = 0; iz <= nfft_dim / 2; iz++)
{
index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
kx = (ix + local_0_start_fourier_return > nfft_dim / 2) ? ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return;
ky = (iy > nfft_dim / 2) ? iy - nfft_dim : iy;
kz = (iz > nfft_dim / 2) ? iz - nfft_dim : iz;
if ((kx == nfft_dim / 2) || (ky == nfft_dim / 2) || (kz == nfft_dim / 2))
{
// Set Nyquist modes to zero - not used by IC_Gen anyway.
phase_shift_and_scale = 0.0; //1.0/pow((double)nfft_dim,1.5); // No phase shift
ptr_mode_weightings[index1] = 0.0; // Set squared weight to zero
}
else
{
phase_shift_and_scale = sqrt((double)(fdim * fdim * fdim)) *
cexp((double)fdim * (-I) * pi * (double)(kx + ky + kz) / sqrt(ptr_mode_weightings[index1]) /
(double)nfft_dim) /
pow((double)nfft_dim, 1.5);
};
return_field[index1] *= phase_shift_and_scale;
if (ptr_mode_weightings[index1] < min_weight)
min_weight = ptr_mode_weightings[index1];
};
// printf("Minimum weight %lf\n",sqrt(min_weight));
};
// printf("Reached here 11!\n");
// Rescale selected Fourier modes to unit amplitude.
// By default this part is not executed.
if (descriptor_kk_limit > 0)
{
size_t index1;
complex weight;
size_t ksquared;
int kx, ky, kz;
#ifdef USE_OPENMP
#pragma omp parallel for collapse(3) private(index1, kx, ky, kz, ksquared, weight)
#endif
for (int ix = 0; ix < local_n0_fourier_return; ix++)
for (int iy = 0; iy < nfft_dim; iy++)
for (int iz = 0; iz <= nfft_dim / 2; iz++)
{
kx = (ix + local_0_start_fourier_return > nfft_dim / 2) ? ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return;
ky = (iy > nfft_dim / 2) ? iy - nfft_dim : iy;
kz = (iz > nfft_dim / 2) ? iz - nfft_dim : iz;
ksquared = kx * kx + ky * ky + kz * kz;
if ((kx != nfft_dim / 2) && (ky != nfft_dim / 2) && (kz != nfft_dim / 2))
{ //Omit Nyquist modes
if ((ksquared <= descriptor_kk_limit) && (ksquared != 0))
{
index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
weight = cabs(return_field[index1]);
return_field[index1] /= weight;
};
};
};
};
//printf("Reached here 12!\n");
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
char filename[100];
sprintf(filename, "output_k_space_alt.%d", rank);
FILE *fp;
if (nfft_dim < -1)
{
FILE *fp;
fp = fopen(filename, "w");
for (int ix = 0; ix < local_n0_fourier_return; ix++)
for (int iy = 0; iy < nfft_dim; iy++)
for (int iz = 0; iz <= nfft_dim / 2; iz++)
{
int index = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
fprintf(fp, "%6lu%6d%6d %14.8lf %14.8lf %14.8lf \n", ix + local_0_start_fourier_return, iy, iz,
creal(return_field[index]), cimag(return_field[index]), sqrt(ptr_mode_weightings[index]));
// ptr_mode_weightings[index]);
};
fclose(fp);
}
else
{
fp = fopen(filename, "w");
for (int ix = 0; ix < local_n0_fourier_return; ix++)
for (int iy = 0; iy < nfft_dim; iy++)
for (int iz = 0; iz <= nfft_dim / 2; iz++)
{
if (ix + iy + iz + local_0_start_fourier_return < 100)
{
int index = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
fprintf(fp, "%6lu%6d%6d %14.8lf %14.8lf %14.8lf \n", ix + local_0_start_fourier_return, iy, iz,
creal(return_field[index]), cimag(return_field[index]), sqrt(ptr_mode_weightings[index]));
// ptr_mode_weightings[index]);
};
};
fclose(fp);
};
// Transpose output field
{
FFTW_PLAN transpose_output_plan;
unsigned flags = FFTW_ESTIMATE;
void *ptr_inter = return_field;
FFTW_REAL *ptr_return_as_real_field;
ptr_return_as_real_field = ptr_inter;
// int rank = 2;
// ptrdiff_t size_to_transpose;
ptrdiff_t howmany = N0_fourier_grid + 2;
// ptrdiff_t local_n0, local_0_start;
// ptrdiff_t local_n1, local_1_start;
// const ptrdiff_t ndimens[] = {N0_fourier_grid, N0_fourier_grid};
// size_to_transpose = FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED(rank, ndimens, howmany,
// FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
// MPI_COMM_WORLD,&local_n0, &local_0_start,&local_n1, &local_1_start);
// printf("size_to_transpose = %td\n",size_to_transpose);
transpose_output_plan = FFTW_MPI_PLAN_MANY_TRANSPOSE(N0_fourier_grid, N0_fourier_grid,
howmany, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
ptr_return_as_real_field, ptr_return_as_real_field,
MPI_COMM_WORLD, flags);
//printf("Transpose plan completed.\n");
FFTW_MPI_EXECUTE_R2R(transpose_output_plan, ptr_return_as_real_field, ptr_return_as_real_field);
//printf("Transpose completed.\n");
};
// Free all memory assigned by FFTW_MALLOC
FFTW_FREE(mode_weightings);
FFTW_FREE(sph_bessel_coeff);
FFTW_FREE(panphasia_coefficients);
if (!SHARED_FOUR_PAN_SPACE)
FFTW_FREE(fourier_grids);
FFTW_DESTROY_PLAN(output_coeff_forward_plan);
//printf("Reached end of PANPHASIA_compute_kspace_field_\n");
return (0);
}
//==========================================================================================
//==========================================================================================

View file

@ -0,0 +1,99 @@
/////////////////////////////////////////////////
// By default Panphasia is computed at single
// precision. To override this define PAN_DOUBLE
#pragma once
#ifndef USE_PRECISION_FLOAT
#define PAN_DOUBLE_PRECISION 8
#endif
#ifndef PAN_DOUBLE_PRECISION
#define PAN_REAL float
#define PAN_COMPLEX float complex
#else
#define PAN_REAL double
#define PAN_COMPLEX double complex
#endif
#include "PAN_FFTW3.h"
/////////////////////////////////////////////////////////////////////
void return_uniform_pseudo_rands_threefry4x64_(size_t l,size_t j1,size_t j2,size_t j3,
PAN_REAL *panphasia_randoms, size_t seed_value,
size_t allow_non_zero_seed_safety_catch);
void box_muller_(PAN_REAL *unif_real,PAN_REAL *gvar);
void solve_panphasia_cell_(PAN_REAL *input_vec_parent, PAN_REAL *input_vec_children, PAN_REAL *output_cell_vec, int control_flag);
void threefry4x64_test_(int verbose);
void inverse_threefry4x64_test_(int verbose);
void set_panphasia_key_(int verbose);
void check_panphasia_key_(int verbose);
void PANPHASIA_init_descriptor_checks();
void speed_test_();
void speed_test2_();
void check_randoms_();
void test_random_dist_(size_t shift);
void compute_all_properties_of_a_panphasia_cell_(size_t *level, size_t *j1, size_t *j2, size_t *j3,
PAN_REAL *gauss_rand_parent, PAN_REAL *legendre_rand);
void return_root_legendre_coefficients_(PAN_REAL *root);
int parse_and_validate_descriptor_(const char *, int *);
int demo_descriptor_();
long long int compute_check_digit_();
int PANPHASIA_init_descriptor_(const char *descriptor, int *verbose);
int PANPHASIA_init_level_(size_t *oct_level, size_t *rel_orig_x, size_t *rel_orig_y,size_t *rel_orig_z,int *verbose);
int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t*zstart,
size_t *xextent, size_t *yextent, size_t *zextend,
size_t *copy_list,
size_t *ncopy, void *output_values, int *flag_output_mode, int *verbose);
void test_moments_();
void test_propogation_of_moments_(int iterations);
void test_cell_moments(const char*,size_t, size_t, size_t, size_t, size_t, double *);
void spherical_bessel_(int *, double *, double *);
void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel,size_t *xabs, size_t *yabs,size_t *zabs);
int cell_information(size_t cell_id, size_t *cumulative_cell_index, size_t *cuboid_x_dimen,
size_t *cuboid_y_dimen,size_t *cuboid_z_dimen, size_t *cell_lev,
size_t *cell_x, size_t *cell_y, size_t *cell_z, size_t number_children,
size_t *child_cell_indices);
int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinates,
size_t extent, size_t *return_tree_list_coordinates,
size_t nreturn,
long long int *child_pointer, size_t *level_count, size_t *level_begin, size_t *index_perm);
#ifdef __cplusplus
void compute_sph_bessel_coeffs(int, int, int, int, std::complex<double>* *);
#else
void compute_sph_bessel_coeffs(int, int, int, int, double complex *);
#endif
int PANPHASIA_compute_kspace_field_(size_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, FFTW_COMPLEX *);

874
external/panphasia_ho/threefry.h vendored Normal file
View file

@ -0,0 +1,874 @@
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _threefry_dot_h_
#define _threefry_dot_h_
#include "features/compilerfeatures.h"
#include "array.h"
/** \cond HIDDEN_FROM_DOXYGEN */
/* Significant parts of this file were copied from
from:
Skein_FinalRnd/ReferenceImplementation/skein.h
Skein_FinalRnd/ReferenceImplementation/skein_block.c
in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
This file has been modified so that it may no longer perform its originally
intended function. If you're looking for a Skein or Threefish source code,
please consult the original file.
The original file had the following header:
**************************************************************************
**
** Interface declarations and internal definitions for Skein hashing.
**
** Source code author: Doug Whiting, 2008.
**
** This algorithm and source code is released to the public domain.
**
***************************************************************************
*/
/* See comment at the top of philox.h for the macro pre-process
strategy. */
/* Rotation constants: */
enum r123_enum_threefry64x4 {
/* These are the R_256 constants from the Threefish reference sources
with names changed to R_64x4... */
R_64x4_0_0=14, R_64x4_0_1=16,
R_64x4_1_0=52, R_64x4_1_1=57,
R_64x4_2_0=23, R_64x4_2_1=40,
R_64x4_3_0= 5, R_64x4_3_1=37,
R_64x4_4_0=25, R_64x4_4_1=33,
R_64x4_5_0=46, R_64x4_5_1=12,
R_64x4_6_0=58, R_64x4_6_1=22,
R_64x4_7_0=32, R_64x4_7_1=32
};
enum r123_enum_threefry64x2 {
/*
// Output from skein_rot_search: (srs64_B64-X1000)
// Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
// Start: Tue Mar 1 10:07:48 2011
// rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
*/
R_64x2_0_0=16,
R_64x2_1_0=42,
R_64x2_2_0=12,
R_64x2_3_0=31,
R_64x2_4_0=16,
R_64x2_5_0=32,
R_64x2_6_0=24,
R_64x2_7_0=21
/* 4 rounds: minHW = 4 [ 4 4 4 4 ]
// 5 rounds: minHW = 8 [ 8 8 8 8 ]
// 6 rounds: minHW = 16 [ 16 16 16 16 ]
// 7 rounds: minHW = 32 [ 32 32 32 32 ]
// 8 rounds: minHW = 64 [ 64 64 64 64 ]
// 9 rounds: minHW = 64 [ 64 64 64 64 ]
//10 rounds: minHW = 64 [ 64 64 64 64 ]
//11 rounds: minHW = 64 [ 64 64 64 64 ] */
};
enum r123_enum_threefry32x4 {
/* Output from skein_rot_search: (srs-B128-X5000.out)
// Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
// Start: Mon Aug 24 22:41:36 2009
// ...
// rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
R_32x4_0_0=10, R_32x4_0_1=26,
R_32x4_1_0=11, R_32x4_1_1=21,
R_32x4_2_0=13, R_32x4_2_1=27,
R_32x4_3_0=23, R_32x4_3_1= 5,
R_32x4_4_0= 6, R_32x4_4_1=20,
R_32x4_5_0=17, R_32x4_5_1=11,
R_32x4_6_0=25, R_32x4_6_1=10,
R_32x4_7_0=18, R_32x4_7_1=20
/* 4 rounds: minHW = 3 [ 3 3 3 3 ]
// 5 rounds: minHW = 7 [ 7 7 7 7 ]
// 6 rounds: minHW = 12 [ 13 12 13 12 ]
// 7 rounds: minHW = 22 [ 22 23 22 23 ]
// 8 rounds: minHW = 31 [ 31 31 31 31 ]
// 9 rounds: minHW = 32 [ 32 32 32 32 ]
//10 rounds: minHW = 32 [ 32 32 32 32 ]
//11 rounds: minHW = 32 [ 32 32 32 32 ] */
};
enum r123_enum_threefry32x2 {
/* Output from skein_rot_search (srs32x2-X5000.out)
// Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
// Start: Tue Jul 12 11:11:33 2011
// rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
R_32x2_0_0=13,
R_32x2_1_0=15,
R_32x2_2_0=26,
R_32x2_3_0= 6,
R_32x2_4_0=17,
R_32x2_5_0=29,
R_32x2_6_0=16,
R_32x2_7_0=24
/* 4 rounds: minHW = 4 [ 4 4 4 4 ]
// 5 rounds: minHW = 6 [ 6 8 6 8 ]
// 6 rounds: minHW = 9 [ 9 12 9 12 ]
// 7 rounds: minHW = 16 [ 16 24 16 24 ]
// 8 rounds: minHW = 32 [ 32 32 32 32 ]
// 9 rounds: minHW = 32 [ 32 32 32 32 ]
//10 rounds: minHW = 32 [ 32 32 32 32 ]
//11 rounds: minHW = 32 [ 32 32 32 32 ] */
};
enum r123_enum_threefry_wcnt {
WCNT2=2,
WCNT4=4
};
#if R123_USE_64BIT
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
{
return (x << (N & 63)) | (x >> ((64-N) & 63));
}
#endif
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
{
return (x << (N & 31)) | (x >> ((32-N) & 31));
}
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_KS_PARITY32 0x1BD11BDA
/** \endcond */
#ifndef THREEFRY2x32_DEFAULT_ROUNDS
#define THREEFRY2x32_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY2x64_DEFAULT_ROUNDS
#define THREEFRY2x64_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY4x32_DEFAULT_ROUNDS
#define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY4x64_DEFAULT_ROUNDS
#define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
#define _threefry2x_tpl(W) \
typedef struct r123array2x##W threefry2x##W##_ctr_t; \
typedef struct r123array2x##W threefry2x##W##_key_t; \
typedef struct r123array2x##W threefry2x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
uint##W##_t X0,X1; \
uint##W##_t ks0, ks1, ks2; \
R123_ASSERT(Nrounds<=32); \
ks2 = SKEIN_KS_PARITY##W; \
ks0 = k.v[0]; \
X0 = in.v[0] + ks0; \
ks2 ^= ks0; \
\
ks1 = k.v[1]; \
X1 = in.v[1] + ks1; \
ks2 ^= ks1; \
\
if(Nrounds>0){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
if(Nrounds>1){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
if(Nrounds>2){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
if(Nrounds>3){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
if(Nrounds>3){ \
/* InjectKey(r=1) */ \
X0 += ks1; X1 += ks2; \
X1 += 1; /* X.v[2-1] += r */ \
} \
if(Nrounds>4){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
if(Nrounds>5){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
if(Nrounds>6){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
if(Nrounds>7){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
if(Nrounds>7){ \
/* InjectKey(r=2) */ \
X0 += ks2; X1 += ks0; \
X1 += 2; \
} \
if(Nrounds>8){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
if(Nrounds>9){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
if(Nrounds>10){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
if(Nrounds>11){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
if(Nrounds>11){ \
/* InjectKey(r=3) */ \
X0 += ks0; X1 += ks1; \
X1 += 3; \
} \
if(Nrounds>12){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
if(Nrounds>13){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
if(Nrounds>14){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
if(Nrounds>15){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
if(Nrounds>15){ \
/* InjectKey(r=4) */ \
X0 += ks1; X1 += ks2; \
X1 += 4; \
} \
if(Nrounds>16){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
if(Nrounds>17){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
if(Nrounds>18){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
if(Nrounds>19){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
if(Nrounds>19){ \
/* InjectKey(r=5) */ \
X0 += ks2; X1 += ks0; \
X1 += 5; \
} \
if(Nrounds>20){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
if(Nrounds>21){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
if(Nrounds>22){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
if(Nrounds>23){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
if(Nrounds>23){ \
/* InjectKey(r=6) */ \
X0 += ks0; X1 += ks1; \
X1 += 6; \
} \
if(Nrounds>24){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
if(Nrounds>25){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
if(Nrounds>26){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
if(Nrounds>27){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
if(Nrounds>27){ \
/* InjectKey(r=7) */ \
X0 += ks1; X1 += ks2; \
X1 += 7; \
} \
if(Nrounds>28){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
if(Nrounds>29){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
if(Nrounds>30){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
if(Nrounds>31){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
if(Nrounds>31){ \
/* InjectKey(r=8) */ \
X0 += ks2; X1 += ks0; \
X1 += 8; \
} \
threefry2x##W##_ctr_t ret={{X0, X1}}; \
return ret; \
} \
/** @ingroup ThreefryNxW */ \
enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
}
#define _threefry4x_tpl(W) \
typedef struct r123array4x##W threefry4x##W##_ctr_t; \
typedef struct r123array4x##W threefry4x##W##_key_t; \
typedef struct r123array4x##W threefry4x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
uint##W##_t X0, X1, X2, X3; \
uint##W##_t ks0, ks1, ks2, ks3, ks4; \
R123_ASSERT(Nrounds<=72); \
ks4 = SKEIN_KS_PARITY##W; \
ks0 = k.v[0]; \
X0 = in.v[0] + ks0; \
ks4 ^= ks0; \
\
ks1 = k.v[1]; \
X1 = in.v[1] + ks1; \
ks4 ^= ks1; \
\
ks2 = k.v[2]; \
X2 = in.v[2] + ks2; \
ks4 ^= ks2; \
\
ks3 = k.v[3]; \
X3 = in.v[3] + ks3; \
ks4 ^= ks3; \
\
if(Nrounds>0){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>1){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>2){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>3){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>3){ \
/* InjectKey(r=1) */ \
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
X3 += 1; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>4){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>5){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>6){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>7){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>7){ \
/* InjectKey(r=2) */ \
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
X3 += 2; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>8){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>9){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>10){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>11){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>11){ \
/* InjectKey(r=3) */ \
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
X3 += 3; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>12){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>13){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>14){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>15){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>15){ \
/* InjectKey(r=1) */ \
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
X3 += 4; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>16){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>17){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>18){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>19){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>19){ \
/* InjectKey(r=1) */ \
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
X3 += 5; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>20){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>21){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>22){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>23){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>23){ \
/* InjectKey(r=1) */ \
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
X3 += 6; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>24){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>25){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>26){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>27){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>27){ \
/* InjectKey(r=1) */ \
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
X3 += 7; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>28){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>29){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>30){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>31){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>31){ \
/* InjectKey(r=1) */ \
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
X3 += 8; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>32){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>33){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>34){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>35){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>35){ \
/* InjectKey(r=1) */ \
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
X3 += 9; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>36){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>37){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>38){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>39){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>39){ \
/* InjectKey(r=1) */ \
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
X3 += 10; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>40){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>41){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>42){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>43){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>43){ \
/* InjectKey(r=1) */ \
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
X3 += 11; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>44){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>45){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>46){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>47){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>47){ \
/* InjectKey(r=1) */ \
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
X3 += 12; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>48){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>49){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>50){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>51){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>51){ \
/* InjectKey(r=1) */ \
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
X3 += 13; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>52){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>53){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>54){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>55){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>55){ \
/* InjectKey(r=1) */ \
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
X3 += 14; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>56){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>57){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>58){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>59){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>59){ \
/* InjectKey(r=1) */ \
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
X3 += 15; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>60){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>61){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>62){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>63){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>63){ \
/* InjectKey(r=1) */ \
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
X3 += 16; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>64){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
} \
if(Nrounds>65){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
} \
if(Nrounds>66){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
} \
if(Nrounds>67){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
} \
if(Nrounds>67){ \
/* InjectKey(r=1) */ \
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
X3 += 17; /* XWCNT4-1 += r */ \
} \
\
if(Nrounds>68){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
} \
if(Nrounds>69){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
} \
if(Nrounds>70){ \
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
} \
if(Nrounds>71){ \
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
} \
if(Nrounds>71){ \
/* InjectKey(r=1) */ \
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
X3 += 18; /* XWCNT4-1 += r */ \
} \
\
threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
return ret; \
} \
\
/** @ingroup ThreefryNxW */ \
enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
}
#if R123_USE_64BIT
_threefry2x_tpl(64)
_threefry4x_tpl(64)
#endif
_threefry2x_tpl(32)
_threefry4x_tpl(32)
/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
than a static inline function. Why? */
#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
#if defined(__cplusplus)
#define _threefryNxWclass_tpl(NxW) \
namespace r123{ \
template<unsigned int ROUNDS> \
struct Threefry##NxW##_R{ \
typedef threefry##NxW##_ctr_t ctr_type; \
typedef threefry##NxW##_key_t key_type; \
typedef threefry##NxW##_key_t ukey_type; \
static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \
return threefry##NxW##_R(ROUNDS, ctr, key); \
} \
}; \
typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
} // namespace r123
_threefryNxWclass_tpl(2x32)
_threefryNxWclass_tpl(4x32)
#if R123_USE_64BIT
_threefryNxWclass_tpl(2x64)
_threefryNxWclass_tpl(4x64)
#endif
/* The _tpl macros don't quite work to do string-pasting inside comments.
so we just write out the boilerplate documentation four times... */
/**
@defgroup ThreefryNxW Threefry Classes and Typedefs
The ThreefryNxW classes export the member functions, typedefs and
operator overloads required by a @ref CBRNG "CBRNG" class.
As described in
<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>,
the Threefry family is closely related to the Threefish block cipher from
<a href="http://www.skein-hash.info/"> Skein Hash Function</a>.
Threefry is \b not suitable for cryptographic use.
Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
@class r123::Threefry2x32_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=13 or more for Threefry2x32.
@typedef r123::Threefry2x32
@ingroup ThreefryNxW
Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds,
Threefry2x32 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Threefry2x64_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
In November 2011, the authors discovered that 13 rounds of
Threefry2x64 sequenced by strided, interleaved key and counter
increments failed a very long (longer than the default BigCrush
length) WeightDistrub test. At the same time, it was confirmed that
14 rounds passes much longer tests (up to 5x10^12 samples) of a
similar nature. The authors know of no statistical flaws with
ROUNDS=14 or more for Threefry2x64.
@typedef r123::Threefry2x64
@ingroup ThreefryNxW
Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds,
Threefry2x64 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Threefry4x32_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=12 or more for Threefry4x32.
@typedef r123::Threefry4x32
@ingroup ThreefryNxW
Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds,
Threefry4x32 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Threefry4x64_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=12 or more for Threefry4x64.
@typedef r123::Threefry4x64
@ingroup ThreefryNxW
Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds,
Threefry4x64 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
*/
#endif
#endif

File diff suppressed because it is too large Load diff

View file

@ -136,15 +136,13 @@ inline void HDFGetDatasetExtent( const std::string Filename, const std::string O
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
hsize_t *dimsize = new hsize_t[ndims];
std::vector<hsize_t> dimsize(ndims,0);
H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );
Extent.clear();
for(int i=0; i<ndims; ++i )
Extent.push_back( dimsize[i] );
delete[] dimsize;
H5Sclose( HDF_DataspaceID );
H5Dclose( HDF_DatasetID );
@ -323,8 +321,8 @@ inline void HDFReadVectorSelect( const std::string Filename, const std::string O
//... get space associated with dataset and its extensions
HDF_DataspaceID = H5Dget_space( HDF_DatasetID );
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
hsize_t dimsize[ndims];
H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
std::vector<hsize_t> dimsize(ndims,0);
H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );
hsize_t block[2];
block[0] = ii.size();
@ -601,9 +599,9 @@ inline void HDFReadGroupAttribute( const std::string Filename, const std::string
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
hsize_t dimsize[ndims];
std::vector<hsize_t> dimsize(ndims,0);
H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );
HDF_StorageSize = 1;
for(int i=0; i<ndims; ++i )

View file

@ -1,8 +1,10 @@
#pragma once
constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
#define USE_PRECISION_${CODE_PRECISION}
#ifdef __cplusplus
constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
#if defined(USE_PRECISION_FLOAT)
constexpr char CMAKE_PRECISION_STR[] = "single";
#elif defined(USE_PRECISION_DOUBLE)
@ -31,4 +33,6 @@ extern "C"
extern const char *GIT_TAG;
extern const char *GIT_REV;
extern const char *GIT_BRANCH;
}
}
#endif // __cplusplus

View file

@ -216,6 +216,10 @@ namespace particle
const size_t num_p_in_load = field.local_size();
const real_t pmeanmass = munit / real_t(field.global_size()* overload);
bool bmass_negative = false;
auto mean_pm = field.mean() * pmeanmass;
auto std_pm = field.std() * pmeanmass;
for (int ishift = 0; ishift < (1 << lattice_type); ++ishift)
{
// if we are dealing with the secondary lattice, apply a global shift
@ -237,18 +241,25 @@ namespace particle
{
for (size_t k = 0; k < field.size(2); ++k)
{
if (b64reals)
{
particles_.set_mass64(ipcount++, pmeanmass * field.relem(i, j, k));
}
else
{
particles_.set_mass32(ipcount++, pmeanmass * field.relem(i, j, k));
}
// get
const auto pmass = pmeanmass * field.relem(i, j, k);
// check for negative mass
bmass_negative |= pmass<0.0;
// set
if (b64reals) particles_.set_mass64(ipcount++, pmass);
else particles_.set_mass32(ipcount++, pmass);
}
}
}
}
// diagnostics
music::ilog << "Particle Mass : mean/munit = " << mean_pm/munit << " ; fractional RMS = " << std_pm / mean_pm * 100.0 << "%" << std::endl;
if(std_pm / mean_pm > 0.1 ) music::wlog << "Particle mass perturbation larger than 10%, consider decreasing \n\t the starting redshift or disabling baryon decaying modes." << std::endl;
if(bmass_negative) music::elog << "Negative particle mass produced! Decrease the starting \n\t redshift or disable baryon decaying modes!" << std::endl;
}else{
// should not happen
music::elog << "Cannot have individual particle masses for glasses!" << std::endl;

View file

@ -591,6 +591,10 @@ int run( config_file& the_config )
}
}
}
#if defined(USE_MPI)
real_t local_maxdphi = maxdphi;
MPI_Allreduce( &local_maxdphi, &maxdphi, 1, MPI::get_datatype<real_t>(), MPI_MAX, MPI_COMM_WORLD );
#endif
const real_t hbar_safefac = 1.01;
const real_t hbar = maxdphi / M_PI / Dplus0 * hbar_safefac;
music::ilog << "Semiclassical PT : hbar = " << hbar << " (limited by initial potential, safety=" << hbar_safefac << ")." << std::endl;
@ -832,6 +836,9 @@ int run( config_file& the_config )
}
}
music::ilog << "-------------------------------------------------------------------------------" << std::endl;
}
return 0;
}

498
src/plugins/PANPHASIA.hh Normal file
View file

@ -0,0 +1,498 @@
#pragma once
namespace PANPHASIA1
{
const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim;
typedef int rand_offset_[5];
typedef struct
{
int state[133]; // Nstore = Nstate (=5) + Nbatch (=128)
int need_fill;
int pos;
} rand_state_;
typedef struct
{
int base_state[5], base_lev_start[5][maxdim + 1];
rand_offset_ poweroffset[maxpow + 1], superjump;
rand_state_ current_state[maxpow + 2];
int layer_min, layer_max, indep_field;
long long xorigin_store[2][2][2], yorigin_store[2][2][2], zorigin_store[2][2][2];
int lev_common, layer_min_store, layer_max_store;
long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store,
iy_rel_store, iz_rel_store;
double exp_coeffs[8][8][maxdim + 2];
long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1];
int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2];
double cell_data[9][8];
int ixh_last, iyh_last, izh_last;
int init;
int init_cell_props;
int init_lecuyer_state;
long long p_xcursor[62], p_ycursor[62], p_zcursor[62];
} pan_state_;
extern "C"
{
void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose);
void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1,
int16_t *side2, int16_t *side3, int32_t *check_int, char *name);
void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop);
void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min,
int *layer_max, int *indep_field, double *cell_prop);
void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel,
long long *iy_rel, long long *iz_rel, int *VERBOSE);
}
struct RNG
{
config_file *pcf_;
struct panphasia_descriptor
{
int16_t wn_level_base;
int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base;
int16_t i_base, i_base_y, i_base_z;
int32_t check_rand;
std::string name;
explicit panphasia_descriptor(std::string dstring)
{
char tmp[100];
std::memset(tmp, ' ', 100);
parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, &i_yorigin_base, &i_zorigin_base, &i_base,
&i_base_y, &i_base_z, &check_rand, tmp);
for (int i = 0; i < 100; i++)
if (tmp[i] == ' ')
{
tmp[i] = '\0';
break;
}
name = tmp;
name.erase(std::remove(name.begin(), name.end(), ' '), name.end());
}
};
// greatest common divisor
int gcd(int a, int b)
{
if (b == 0)
return a;
return gcd(b, a % b);
}
// least common multiple
int lcm(int a, int b) { return abs(a * b) / gcd(a, b); }
// Two or largest power of 2 less than the argument
int largest_power_two_lte(int b)
{
int a = 1;
if (b <= a)
return a;
while (2 * a < b)
a = 2 * a;
return a;
}
std::string descriptor_string_;
int num_threads_;
int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_;
bool incongruent_fields_;
double inter_grid_phase_adjustment_;
// double translation_phase_;
pan_state_ *lstate;
int grid_p_;
int coordinate_system_shift_[3];
int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
void clear_panphasia_thread_states(void)
{
for (int i = 0; i < num_threads_; ++i)
{
lstate[i].init = 0;
lstate[i].init_cell_props = 0;
lstate[i].init_lecuyer_state = 0;
}
}
void initialize_for_grid_structure(void)
{
// if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
int ngridminsize_panphasia = pcf_->get_value_safe<int>("random", "PanphasiaMinRootResolution", 512);
grid_p_ = pdescriptor_->i_base;
lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0);
// lmin
ngrid_panphasia_ = (1 << lextra_) * grid_p_;
while (ngrid_panphasia_ < ngridminsize_panphasia)
{
lextra_++;
ngrid_panphasia_ *= 2;
}
assert(ngrid_panphasia_ >= ngridminsize_panphasia);
clear_panphasia_thread_states();
music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)", ngrid_panphasia_, lextra_);
if (ngridminsize_panphasia < 512)
music::ilog.Print("PANPHASIA WARNING: PanphasiaMinRootResolution = %d below minimum recommended of 512", ngridminsize_panphasia);
music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, ngrid_panphasia_);
coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
}
std::unique_ptr<panphasia_descriptor> pdescriptor_;
RNG(config_file *pcf)
: pcf_(pcf)
{
descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
#ifdef _OPENMP
num_threads_ = omp_get_max_threads();
#else
num_threads_ = 1;
#endif
// create independent state descriptions for each thread
lstate = new pan_state_[num_threads_];
// parse the descriptor for its properties
pdescriptor_ = std::make_unique<panphasia_descriptor>(descriptor_string_);
music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base);
// write panphasia base size into config file for the grid construction
// as the gridding unit we use the least common multiple of 2 and i_base
std::stringstream ss;
//ARJ ss << lcm(2, pdescriptor_->i_base);
//ss << two_or_largest_power_two_less_than(pdescriptor_->i_base);//ARJ
ss << 2; //ARJ - set gridding unit to two
pcf_->insert_value("setup", "gridding_unit", ss.str());
ss.str(std::string());
ss << pdescriptor_->i_base;
pcf_->insert_value("random", "base_unit", ss.str());
this->initialize_for_grid_structure();
}
~RNG() { delete[] lstate; }
void Fill(Grid_FFT<real_t> &g)
{
auto sinc = [](real_t x)
{ return (std::fabs(x) > 1e-16) ? std::sin(x) / x : 1.0; };
auto dsinc = [](real_t x)
{ return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
// we will overwrite 'g', we can deallocate it while we prepare the panphasia field
g.reset();
clear_panphasia_thread_states();
// temporaries
Grid_FFT<real_t> g0({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
Grid_FFT<real_t> g1({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
Grid_FFT<real_t> g2({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
Grid_FFT<real_t> g3({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
Grid_FFT<real_t> g4({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
double t1 = get_wtime();
// double tp = t1;
#pragma omp parallel
{
#ifdef _OPENMP
const int mythread = omp_get_thread_num();
#else
const int mythread = 0;
#endif
//int odd_x, odd_y, odd_z;
//int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level
int verbosity = (mythread == 0);
char descriptor[100];
std::memset(descriptor, 0, 100);
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
{
panphasia_descriptor d(descriptor_string_);
int level_p = d.wn_level_base + lextra_;
lstate[mythread].layer_min = 0;
lstate[mythread].layer_max = level_p;
lstate[mythread].indep_field = 1;
long long ix_rel[3];
ix_rel[0] = 0; //ileft_corner_p[0];
ix_rel[1] = 0; //ileft_corner_p[1];
ix_rel[2] = 0; //ileft_corner_p[2];
set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
&verbosity);
}
if (verbosity)
t1 = get_wtime();
std::array<double, 9> cell_prop;
pan_state_ *ps = &lstate[mythread];
#pragma omp for //nowait
for (size_t i = 0; i < g0.size(0); i += 2)
{
const int ixmax(std::min<int>(2, g0.size(0) - i));
for (size_t j = 0; j < g0.size(1); j += 2)
{
const int iymax(std::min<int>(2, g0.size(1) - j));
for (size_t k = 0; k < g0.size(2); k += 2)
{
const int izmax(std::min<int>(2, g0.size(2) - k));
// ARJ - added inner set of loops to speed up evaluation of Panphasia
for (int ix = 0; ix < ixmax; ++ix)
{
for (int iy = 0; iy < iymax; ++iy)
{
for (int iz = 0; iz < izmax; ++iz)
{
int ilocal = i + ix;
int jlocal = j + iy;
int klocal = k + iz;
int iglobal = ilocal + g0.local_0_start_;
int jglobal = jlocal;
int kglobal = klocal;
adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
&ps->layer_max, &ps->indep_field, &cell_prop[0]);
g0.relem(ilocal, jlocal, klocal) = cell_prop[0];
g1.relem(ilocal, jlocal, klocal) = cell_prop[4];
g2.relem(ilocal, jlocal, klocal) = cell_prop[2];
g3.relem(ilocal, jlocal, klocal) = cell_prop[1];
g4.relem(ilocal, jlocal, klocal) = cell_prop[8];
}
}
}
}
}
}
} // end omp parallel region
g0.FourierTransformForward();
g1.FourierTransformForward();
g2.FourierTransformForward();
g3.FourierTransformForward();
g4.FourierTransformForward();
#pragma omp parallel for
for (size_t i = 0; i < g0.size(0); i++)
{
for (size_t j = 0; j < g0.size(1); j++)
{
for (size_t k = 0; k < g0.size(2); k++)
{
if (!g0.is_nyquist_mode(i, j, k))
{
auto kvec = g0.get_k<real_t>(i, j, k);
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
auto fx = real_t(sinc(argx));
auto gx = ccomplex_t(0.0, dsinc(argx));
auto fy = real_t(sinc(argy));
auto gy = ccomplex_t(0.0, dsinc(argy));
auto fz = real_t(sinc(argz));
auto gz = ccomplex_t(0.0, dsinc(argz));
auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz);
auto magnitude = real_t(std::sqrt(1.0 - std::fabs(temp * temp)));
auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
g0.kelem(i, j, k) = y0 * fx * fy * fz + sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz) + y4 * magnitude;
}
else
{
g0.kelem(i, j, k) = 0.0;
}
}
}
}
// music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp);
// tp = get_wtime();
g1.FourierTransformBackward(false);
g2.FourierTransformBackward(false);
g3.FourierTransformBackward(false);
g4.FourierTransformBackward(false);
#pragma omp parallel
{
#ifdef _OPENMP
const int mythread = omp_get_thread_num();
#else
const int mythread = 0;
#endif
// int odd_x, odd_y, odd_z;
int verbosity = (mythread == 0);
char descriptor[100];
std::memset(descriptor, 0, 100);
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
{
panphasia_descriptor d(descriptor_string_);
int level_p = d.wn_level_base + lextra_;
lstate[mythread].layer_min = 0;
lstate[mythread].layer_max = level_p;
lstate[mythread].indep_field = 1;
long long ix_rel[3];
ix_rel[0] = 0; //ileft_corner_p[0];
ix_rel[1] = 0; //ileft_corner_p[1];
ix_rel[2] = 0; //ileft_corner_p[2];
set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
&verbosity);
}
if (verbosity)
t1 = get_wtime();
//***************************************************************
// Process Panphasia values: p110, p011, p101, p111
//****************************************************************
std::array<double, 9> cell_prop;
pan_state_ *ps = &lstate[mythread];
#pragma omp for //nowait
for (size_t i = 0; i < g1.size(0); i += 2)
{
const int ixmax(std::min<int>(2, g1.size(0) - i));
for (size_t j = 0; j < g1.size(1); j += 2)
{
const int iymax(std::min<int>(2, g1.size(1) - j));
for (size_t k = 0; k < g1.size(2); k += 2)
{
const int izmax(std::min<int>(2, g1.size(2) - k));
// ARJ - added inner set of loops to speed up evaluation of Panphasia
for (int ix = 0; ix < ixmax; ++ix)
{
for (int iy = 0; iy < iymax; ++iy)
{
for (int iz = 0; iz < izmax; ++iz)
{
int ilocal = i + ix;
int jlocal = j + iy;
int klocal = k + iz;
int iglobal = ilocal + g1.local_0_start_;
int jglobal = jlocal;
int kglobal = klocal;
adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
&ps->layer_max, &ps->indep_field, &cell_prop[0]);
g1.relem(ilocal, jlocal, klocal) = cell_prop[6];
g2.relem(ilocal, jlocal, klocal) = cell_prop[3];
g3.relem(ilocal, jlocal, klocal) = cell_prop[5];
g4.relem(ilocal, jlocal, klocal) = cell_prop[7];
}
}
}
}
}
}
} // end omp parallel region
// music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp);
// tp = get_wtime();
/////////////////////////////////////////////////////////////////////////
// transform and convolve with Legendres
g1.FourierTransformForward();
g2.FourierTransformForward();
g3.FourierTransformForward();
g4.FourierTransformForward();
#pragma omp parallel for
for (size_t i = 0; i < g0.size(0); i++)
{
for (size_t j = 0; j < g0.size(1); j++)
{
for (size_t k = 0; k < g0.size(2); k++)
{
if (!g0.is_nyquist_mode(i, j, k))
{
auto kvec = g0.get_k<real_t>(i, j, k);
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
auto fx = real_t(sinc(argx));
auto gx = ccomplex_t(0.0, dsinc(argx));
auto fy = real_t(sinc(argy));
auto gy = ccomplex_t(0.0, dsinc(argy));
auto fz = real_t(sinc(argz));
auto gz = ccomplex_t(0.0, dsinc(argz));
auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
// do final phase shift to account for corner centered coordinates vs. cell centers
auto phase_shift = -0.5 * M_PI * (kvec[0] / g0.kny_[0] + kvec[1] / g0.kny_[1] + kvec[2] / g0.kny_[2]);
g0.kelem(i, j, k) *= std::exp(ccomplex_t(0, phase_shift));
}
}
}
}
g1.reset();
g2.reset();
g3.reset();
g4.reset();
g.allocate();
g0.FourierInterpolateCopyTo(g);
music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std());
}
};
};

View file

@ -51,7 +51,7 @@ protected:
real_t lunit_, vunit_, munit_, omegab_;
uint32_t levelmin_;
bool bhavebaryons_;
std::vector<float> data_buf_;
std::vector<float> data_buf_, data_buf_write_;
std::string dirname_;
bool bUseSPT_;
@ -207,52 +207,59 @@ void grafic2_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos
std::string file_name = this->get_file_name(s, c);
// serialize parallel write
for (int write_rank = 0; write_rank < CONFIG::MPI_task_size; ++write_rank)
if (CONFIG::MPI_task_rank == 0)
{
if (write_rank == CONFIG::MPI_task_rank)
unlink(file_name.c_str());
}
std::ofstream *pofs;
// write header or seek to end of file
if (CONFIG::MPI_task_rank == 0)
{
pofs = new std::ofstream(file_name.c_str(), std::ios::binary|std::ios::app);
uint32_t blocksz = sizeof(header);
pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
pofs->write(reinterpret_cast<const char *>(&header_), blocksz);
pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
}
// check field size against buffer size...
uint32_t ngrid = cf_.get_value<int>("setup", "GridRes");
assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid);
assert( g.size(1) == ngrid && g.size(2) == ngrid);
// write actual field slice by slice
// std::cerr << write_rank << ">" << g.size(0) << " " << g.size(1) << " " << g.size(2) << std::endl;
for (size_t i = 0; i < g.size(2); ++i)
{
data_buf_.assign(ngrid * ngrid, 0.0f);
for (unsigned j = 0; j < g.size(1); ++j)
{
if (write_rank == 0)
for (unsigned k = 0; k < g.size(0); ++k)
{
unlink(file_name.c_str());
data_buf_[j * ngrid + (k+g.local_0_start_)] = g.relem(k, j, i);
}
std::ofstream ofs(file_name.c_str(), std::ios::binary|std::ios::app);
// write header or seek to end of file
if (write_rank == 0)
{
uint32_t blocksz = sizeof(header);
ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
ofs.write(reinterpret_cast<const char *>(&header_), blocksz);
ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
}
// check field size against buffer size...
uint32_t ngrid = cf_.get_value<int>("setup", "GridRes");
assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid);
assert( g.size(1) == ngrid && g.size(2) == ngrid);
// write actual field slice by slice
for (size_t i = 0; i < g.size(2); ++i)
{
for (unsigned j = 0; j < g.size(1); ++j)
{
for (unsigned k = 0; k < g.size(0); ++k)
{
data_buf_[j * ngrid + k] = g.relem(k, j, i);
}
}
uint32_t blocksz = ngrid * ngrid * sizeof(float);
ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
ofs.write(reinterpret_cast<const char *>(&data_buf_[0]), blocksz);
ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
}
ofs.close();
}
#if defined(USE_MPI)
if( CONFIG::MPI_task_rank == 0 ) data_buf_write_.assign(ngrid*ngrid,0.0f);
MPI_Reduce( &data_buf_[0], &data_buf_write_[0], ngrid*ngrid, MPI::get_datatype<float>(), MPI_SUM, 0, MPI_COMM_WORLD );
if( CONFIG::MPI_task_rank == 0 ) data_buf_.swap(data_buf_write_);
#endif
multitask_sync_barrier();
if( CONFIG::MPI_task_rank == 0 )
{
uint32_t blocksz = ngrid * ngrid * sizeof(float);
pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
pofs->write(reinterpret_cast<const char *>(&data_buf_[0]), blocksz);
pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
}
}
} // end loop over write_rank
if( CONFIG::MPI_task_rank == 0 ){
pofs->close();
delete pofs;
}
music::ilog << interface_name_ << " : Wrote field to file \'" << file_name << "\'" << std::endl;
}

View file

@ -1,18 +1,18 @@
// This file is part of monofonIC (MUSIC2)
// A software package to generate ICs for cosmological simulations
// Copyright (C) 2020 by Oliver Hahn and Adrian Jenkins (this file)
// Copyright (C) 2021 by Oliver Hahn and Adrian Jenkins (this file)
// but see distinct licensing for PANPHASIA below
//
//
// monofonIC is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
//
// monofonIC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
@ -39,108 +39,16 @@
#include <grid_fft.hh>
const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim;
typedef int rand_offset_[5];
typedef struct
namespace PANPHASIA2
{
int state[133]; // Nstore = Nstate (=5) + Nbatch (=128)
int need_fill;
int pos;
} rand_state_;
/* pan_state_ struct -- corresponds to respective fortran module in panphasia_routines.f
* data structure that contains all panphasia state variables
* it needs to get passed between the fortran routines to enable
* thread-safe execution.
*/
typedef struct
{
int base_state[5], base_lev_start[5][maxdim + 1];
rand_offset_ poweroffset[maxpow + 1], superjump;
rand_state_ current_state[maxpow + 2];
int layer_min, layer_max, indep_field;
long long xorigin_store[2][2][2], yorigin_store[2][2][2], zorigin_store[2][2][2];
int lev_common, layer_min_store, layer_max_store;
long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store,
iy_rel_store, iz_rel_store;
double exp_coeffs[8][8][maxdim + 2];
long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1];
int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2];
double cell_data[9][8];
int ixh_last, iyh_last, izh_last;
int init;
int init_cell_props;
int init_lecuyer_state;
long long p_xcursor[62], p_ycursor[62], p_zcursor[62];
} pan_state_;
extern "C"
{
void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose);
void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1,
int16_t *side2, int16_t *side3, int32_t *check_int, char *name);
void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop);
void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min,
int *layer_max, int *indep_field, double *cell_prop);
void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel,
long long *iy_rel, long long *iz_rel, int *VERBOSE);
}
struct panphasia_descriptor
{
int16_t wn_level_base;
int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base;
int16_t i_base, i_base_y, i_base_z;
int32_t check_rand;
std::string name;
explicit panphasia_descriptor(std::string dstring)
extern "C"
{
char tmp[100];
std::memset(tmp, ' ', 100);
parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, &i_yorigin_base, &i_zorigin_base, &i_base,
&i_base_y, &i_base_z, &check_rand, tmp);
for (int i = 0; i < 100; i++)
if (tmp[i] == ' ')
{
tmp[i] = '\0';
break;
}
name = tmp;
name.erase(std::remove(name.begin(), name.end(), ' '), name.end());
#include <panphasia_ho/panphasia_functions.h>
extern size_t descriptor_base_size;
}
};
// greatest common divisor
int gcd(int a, int b)
{
if (b == 0)
return a;
return gcd(b, a % b);
}
// least common multiple
int lcm(int a, int b) { return abs(a * b) / gcd(a, b); }
// Two or largest power of 2 less than the argument
int largest_power_two_lte(int b)
{
int a = 1;
if (b <= a)
return a;
while (2 * a < b)
a = 2 * a;
return a;
}
#include "PANPHASIA.hh"
class RNG_panphasia : public RNG_plugin
{
@ -148,63 +56,15 @@ private:
protected:
std::string descriptor_string_;
int num_threads_;
int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_;
bool incongruent_fields_;
double inter_grid_phase_adjustment_;
// double translation_phase_;
pan_state_ *lstate;
int grid_p_;
int coordinate_system_shift_[3];
int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
int panphasia_mode_;
size_t grid_res_;
real_t boxlength_;
void clear_panphasia_thread_states(void)
{
for (int i = 0; i < num_threads_; ++i)
{
lstate[i].init = 0;
lstate[i].init_cell_props = 0;
lstate[i].init_lecuyer_state = 0;
}
}
void initialize_for_grid_structure(void)
{
// if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
int ngridminsize_panphasia = pcf_->get_value_safe<int>("random", "PanphasiaMinRootResolution",512);
grid_p_ = pdescriptor_->i_base;
lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0);
// lmin
ngrid_panphasia_ = (1 << lextra_) * grid_p_;
while( ngrid_panphasia_ < ngridminsize_panphasia ){
lextra_++;
ngrid_panphasia_*=2;
}
assert( ngrid_panphasia_ >= ngridminsize_panphasia);
clear_panphasia_thread_states();
music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)",ngrid_panphasia_, lextra_);
if (ngridminsize_panphasia<512)
music::ilog.Print("PANPHASIA WARNING: PanphasiaMinRootResolution = %d below minimum recommended of 512",ngridminsize_panphasia);
music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, ngrid_panphasia_ );
coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
}
std::unique_ptr<panphasia_descriptor> pdescriptor_;
PANPHASIA1::RNG *ppan1_rng_;
public:
explicit RNG_panphasia(config_file &cf) : RNG_plugin(cf)
{
descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
#ifdef _OPENMP
num_threads_ = omp_get_max_threads();
@ -212,336 +72,120 @@ public:
num_threads_ = 1;
#endif
// create independent state descriptions for each thread
lstate = new pan_state_[num_threads_];
descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
grid_res_ = pcf_->get_value<size_t>("setup", "GridRes");
boxlength_ = pcf_->get_value<real_t>("setup", "BoxLength");
// parse the descriptor for its properties
pdescriptor_ = std::make_unique<panphasia_descriptor>(descriptor_string_);
if( pcf_->get_value<bool>("setup", "DoFixing") ){
music::flog << "Fixing all the modes to the mean power negates any advantage of using the Panphasia field.\n";
music::flog << "With the new panphasia ho it is possible by choosing the descriptor to fix the largest modes without losing the ability to resimulate to much higher resolution.\n";
throw std::runtime_error("PANPHASIA: incompatible parameter.");
}
music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base);
panphasia_mode_ = 0;
PANPHASIA2::parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_);
// write panphasia base size into config file for the grid construction
// as the gridding unit we use the least common multiple of 2 and i_base
std::stringstream ss;
//ARJ ss << lcm(2, pdescriptor_->i_base);
//ss << two_or_largest_power_two_less_than(pdescriptor_->i_base);//ARJ
ss << 2; //ARJ - set gridding unit to two
pcf_->insert_value("setup", "gridding_unit", ss.str());
ss.str(std::string());
ss << pdescriptor_->i_base;
pcf_->insert_value("random", "base_unit", ss.str());
this->initialize_for_grid_structure();
if (panphasia_mode_ == 0)
{
ppan1_rng_ = new PANPHASIA1::RNG(&cf);
}
}
~RNG_panphasia() { delete[] lstate; }
~RNG_panphasia()
{
if (panphasia_mode_ == 0)
{
delete ppan1_rng_;
}
}
bool isMultiscale() const { return true; }
void Run_Panphasia_Highorder(Grid_FFT<real_t> &g);
void Fill_Grid(Grid_FFT<real_t> &g)
{
auto sinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? std::sin(x) / x : 1.0; };
auto dsinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
// we will overwrite 'g', we can deallocate it while we prepare the panphasia field
g.reset();
clear_panphasia_thread_states();
// temporaries
Grid_FFT<real_t> g0({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
Grid_FFT<real_t> g1({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
Grid_FFT<real_t> g2({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
Grid_FFT<real_t> g3({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
Grid_FFT<real_t> g4({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
double t1 = get_wtime();
// double tp = t1;
#pragma omp parallel
switch (panphasia_mode_)
{
#ifdef _OPENMP
const int mythread = omp_get_thread_num();
#else
const int mythread = 0;
#endif
//int odd_x, odd_y, odd_z;
//int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level
case 0: // old mode
music::ilog << "PANPHASIA: Old descriptor" << std::endl;
ppan1_rng_->Fill(g);
break;
int verbosity = (mythread == 0);
char descriptor[100];
std::memset(descriptor, 0, 100);
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
case 1: // PANPHASIA HO descriptor
music::ilog << "PANPHASIA: New descriptor" << std::endl;
this->Run_Panphasia_Highorder(g);
break;
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
{
panphasia_descriptor d(descriptor_string_);
int level_p = d.wn_level_base + lextra_;
lstate[mythread].layer_min = 0;
lstate[mythread].layer_max = level_p;
lstate[mythread].indep_field = 1;
long long ix_rel[3];
ix_rel[0] = 0; //ileft_corner_p[0];
ix_rel[1] = 0; //ileft_corner_p[1];
ix_rel[2] = 0; //ileft_corner_p[2];
set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
&verbosity);
}
if (verbosity)
t1 = get_wtime();
std::array<double, 9> cell_prop;
pan_state_ *ps = &lstate[mythread];
#pragma omp for //nowait
for (size_t i = 0; i < g0.size(0); i += 2)
{
const int ixmax(std::min<int>(2,g0.size(0)-i));
for (size_t j = 0; j < g0.size(1); j += 2)
{
const int iymax(std::min<int>(2,g0.size(1)-j));
for (size_t k = 0; k < g0.size(2); k += 2)
{
const int izmax(std::min<int>(2,g0.size(2)-k));
// ARJ - added inner set of loops to speed up evaluation of Panphasia
for (int ix = 0; ix < ixmax; ++ix)
{
for (int iy = 0; iy < iymax; ++iy)
{
for (int iz = 0; iz < izmax; ++iz)
{
int ilocal = i + ix;
int jlocal = j + iy;
int klocal = k + iz;
int iglobal = ilocal + g0.local_0_start_;
int jglobal = jlocal;
int kglobal = klocal;
adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
&ps->layer_max, &ps->indep_field, &cell_prop[0]);
g0.relem(ilocal, jlocal, klocal) = cell_prop[0];
g1.relem(ilocal, jlocal, klocal) = cell_prop[4];
g2.relem(ilocal, jlocal, klocal) = cell_prop[2];
g3.relem(ilocal, jlocal, klocal) = cell_prop[1];
g4.relem(ilocal, jlocal, klocal) = cell_prop[8];
}
}
}
}
}
}
} // end omp parallel region
g0.FourierTransformForward();
g1.FourierTransformForward();
g2.FourierTransformForward();
g3.FourierTransformForward();
g4.FourierTransformForward();
#pragma omp parallel for
for (size_t i = 0; i < g0.size(0); i++)
{
for (size_t j = 0; j < g0.size(1); j++)
{
for (size_t k = 0; k < g0.size(2); k++)
{
if (!g0.is_nyquist_mode(i, j, k))
{
auto kvec = g0.get_k<real_t>(i, j, k);
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
auto fx = real_t(sinc(argx));
auto gx = ccomplex_t(0.0, dsinc(argx));
auto fy = real_t(sinc(argy));
auto gy = ccomplex_t(0.0, dsinc(argy));
auto fz = real_t(sinc(argz));
auto gz = ccomplex_t(0.0, dsinc(argz));
auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz);
auto magnitude = real_t(std::sqrt(1.0 - std::fabs(temp * temp)));
auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
g0.kelem(i, j, k) = y0 * fx * fy * fz
+ sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz)
+ y4 * magnitude;
}
else
{
g0.kelem(i, j, k) = 0.0;
}
}
}
default: // unknown PANPHASIA mode
music::elog << "PANPHASIA: Something went wrong with descriptor" << std::endl;
abort();
break;
}
// music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp);
// tp = get_wtime();
g1.FourierTransformBackward(false);
g2.FourierTransformBackward(false);
g3.FourierTransformBackward(false);
g4.FourierTransformBackward(false);
#pragma omp parallel
{
#ifdef _OPENMP
const int mythread = omp_get_thread_num();
#else
const int mythread = 0;
#endif
// int odd_x, odd_y, odd_z;
int verbosity = (mythread == 0);
char descriptor[100];
std::memset(descriptor, 0, 100);
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
{
panphasia_descriptor d(descriptor_string_);
int level_p = d.wn_level_base + lextra_;
lstate[mythread].layer_min = 0;
lstate[mythread].layer_max = level_p;
lstate[mythread].indep_field = 1;
long long ix_rel[3];
ix_rel[0] = 0; //ileft_corner_p[0];
ix_rel[1] = 0; //ileft_corner_p[1];
ix_rel[2] = 0; //ileft_corner_p[2];
set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
&verbosity);
}
if (verbosity)
t1 = get_wtime();
//***************************************************************
// Process Panphasia values: p110, p011, p101, p111
//****************************************************************
std::array<double,9> cell_prop;
pan_state_ *ps = &lstate[mythread];
#pragma omp for //nowait
for (size_t i = 0; i < g1.size(0); i += 2)
{
const int ixmax(std::min<int>(2,g1.size(0)-i));
for (size_t j = 0; j < g1.size(1); j += 2)
{
const int iymax(std::min<int>(2,g1.size(1)-j));
for (size_t k = 0; k < g1.size(2); k += 2)
{
const int izmax(std::min<int>(2,g1.size(2)-k));
// ARJ - added inner set of loops to speed up evaluation of Panphasia
for (int ix = 0; ix < ixmax; ++ix)
{
for (int iy = 0; iy < iymax; ++iy)
{
for (int iz = 0; iz < izmax; ++iz)
{
int ilocal = i + ix;
int jlocal = j + iy;
int klocal = k + iz;
int iglobal = ilocal + g1.local_0_start_;
int jglobal = jlocal;
int kglobal = klocal;
adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
&ps->layer_max, &ps->indep_field, &cell_prop[0]);
g1.relem(ilocal, jlocal, klocal) = cell_prop[6];
g2.relem(ilocal, jlocal, klocal) = cell_prop[3];
g3.relem(ilocal, jlocal, klocal) = cell_prop[5];
g4.relem(ilocal, jlocal, klocal) = cell_prop[7];
}
}
}
}
}
}
} // end omp parallel region
// music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp);
// tp = get_wtime();
/////////////////////////////////////////////////////////////////////////
// transform and convolve with Legendres
g1.FourierTransformForward();
g2.FourierTransformForward();
g3.FourierTransformForward();
g4.FourierTransformForward();
#pragma omp parallel for
for (size_t i = 0; i < g0.size(0); i++)
{
for (size_t j = 0; j < g0.size(1); j++)
{
for (size_t k = 0; k < g0.size(2); k++)
{
if (!g0.is_nyquist_mode(i, j, k))
{
auto kvec = g0.get_k<real_t>(i, j, k);
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
auto fx = real_t(sinc(argx));
auto gx = ccomplex_t(0.0, dsinc(argx));
auto fy = real_t(sinc(argy));
auto gy = ccomplex_t(0.0, dsinc(argy));
auto fz = real_t(sinc(argz));
auto gz = ccomplex_t(0.0, dsinc(argz));
auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
// do final phase shift to account for corner centered coordinates vs. cell centers
auto phase_shift = - 0.5 * M_PI * ( kvec[0] / g0.kny_[0]
+ kvec[1] /g0.kny_[1] + kvec[2] / g0.kny_[2]);
g0.kelem(i, j, k) *= std::exp( ccomplex_t(0,phase_shift) );
}
}
}
}
g1.reset();
g2.reset();
g3.reset();
g4.reset();
g.allocate();
g0.FourierInterpolateCopyTo( g );
music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std());
}
};
void RNG_panphasia::Run_Panphasia_Highorder(Grid_FFT<real_t> &g)
{
int verbose = 0;
int error;
size_t x0 = 0, y0 = 0, z0 = 0;
size_t rel_level;
int fdim = 2; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid
//char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
PANPHASIA2::PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose);
printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_);
// Choose smallest value of level to equal of exceed grid_res_)
for (rel_level = 0; fdim * (PANPHASIA2::descriptor_base_size <<rel_level) < grid_res_; rel_level++);
printf("Setting relative level = %lu\n", rel_level);
if ((error = PANPHASIA2::PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose)))
{
printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
abort();
};
//======================= FFTW ==============================
ptrdiff_t alloc_local, local_n0, local_0_start;
size_t N0 = fdim * (PANPHASIA2::descriptor_base_size << rel_level);
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 + 2, MPI_COMM_WORLD, &local_n0, &local_0_start);
Grid_FFT<real_t> pan_grid({{N0, N0, N0}}, {{boxlength_, boxlength_, boxlength_}});
_unused(alloc_local);
assert(pan_grid.n_[0] == N0);
assert(pan_grid.n_[1] == N0);
assert(pan_grid.n_[2] == N0);
assert(pan_grid.local_0_start_ == local_0_start);
assert(pan_grid.local_0_size_ == local_n0);
assert(pan_grid.ntot_ == size_t(alloc_local));
pan_grid.FourierTransformForward(false);
FFTW_COMPLEX *Panphasia_White_Noise_Field = reinterpret_cast<FFTW_COMPLEX *>(&pan_grid.data_[0]);
// Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
if ((error = PANPHASIA2::PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field)))
{
music::elog << "Error code from PANPHASIA_compute ... (ErrCode = " << error << ")" << std::endl;
};
pan_grid.FourierInterpolateCopyTo(g);
}
namespace
{
RNG_plugin_creator_concrete<RNG_panphasia> creator("PANPHASIA");
}
#endif // defined(USE_PANPHASIA)
#endif // defined(USE_PANPHASIA)