mirror of
https://github.com/cosmo-sims/monofonIC.git
synced 2024-09-19 17:03:45 +02:00
Merged in develop (pull request #24)
Merge Panphasia development into master branch
This commit is contained in:
commit
114b81051e
35 changed files with 10250 additions and 529 deletions
|
@ -149,8 +149,11 @@ elseif("${CMAKE_Fortran_COMPILER_ID}" MATCHES "GNU")
|
||||||
endif()
|
endif()
|
||||||
endif(ENABLE_PANPHASIA)
|
endif(ENABLE_PANPHASIA)
|
||||||
########################################################################################################################
|
########################################################################################################################
|
||||||
|
# PANPHASIA HO (High-Order, new version)
|
||||||
|
# option(ENABLE_PANPHASIA_HO "Enable PANPHASIA-HO random number generator" ON)
|
||||||
|
########################################################################################################################
|
||||||
# INCLUDES
|
# INCLUDES
|
||||||
include_directories(${PROJECT_SOURCE_DIR}/include)
|
include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/external)
|
||||||
|
|
||||||
# SOURCES
|
# SOURCES
|
||||||
# get all the *.cc files in the subfolders
|
# get all the *.cc files in the subfolders
|
||||||
|
@ -170,7 +173,12 @@ if(ENABLE_PANPHASIA)
|
||||||
list (APPEND SOURCES
|
list (APPEND SOURCES
|
||||||
${PROJECT_SOURCE_DIR}/external/panphasia/panphasia_routines.f
|
${PROJECT_SOURCE_DIR}/external/panphasia/panphasia_routines.f
|
||||||
${PROJECT_SOURCE_DIR}/external/panphasia/generic_lecuyer.f90
|
${PROJECT_SOURCE_DIR}/external/panphasia/generic_lecuyer.f90
|
||||||
|
#${PROJECT_SOURCE_DIR}/external/panphasia_ho/main.c
|
||||||
|
${PROJECT_SOURCE_DIR}/external/panphasia_ho/high_order_panphasia_routines.c
|
||||||
|
${PROJECT_SOURCE_DIR}/external/panphasia_ho/pan_mpi_routines.c
|
||||||
|
${PROJECT_SOURCE_DIR}/external/panphasia_ho/uniform_rand_threefry4x64.c
|
||||||
)
|
)
|
||||||
|
# target_include_directories(${PRGNAME} PRIVATE ${PROJECT_SOURCE_DIR}/external/panphasia_ho)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# project configuration header
|
# project configuration header
|
||||||
|
@ -246,6 +254,10 @@ if(ENABLE_PANPHASIA)
|
||||||
target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA")
|
target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA")
|
||||||
endif(ENABLE_PANPHASIA)
|
endif(ENABLE_PANPHASIA)
|
||||||
|
|
||||||
|
# if(ENABLE_PANPHASIA_HO)
|
||||||
|
# target_compile_definitions(${PRGNAME} PRIVATE "USE_PANPHASIA_HO")
|
||||||
|
# endif(ENABLE_PANPHASIA_HO)
|
||||||
|
|
||||||
if(ENABLE_PLT)
|
if(ENABLE_PLT)
|
||||||
target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_PLT")
|
target_compile_definitions(${PRGNAME} PRIVATE "ENABLE_PLT")
|
||||||
endif(ENABLE_PLT)
|
endif(ENABLE_PLT)
|
||||||
|
|
11
example.conf
11
example.conf
|
@ -16,7 +16,7 @@ LPTorder = 3 # order of the LPT to be used (1,2 or 3)
|
||||||
DoBaryons = no # also do baryon ICs?
|
DoBaryons = no # also do baryon ICs?
|
||||||
DoBaryonVrel = no # if doing baryons, incl. also relative velocity to linear order?
|
DoBaryonVrel = no # if doing baryons, incl. also relative velocity to linear order?
|
||||||
|
|
||||||
DoFixing = yes # do mode fixing à la Angulo&Pontzen (https://arxiv.org/abs/1603.05253)
|
DoFixing = no # do mode fixing à la Angulo&Pontzen (https://arxiv.org/abs/1603.05253)
|
||||||
DoInversion = no # invert phases (for paired simulations)
|
DoInversion = no # invert phases (for paired simulations)
|
||||||
|
|
||||||
ParticleLoad = sc # particle load, can be 'sc' (1x), 'bcc' (2x) or 'fcc' (4x)
|
ParticleLoad = sc # particle load, can be 'sc' (1x), 'bcc' (2x) or 'fcc' (4x)
|
||||||
|
@ -89,15 +89,16 @@ ztarget = 2.5 # target redshift for CLASS module, output at
|
||||||
##> NGenIC compatible random number generator module compatible with V. Springel's original code
|
##> NGenIC compatible random number generator module compatible with V. Springel's original code
|
||||||
## (https://www.h-its.org/2014/11/05/ngenic-code/) as well as the 2LPT code by Pueblas&Scoccmiarro
|
## (https://www.h-its.org/2014/11/05/ngenic-code/) as well as the 2LPT code by Pueblas&Scoccmiarro
|
||||||
## (https://cosmo.nyu.edu/roman/2LPT/)
|
## (https://cosmo.nyu.edu/roman/2LPT/)
|
||||||
generator = NGENIC
|
# generator = NGENIC
|
||||||
seed = 12345
|
# seed = 12345
|
||||||
|
|
||||||
##> The PANPHASIA generator uses a plugin based on original code by A. Jenkins
|
##> The PANPHASIA generator uses a plugin based on original code by A. Jenkins
|
||||||
## Warning: Before using this module, please make sure you read and agree to the distinct license
|
## Warning: Before using this module, please make sure you read and agree to the distinct license
|
||||||
## requirements by registering on the website http://icc.dur.ac.uk/Panphasia.php
|
## requirements by registering on the website http://icc.dur.ac.uk/Panphasia.php
|
||||||
|
|
||||||
# generator = PANPHASIA
|
generator = PANPHASIA_HO
|
||||||
# descriptor = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL]
|
#descriptor = [Panph1,L10,(800,224,576),S9,CH1564365824,MXXL]
|
||||||
|
descriptor = [Panph6,L20,(424060,82570,148256),S1,CH-999,Auriga_100_vol2]
|
||||||
# PanphasiaMinRootResolution = 512 # requires the white noise reallisation to be made at least at that resolution (default is 512)
|
# PanphasiaMinRootResolution = 512 # requires the white noise reallisation to be made at least at that resolution (default is 512)
|
||||||
|
|
||||||
##> The MUSIC1 multi-scale random number generator is provided for convenience
|
##> The MUSIC1 multi-scale random number generator is provided for convenience
|
||||||
|
|
35
external/panphasia_ho/LICENSE
vendored
Normal file
35
external/panphasia_ho/LICENSE
vendored
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
The code in this subdirectory is part of Adrian Jenkins' PANPHASIA packet,
|
||||||
|
obtained from here http://icc.dur.ac.uk/Panphasia.php
|
||||||
|
|
||||||
|
PANPHASIA is not published under the GPL but has its own proprietary license,
|
||||||
|
make sure to visit the website before using the PANPHASIA functionality of
|
||||||
|
MUSIC2 and register your name.
|
||||||
|
|
||||||
|
We reproduce the licensing requirements for PANPHASIA from the above website
|
||||||
|
as retrieved on 2020/08/23:
|
||||||
|
|
||||||
|
We make our software available for free but with a licence that includes the
|
||||||
|
condition that users make sure the phases of any new simulation volumes set up
|
||||||
|
using Panphasia are published.
|
||||||
|
|
||||||
|
We are happy to collaborate with others on improving the software and providing
|
||||||
|
support for languages other than fortran. Contact: A.R.Jenkins@durham.ac.uk
|
||||||
|
|
||||||
|
LICENCE:
|
||||||
|
|
||||||
|
You are licensed to use this software free of charge on condition that:
|
||||||
|
|
||||||
|
- you will publish the phase descriptors and reference Jenkins (13) for any new
|
||||||
|
simulations that use Panphasia phases. You will pass on this condition to others
|
||||||
|
for any software or data you make available publically or privately that makes
|
||||||
|
use of Panphasia.
|
||||||
|
- that you will ensure any publications using results derived from Panphasia will
|
||||||
|
be submitted as a final version to arXiv prior to or coincident with publication
|
||||||
|
in a journal.
|
||||||
|
- that you report any bugs in this software as soon as confirmed to
|
||||||
|
A.R.Jenkins@durham.ac.uk
|
||||||
|
- that you understand that the software comes with no warranty and that is your
|
||||||
|
responsibility to ensure that it is suitable for the purpose that you intend.
|
||||||
|
- that you agree to having your name and email address stored for an indefinite
|
||||||
|
period in the future electronically in a database as a record that you agreed
|
||||||
|
the licence conditions.
|
61
external/panphasia_ho/PAN_FFTW3.h
vendored
Normal file
61
external/panphasia_ho/PAN_FFTW3.h
vendored
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
// Define macros for FFTW3 to allow swapping
|
||||||
|
// between single/double precision FTs
|
||||||
|
|
||||||
|
// include CMake controlled configuration settings
|
||||||
|
#pragma once
|
||||||
|
#include "cmake_config.hh"
|
||||||
|
|
||||||
|
#if defined(USE_PRECISION_DOUBLE)
|
||||||
|
#define FOURIER_DOUBLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(USE_PRECISION_LONGDOUBLE)
|
||||||
|
#error "PANPHASIA-high-order does not currently support long double precision"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef FOURIER_DOUBLE
|
||||||
|
#define FFTW_REAL double
|
||||||
|
#define FFTW_PLAN fftw_plan
|
||||||
|
#define FFTW_DESTROY_PLAN fftw_destroy_plan
|
||||||
|
#define FFTW_COMPLEX fftw_complex
|
||||||
|
#define FFTW_MALLOC fftw_malloc
|
||||||
|
#define FFTW_PLAN_DFT_1D fftw_plan_dft_1d
|
||||||
|
#define FFTW_PLAN_dft_3D fftw_plan_dft_3d
|
||||||
|
#define FFTW_EXECUTE fftw_execute
|
||||||
|
#define FFTW_DESTROY_PLAN fftw_destroy_plan
|
||||||
|
#define FFTW_FREE fftw_free
|
||||||
|
#define FFTW_ALLOC_COMPLEX fftw_alloc_complex
|
||||||
|
#define FFTW_MPI_LOCAL_SIZE_MANY fftw_mpi_local_size_many
|
||||||
|
#define FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED fftw_mpi_local_size_many_transposed
|
||||||
|
#define FFTW_MPI_PLAN_MANY_TRANSPOSE fftw_mpi_plan_many_transpose
|
||||||
|
#define FFTW_MPI_EXECUTE_R2R fftw_mpi_execute_r2r
|
||||||
|
#define FFTW_PLAN_MANY_DFT fftw_plan_many_dft
|
||||||
|
#define FFTW_MPI_LOCAL_SIZE_3D fftw_mpi_local_size_3d
|
||||||
|
#define FFTW_MPI_PLAN_MANY_DTF fftw_mpi_plan_many_dft
|
||||||
|
#define FFTW_MPI_PLAN_MANY_DTF_R2C fftw_mpi_plan_many_dft_r2c
|
||||||
|
#define FFTW_MPI_EXECUTE_DFT fftw_mpi_execute_dft
|
||||||
|
#define FFTW_MPI_EXECUTE_DFT_R2C fftw_mpi_execute_dft_r2c
|
||||||
|
#else
|
||||||
|
#define FFTW_REAL float
|
||||||
|
#define FFTW_PLAN fftwf_plan
|
||||||
|
#define FFTW_DESTROY_PLAN fftwf_destroy_plan
|
||||||
|
#define FFTW_COMPLEX fftwf_complex
|
||||||
|
#define FFTW_MALLOC fftwf_malloc
|
||||||
|
#define FFTW_PLAN_DFT_1D fftwf_plan_dft_1d
|
||||||
|
#define FFTW_PLAN_dft_3D fftwf_plan_dft_3d
|
||||||
|
#define FFTW_EXECUTE fftwf_execute
|
||||||
|
#define FFTW_DESTROY_PLAN fftwf_destroy_plan
|
||||||
|
#define FFTW_FREE fftwf_free
|
||||||
|
#define FFTW_ALLOC_COMPLEX fftwf_alloc_complex
|
||||||
|
#define FFTW_MPI_LOCAL_SIZE_MANY fftwf_mpi_local_size_many
|
||||||
|
#define FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED fftwf_mpi_local_size_many_transposed
|
||||||
|
#define FFTW_MPI_PLAN_MANY_TRANSPOSE fftwf_mpi_plan_many_transpose
|
||||||
|
#define FFTW_MPI_EXECUTE_R2R fftwf_mpi_execute_r2r
|
||||||
|
#define FFTW_PLAN_MANY_DFT fftwf_plan_many_dft
|
||||||
|
#define FFTW_MPI_LOCAL_SIZE_3D fftwf_mpi_local_size_3d
|
||||||
|
#define FFTW_MPI_PLAN_MANY_DTF fftwf_mpi_plan_many_dft
|
||||||
|
#define FFTW_MPI_PLAN_MANY_DTF_R2C fftwf_mpi_plan_many_dft_r2c
|
||||||
|
#define FFTW_MPI_EXECUTE_DFT fftwf_mpi_execute_dft
|
||||||
|
#define FFTW_MPI_EXECUTE_DFT_R2C fftwf_mpi_execute_dft_r2c
|
||||||
|
#endif
|
143
external/panphasia_ho/README
vendored
Normal file
143
external/panphasia_ho/README
vendored
Normal file
|
@ -0,0 +1,143 @@
|
||||||
|
|
||||||
|
|
||||||
|
modules on COSMA7
|
||||||
|
|
||||||
|
intel_comp/2018 fftw/3.3.9cosma7
|
||||||
|
intel_mpi/2018 gsl/2.5
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
The code calls a function to generate the k-space modes for
|
||||||
|
a portion of the Panphasia field given an input descriptor.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Should be called early before significant memory is allocated. It
|
||||||
|
uses quite a bit of memory itself, but tidies up afterwards.
|
||||||
|
|
||||||
|
Has OpenMP - -DUSE_OPENMP in the makefile
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
The routines support both single and double precision
|
||||||
|
calculations in two senses.
|
||||||
|
|
||||||
|
The Fourier computations can be single or double precision
|
||||||
|
|
||||||
|
MACROs FFTW_REAL/FFTW_COMPLEX used to define 'Fourier' precision types
|
||||||
|
float or double.
|
||||||
|
|
||||||
|
The Panphasia coefficients can be single or double precision
|
||||||
|
|
||||||
|
MACROs PAN_REAL/PAN_COMPLEX define the Panphasia precision - either
|
||||||
|
float or double.
|
||||||
|
|
||||||
|
|
||||||
|
To change the Fourier precision edit PAN_FFTW3.h - by default
|
||||||
|
single precision unless 'FOURIER_DOUBLE' is defined.
|
||||||
|
|
||||||
|
To change the Panphasia precision edit panphasia_functions.h and
|
||||||
|
single precision unless 'PAN_DOUBLE_PRECISION' is defined.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Code description
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
makefile
|
||||||
|
|
||||||
|
CODE
|
||||||
|
----
|
||||||
|
|
||||||
|
|
||||||
|
main.c - demo program only
|
||||||
|
|
||||||
|
pan_mpi_routines.c - contains MPI calls
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
high_order_panphasia_routines.c - serial - contains some OpenMP
|
||||||
|
uniform_rand_threefry4x64.c - serial - random generator and tests
|
||||||
|
|
||||||
|
Include files
|
||||||
|
--------------
|
||||||
|
|
||||||
|
panphasia_functions.h
|
||||||
|
|
||||||
|
PAN_FFTW3.h - MACROS for single/double precision FTs
|
||||||
|
|
||||||
|
pan_matrices_order6.h - matrix coefficients for 6th order scheme.
|
||||||
|
|
||||||
|
|
||||||
|
threefry.h - Random generator
|
||||||
|
array.h + features array .h files
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Development notes:
|
||||||
|
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
14th April 2021
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Found a bug in the OpenMP version. Different numbers
|
||||||
|
of threads led to a subset of Fourier modes having
|
||||||
|
different values. The precise differences changed
|
||||||
|
each time the code was run.
|
||||||
|
|
||||||
|
Debugged by turning of OpenMP section by section.
|
||||||
|
The section which uses the spherical bessel functions
|
||||||
|
turned out to be responsible.
|
||||||
|
|
||||||
|
The faulty version collapsed for 4 loops over
|
||||||
|
multipole,x,y,z. Changing this to a loop
|
||||||
|
over multipoles, and collapsing 3 coordinate
|
||||||
|
loops solved the problem.
|
||||||
|
|
||||||
|
The variable index1 of the return field
|
||||||
|
does not depend on the multipole, while
|
||||||
|
index2 does. Both index1 and index2 are
|
||||||
|
private. This means the return array
|
||||||
|
(index 1) is updated several times.
|
||||||
|
Presumably as these updated occur
|
||||||
|
in parallel with the 4 loop collapsed
|
||||||
|
version the return array was being
|
||||||
|
corrupted sometimes.
|
||||||
|
|
||||||
|
|
||||||
|
15th April
|
||||||
|
------------
|
||||||
|
|
||||||
|
This version supercedes version given to
|
||||||
|
Oliver to add to MonofonIC clone.
|
||||||
|
|
||||||
|
Main difference is additional OpenMP
|
||||||
|
statements and the ability to specify
|
||||||
|
in the descriptor that modes less
|
||||||
|
than of equal to some dimensionless
|
||||||
|
integer wavenumber squared are set
|
||||||
|
to the mean power.
|
||||||
|
|
||||||
|
Tested output on 1 core - with/without
|
||||||
|
OpenMP. Not tested with more than
|
||||||
|
1 MPI rank.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
326
external/panphasia_ho/array.h
vendored
Normal file
326
external/panphasia_ho/array.h
vendored
Normal file
|
@ -0,0 +1,326 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef _r123array_dot_h__
|
||||||
|
#define _r123array_dot_h__
|
||||||
|
#include "features/compilerfeatures.h"
|
||||||
|
#include "features/sse.h"
|
||||||
|
|
||||||
|
#ifndef __cplusplus
|
||||||
|
#define CXXMETHODS(_N, W, T)
|
||||||
|
#define CXXOVERLOADS(_N, W, T)
|
||||||
|
#else
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <iterator>
|
||||||
|
#include <limits>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
/** @defgroup arrayNxW The r123arrayNxW classes
|
||||||
|
|
||||||
|
Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers.
|
||||||
|
It is functionally equivalent to the C++0x std::array<N, uintW_t>,
|
||||||
|
but does not require C++0x features or libraries.
|
||||||
|
|
||||||
|
In addition to meeting most of the requirements of a Container,
|
||||||
|
it also has a member function, incr(), which increments the zero-th
|
||||||
|
element and carrys overflows into higher indexed elements. Thus,
|
||||||
|
by using incr(), sequences of up to 2^(N*W) distinct values
|
||||||
|
can be produced.
|
||||||
|
|
||||||
|
If SSE is supported by the compiler, then the class
|
||||||
|
r123array1xm128i is also defined, in which the data member is an
|
||||||
|
array of one r123128i object.
|
||||||
|
|
||||||
|
@cond HIDDEN_FROM_DOXYGEN
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename value_type>
|
||||||
|
inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
|
||||||
|
value_type v=0;
|
||||||
|
for(size_t i=0; i<(3+sizeof(value_type))/4; ++i)
|
||||||
|
v |= ((value_type)(*p32++)) << (32*i);
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Work-alike methods and typedefs modeled on std::array:
|
||||||
|
#define CXXMETHODS(_N, W, T) \
|
||||||
|
typedef T value_type; \
|
||||||
|
typedef T* iterator; \
|
||||||
|
typedef const T* const_iterator; \
|
||||||
|
typedef value_type& reference; \
|
||||||
|
typedef const value_type& const_reference; \
|
||||||
|
typedef size_t size_type; \
|
||||||
|
typedef ptrdiff_t difference_type; \
|
||||||
|
typedef T* pointer; \
|
||||||
|
typedef const T* const_pointer; \
|
||||||
|
typedef std::reverse_iterator<iterator> reverse_iterator; \
|
||||||
|
typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \
|
||||||
|
/* Boost.array has static_size. C++11 specializes tuple_size */ \
|
||||||
|
enum {static_size = _N}; \
|
||||||
|
R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \
|
||||||
|
R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \
|
||||||
|
R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
|
||||||
|
R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
|
||||||
|
R123_CUDA_DEVICE size_type size() const { return _N; } \
|
||||||
|
R123_CUDA_DEVICE size_type max_size() const { return _N; } \
|
||||||
|
R123_CUDA_DEVICE bool empty() const { return _N==0; }; \
|
||||||
|
R123_CUDA_DEVICE iterator begin() { return &v[0]; } \
|
||||||
|
R123_CUDA_DEVICE iterator end() { return &v[_N]; } \
|
||||||
|
R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \
|
||||||
|
R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \
|
||||||
|
R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \
|
||||||
|
R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \
|
||||||
|
R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \
|
||||||
|
R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \
|
||||||
|
R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \
|
||||||
|
R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \
|
||||||
|
R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \
|
||||||
|
R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \
|
||||||
|
R123_CUDA_DEVICE pointer data(){ return &v[0]; } \
|
||||||
|
R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \
|
||||||
|
R123_CUDA_DEVICE reference front(){ return v[0]; } \
|
||||||
|
R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \
|
||||||
|
R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \
|
||||||
|
R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \
|
||||||
|
R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \
|
||||||
|
/* CUDA3 does not have std::equal */ \
|
||||||
|
for (size_t i = 0; i < _N; ++i) \
|
||||||
|
if (v[i] != rhs.v[i]) return false; \
|
||||||
|
return true; \
|
||||||
|
} \
|
||||||
|
R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \
|
||||||
|
/* CUDA3 does not have std::fill_n */ \
|
||||||
|
R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \
|
||||||
|
R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \
|
||||||
|
/* CUDA3 does not have std::swap_ranges */ \
|
||||||
|
for (size_t i = 0; i < _N; ++i) { \
|
||||||
|
T tmp = v[i]; \
|
||||||
|
v[i] = rhs.v[i]; \
|
||||||
|
rhs.v[i] = tmp; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \
|
||||||
|
/* This test is tricky because we're trying to avoid spurious \
|
||||||
|
complaints about illegal shifts, yet still be compile-time \
|
||||||
|
evaulated. */ \
|
||||||
|
if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \
|
||||||
|
return incr_carefully(n); \
|
||||||
|
if(n==1){ \
|
||||||
|
++v[0]; \
|
||||||
|
if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this; \
|
||||||
|
}else{ \
|
||||||
|
v[0] += n; \
|
||||||
|
if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this; \
|
||||||
|
} \
|
||||||
|
/* We expect that the N==?? tests will be \
|
||||||
|
constant-folded/optimized away by the compiler, so only the \
|
||||||
|
overflow tests (!!v[i]) remain to be done at runtime. For \
|
||||||
|
small values of N, it would be better to do this as an \
|
||||||
|
uncondtional sequence of adc. An experiment/optimization \
|
||||||
|
for another day... \
|
||||||
|
N.B. The weird subscripting: v[_N>3?3:0] is to silence \
|
||||||
|
a spurious error from icpc \
|
||||||
|
*/ \
|
||||||
|
++v[_N>1?1:0]; \
|
||||||
|
if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \
|
||||||
|
++v[_N>2?2:0]; \
|
||||||
|
if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \
|
||||||
|
++v[_N>3?3:0]; \
|
||||||
|
for(size_t i=4; i<_N; ++i){ \
|
||||||
|
if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \
|
||||||
|
++v[i]; \
|
||||||
|
} \
|
||||||
|
return *this; \
|
||||||
|
} \
|
||||||
|
/* seed(SeedSeq) would be a constructor if having a constructor */ \
|
||||||
|
/* didn't cause headaches with defaults */ \
|
||||||
|
template <typename SeedSeq> \
|
||||||
|
R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \
|
||||||
|
r123array##_N##x##W ret; \
|
||||||
|
const size_t Ngen = _N*((3+sizeof(value_type))/4); \
|
||||||
|
uint32_t u32[Ngen]; \
|
||||||
|
uint32_t *p32 = &u32[0]; \
|
||||||
|
ss.generate(&u32[0], &u32[Ngen]); \
|
||||||
|
for(size_t i=0; i<_N; ++i){ \
|
||||||
|
ret.v[i] = assemble_from_u32<value_type>(p32); \
|
||||||
|
p32 += (3+sizeof(value_type))/4; \
|
||||||
|
} \
|
||||||
|
return ret; \
|
||||||
|
} \
|
||||||
|
protected: \
|
||||||
|
R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \
|
||||||
|
/* n may be greater than the maximum value of a single value_type */ \
|
||||||
|
value_type vtn; \
|
||||||
|
vtn = n; \
|
||||||
|
v[0] += n; \
|
||||||
|
const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \
|
||||||
|
for(size_t i=1; i<_N; ++i){ \
|
||||||
|
if(rshift){ \
|
||||||
|
n >>= rshift; \
|
||||||
|
}else{ \
|
||||||
|
n=0; \
|
||||||
|
} \
|
||||||
|
if( v[i-1] < vtn ) \
|
||||||
|
++n; \
|
||||||
|
if( n==0 ) break; \
|
||||||
|
vtn = n; \
|
||||||
|
v[i] += n; \
|
||||||
|
} \
|
||||||
|
return *this; \
|
||||||
|
} \
|
||||||
|
|
||||||
|
|
||||||
|
// There are several tricky considerations for the insertion and extraction
|
||||||
|
// operators:
|
||||||
|
// - we would like to be able to print r123array16x8 as a sequence of 16 integers,
|
||||||
|
// not as 16 bytes.
|
||||||
|
// - we would like to be able to print r123array1xm128i.
|
||||||
|
// - we do not want an int conversion operator in r123m128i because it causes
|
||||||
|
// lots of ambiguity problems with automatic promotions.
|
||||||
|
// Solution: r123arrayinsertable and r123arrayextractable
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
struct r123arrayinsertable{
|
||||||
|
const T& v;
|
||||||
|
r123arrayinsertable(const T& t_) : v(t_) {}
|
||||||
|
friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<T>& t){
|
||||||
|
return os << t.v;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
struct r123arrayinsertable<uint8_t>{
|
||||||
|
const uint8_t& v;
|
||||||
|
r123arrayinsertable(const uint8_t& t_) : v(t_) {}
|
||||||
|
friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<uint8_t>& t){
|
||||||
|
return os << (int)t.v;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
struct r123arrayextractable{
|
||||||
|
T& v;
|
||||||
|
r123arrayextractable(T& t_) : v(t_) {}
|
||||||
|
friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){
|
||||||
|
return is >> t.v;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
struct r123arrayextractable<uint8_t>{
|
||||||
|
uint8_t& v;
|
||||||
|
r123arrayextractable(uint8_t& t_) : v(t_) {}
|
||||||
|
friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){
|
||||||
|
int i;
|
||||||
|
is >> i;
|
||||||
|
t.v = i;
|
||||||
|
return is;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#define CXXOVERLOADS(_N, W, T) \
|
||||||
|
\
|
||||||
|
inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \
|
||||||
|
os << r123arrayinsertable<T>(a.v[0]); \
|
||||||
|
for(size_t i=1; i<_N; ++i) \
|
||||||
|
os << " " << r123arrayinsertable<T>(a.v[i]); \
|
||||||
|
return os; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \
|
||||||
|
for(size_t i=0; i<_N; ++i){ \
|
||||||
|
r123arrayextractable<T> x(a.v[i]); \
|
||||||
|
is >> x; \
|
||||||
|
} \
|
||||||
|
return is; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
namespace r123{ \
|
||||||
|
typedef r123array##_N##x##W Array##_N##x##W; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
|
/* _r123array_tpl expands to a declaration of struct r123arrayNxW.
|
||||||
|
|
||||||
|
In C, it's nothing more than a struct containing an array of N
|
||||||
|
objects of type T.
|
||||||
|
|
||||||
|
In C++ it's the same, but endowed with an assortment of member
|
||||||
|
functions, typedefs and friends. In C++, r123arrayNxW looks a lot
|
||||||
|
like std::array<T,N>, has most of the capabilities of a container,
|
||||||
|
and satisfies the requirements outlined in compat/Engine.hpp for
|
||||||
|
counter and key types. ArrayNxW, in the r123 namespace is
|
||||||
|
a typedef equivalent to r123arrayNxW.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define _r123array_tpl(_N, W, T) \
|
||||||
|
/** @ingroup arrayNxW */ \
|
||||||
|
/** @see arrayNxW */ \
|
||||||
|
struct r123array##_N##x##W{ \
|
||||||
|
T v[_N]; \
|
||||||
|
CXXMETHODS(_N, W, T) \
|
||||||
|
}; \
|
||||||
|
\
|
||||||
|
CXXOVERLOADS(_N, W, T)
|
||||||
|
|
||||||
|
/** @endcond */
|
||||||
|
|
||||||
|
_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */
|
||||||
|
_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */
|
||||||
|
_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */
|
||||||
|
_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */
|
||||||
|
|
||||||
|
_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */
|
||||||
|
_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */
|
||||||
|
_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */
|
||||||
|
|
||||||
|
_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */
|
||||||
|
|
||||||
|
#if R123_USE_SSE
|
||||||
|
_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* In C++, it's natural to use sizeof(a::value_type), but in C it's
|
||||||
|
pretty convoluted to figure out the width of the value_type of an
|
||||||
|
r123arrayNxW:
|
||||||
|
*/
|
||||||
|
#define R123_W(a) (8*sizeof(((a *)0)->v[0]))
|
||||||
|
|
||||||
|
/** @namespace r123
|
||||||
|
Most of the Random123 C++ API is contained in the r123 namespace.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
33
external/panphasia_ho/c7_script_threaded
vendored
Executable file
33
external/panphasia_ho/c7_script_threaded
vendored
Executable file
|
@ -0,0 +1,33 @@
|
||||||
|
#!/bin/bash -l
|
||||||
|
|
||||||
|
#SBATCH --ntasks 5
|
||||||
|
#SBATCH -J Test_MPI_FFTW
|
||||||
|
#SBATCH -o standard_output_file.%J.out
|
||||||
|
#SBATCH -e standard_error_file.%J.err
|
||||||
|
#SBATCH -p cosma7
|
||||||
|
#SBATCH -A dp004
|
||||||
|
#SBATCH --exclusive
|
||||||
|
#SBATCH -t 00:05:00
|
||||||
|
#SBATCH --mail-type=END # notifications for job
|
||||||
|
#SBATCH --mail-user=a.r.jenkins@durham.ac.uk
|
||||||
|
|
||||||
|
module purge
|
||||||
|
module load intel_comp/2018 intel_mpi/2018 fftw/3.3.9cosma7 gsl/2.5 hdf5/1.8.20
|
||||||
|
|
||||||
|
|
||||||
|
# Run the program
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
mpirun -l -env I_MPI_PIN=1 -env I_MPI_PIN_PROCESSOR_LIST=allcores -n $SLURM_NTASKS ./pan_fftw3_test_code.x
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
93
external/panphasia_ho/features/clangfeatures.h
vendored
Normal file
93
external/panphasia_ho/features/clangfeatures.h
vendored
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2016, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef __clangfeatures_dot_hpp
|
||||||
|
#define __clangfeatures_dot_hpp
|
||||||
|
|
||||||
|
#ifndef R123_USE_X86INTRIN_H
|
||||||
|
#if (defined(__x86_64__)||defined(__i386__))
|
||||||
|
#define R123_USE_X86INTRIN_H 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_X86INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
|
||||||
|
#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_STATIC_ASSERT
|
||||||
|
#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// With clang-3.6, -Wall warns about unused-local-typedefs.
|
||||||
|
// The "obvious" thing to do is to ignore -Wunused-local-typedefs,
|
||||||
|
// but that doesn't work because earlier versions of clang blow
|
||||||
|
// up on an 'unknown warning group'. So we briefly ignore -Wall...
|
||||||
|
// It's tempting to just give up on static assertions in pre-c++11 code.
|
||||||
|
#if !R123_USE_CXX11_STATIC_ASSERT && !defined(R123_STATIC_ASSERT)
|
||||||
|
#define R123_STATIC_ASSERT(expr, msg) \
|
||||||
|
_Pragma("clang diagnostic push") \
|
||||||
|
_Pragma("clang diagnostic ignored \"-Wall\"") \
|
||||||
|
typedef char static_assertion[(!!(expr))*2-1] \
|
||||||
|
_Pragma("clang diagnostic pop")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_CONSTEXPR
|
||||||
|
#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
|
||||||
|
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// With clang-3.0, the apparently simpler:
|
||||||
|
// #define R123_USE_CXX11_RANDOM __has_include(<random>)
|
||||||
|
// dumps core.
|
||||||
|
#ifndef R123_USE_CXX11_RANDOM
|
||||||
|
#if __cplusplus>=201103L && __has_include(<random>)
|
||||||
|
#define R123_USE_CXX11_RANDOM 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_CXX11_RANDOM 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_TYPE_TRAITS
|
||||||
|
#if __cplusplus>=201103L && __has_include(<type_traits>)
|
||||||
|
#define R123_USE_CXX11_TYPE_TRAITS 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_CXX11_TYPE_TRAITS 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "gccfeatures.h"
|
||||||
|
|
||||||
|
#endif
|
343
external/panphasia_ho/features/compilerfeatures.h
vendored
Normal file
343
external/panphasia_ho/features/compilerfeatures.h
vendored
Normal file
|
@ -0,0 +1,343 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
|
||||||
|
@page porting Preprocessor symbols for porting Random123 to different platforms.
|
||||||
|
|
||||||
|
The Random123 library is portable across C, C++, CUDA, OpenCL environments,
|
||||||
|
and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris).
|
||||||
|
This level of portability requires the abstraction of some features
|
||||||
|
and idioms that are either not standardized (e.g., asm statments), or for which
|
||||||
|
different vendors have their own standards (e.g., SSE intrinsics) or for
|
||||||
|
which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>).
|
||||||
|
|
||||||
|
Random123/features/compilerfeatures.h
|
||||||
|
conditionally includes a compiler-or-OS-specific Random123/featires/XXXfeatures.h file which
|
||||||
|
defines appropriate values for the preprocessor symbols which can be used with
|
||||||
|
a specific compiler or OS. Those symbols will then
|
||||||
|
be used by other header files and source files in the Random123
|
||||||
|
library (and may be used by applications) to control what actually
|
||||||
|
gets presented to the compiler.
|
||||||
|
|
||||||
|
Most of the symbols are boolean valued. In general, they will
|
||||||
|
\b always be defined with value either 1 or 0, so do
|
||||||
|
\b NOT use \#ifdef. Use \#if R123_USE_SOMETHING instead.
|
||||||
|
|
||||||
|
Library users can override any value by defining the pp-symbol with a compiler option,
|
||||||
|
e.g.,
|
||||||
|
|
||||||
|
cc -DR123_USE_MULHILO64_C99
|
||||||
|
|
||||||
|
will use a strictly c99 version of the full-width 64x64->128-bit multiplication
|
||||||
|
function, even if it would be disabled by default.
|
||||||
|
|
||||||
|
All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_
|
||||||
|
@verbatim
|
||||||
|
AES_NI
|
||||||
|
AES_OPENSSL
|
||||||
|
SSE4_2
|
||||||
|
SSE4_1
|
||||||
|
SSE
|
||||||
|
|
||||||
|
STD_RANDOM
|
||||||
|
|
||||||
|
GNU_UINT128
|
||||||
|
ASM_GNU
|
||||||
|
ASM_MSASM
|
||||||
|
|
||||||
|
CPUID_MSVC
|
||||||
|
|
||||||
|
CXX11_RANDOM
|
||||||
|
CXX11_TYPE_TRAITS
|
||||||
|
CXX11_STATIC_ASSERT
|
||||||
|
CXX11_CONSTEXPR
|
||||||
|
CXX11_UNRESTRICTED_UNIONS
|
||||||
|
CXX11_EXPLICIT_CONVERSIONS
|
||||||
|
CXX11_LONG_LONG
|
||||||
|
CXX11_STD_ARRAY
|
||||||
|
CXX11
|
||||||
|
|
||||||
|
X86INTRIN_H
|
||||||
|
IA32INTRIN_H
|
||||||
|
XMMINTRIN_H
|
||||||
|
EMMINTRIN_H
|
||||||
|
SMMINTRIN_H
|
||||||
|
WMMINTRIN_H
|
||||||
|
INTRIN_H
|
||||||
|
|
||||||
|
MULHILO32_ASM
|
||||||
|
MULHILO64_ASM
|
||||||
|
MULHILO64_MSVC_INTRIN
|
||||||
|
MULHILO64_CUDA_INTRIN
|
||||||
|
MULHILO64_OPENCL_INTRIN
|
||||||
|
MULHILO64_C99
|
||||||
|
|
||||||
|
U01_DOUBLE
|
||||||
|
|
||||||
|
@endverbatim
|
||||||
|
Most have obvious meanings. Some non-obvious ones:
|
||||||
|
|
||||||
|
AES_NI and AES_OPENSSL are not mutually exclusive. You can have one,
|
||||||
|
both or neither.
|
||||||
|
|
||||||
|
GNU_UINT128 says that it's safe to use __uint128_t, but it
|
||||||
|
does not require its use. In particular, it should be
|
||||||
|
used in mulhilo<uint64_t> only if MULHILO64_ASM is unset.
|
||||||
|
|
||||||
|
If the XXXINTRIN_H macros are true, then one should
|
||||||
|
@code
|
||||||
|
#include <xxxintrin.h>
|
||||||
|
@endcode
|
||||||
|
to gain accesss to compiler intrinsics.
|
||||||
|
|
||||||
|
The CXX11_SOME_FEATURE macros allow the code to use specific
|
||||||
|
features of the C++11 language and library. The catchall
|
||||||
|
In the absence of a specific CXX11_SOME_FEATURE, the feature
|
||||||
|
is controlled by the catch-all R123_USE_CXX11 macro.
|
||||||
|
|
||||||
|
U01_DOUBLE defaults on, and can be turned off (set to 0)
|
||||||
|
if one does not want the utility functions that convert to double
|
||||||
|
(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension.
|
||||||
|
|
||||||
|
There are a number of invariants that are always true. Application code may
|
||||||
|
choose to rely on these:
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>ASM_GNU and ASM_MASM are mutually exclusive
|
||||||
|
<li>The "higher" SSE values imply the lower ones.
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
There are also non-boolean valued symbols:
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>R123_STATIC_INLINE -
|
||||||
|
According to both C99 and GNU99, the 'static inline' declaration allows
|
||||||
|
the compiler to not emit code if the function is not used.
|
||||||
|
Note that the semantics of 'inline', 'static' and 'extern' in
|
||||||
|
gcc have changed over time and are subject to modification by
|
||||||
|
command line options, e.g., -std=gnu89, -fgnu-inline.
|
||||||
|
Nevertheless, it appears that the meaning of 'static inline'
|
||||||
|
has not changed over time and (with a little luck) the use of 'static inline'
|
||||||
|
here will be portable between versions of gcc and to other C99
|
||||||
|
compilers.
|
||||||
|
See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html
|
||||||
|
http://www.greenend.org.uk/rjk/2003/03/inline.html
|
||||||
|
|
||||||
|
<li>R123_FORCE_INLINE(decl) -
|
||||||
|
which expands to 'decl', adorned with the compiler-specific
|
||||||
|
embellishments to strongly encourage that the declared function be
|
||||||
|
inlined. If there is no such compiler-specific magic, it should
|
||||||
|
expand to decl, unadorned.
|
||||||
|
|
||||||
|
<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with
|
||||||
|
sufficiently similar semantics) when CUDA is in use, and expands
|
||||||
|
to nothing in other cases.
|
||||||
|
|
||||||
|
<li>R123_METAL_THREAD_ADDRESS_SPACE - which expands to 'thread' (or
|
||||||
|
something else with sufficiently similar semantics) when compiling a
|
||||||
|
Metal kernel, and expands to nothing in other cases.
|
||||||
|
|
||||||
|
<li>R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at
|
||||||
|
all if we're in an environment so feature-poor that you can't even
|
||||||
|
call assert (I'm looking at you, CUDA and OpenCL), or even include
|
||||||
|
assert.h safely (OpenCL).
|
||||||
|
|
||||||
|
<li>R123_STATIC_ASSERT(expr,msg) - which expands to
|
||||||
|
static_assert(expr,msg), or to an expression that
|
||||||
|
will raise a compile-time exception if expr is not true.
|
||||||
|
|
||||||
|
<li>R123_ULONG_LONG - which expands to a declaration of the longest available
|
||||||
|
unsigned integer.
|
||||||
|
|
||||||
|
<li>R123_64BIT(x) - expands to something equivalent to
|
||||||
|
UINT64_C(x) from <stdint.h>, even in environments where <stdint.h>
|
||||||
|
is not available, e.g., MSVC and OpenCL.
|
||||||
|
|
||||||
|
<li>R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with
|
||||||
|
the semantics of gcc's __builtin_expect(expr,likely_value). If
|
||||||
|
the environment has nothing like __builtin_expect, it should expand
|
||||||
|
to just expr.
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
|
\cond HIDDEN_FROM_DOXYGEN
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
N.B. When something is added to the list of features, it should be
|
||||||
|
added to each of the *features.h files, AND to examples/ut_features.cpp.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */
|
||||||
|
#if defined(__METAL_MACOS__)
|
||||||
|
#include "metalfeatures.h"
|
||||||
|
#elif defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
|
||||||
|
#include "openclfeatures.h"
|
||||||
|
#elif defined(__CUDACC__)
|
||||||
|
#include "nvccfeatures.h"
|
||||||
|
#elif defined(__ICC)
|
||||||
|
#include "iccfeatures.h"
|
||||||
|
#elif defined(__xlC__) || defined(__ibmxl__)
|
||||||
|
#include "xlcfeatures.h"
|
||||||
|
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
|
||||||
|
#include "sunprofeatures.h"
|
||||||
|
#elif defined(__OPEN64__)
|
||||||
|
#include "open64features.h"
|
||||||
|
#elif defined(__clang__)
|
||||||
|
#include "clangfeatures.h"
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
#include "gccfeatures.h"
|
||||||
|
#elif defined(__PGI)
|
||||||
|
#include "pgccfeatures.h"
|
||||||
|
#elif defined(_MSC_FULL_VER)
|
||||||
|
#include "msvcfeatures.h"
|
||||||
|
#else
|
||||||
|
#error "Can't identify compiler. You'll need to add a new xxfeatures.hpp"
|
||||||
|
{ /* maybe an unbalanced brace will terminate the compilation */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11
|
||||||
|
#define R123_USE_CXX11 (__cplusplus >= 201103L)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
|
||||||
|
#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_STATIC_ASSERT
|
||||||
|
#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_CONSTEXPR
|
||||||
|
#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
|
||||||
|
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_RANDOM
|
||||||
|
#define R123_USE_CXX11_RANDOM R123_USE_CXX11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_TYPE_TRAITS
|
||||||
|
#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_LONG_LONG
|
||||||
|
#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_STD_ARRAY
|
||||||
|
#define R123_USE_CXX11_STD_ARRAY R123_USE_CXX11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_C99
|
||||||
|
#define R123_USE_MULHILO64_C99 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MULHI_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
|
||||||
|
#define R123_USE_MULHILO32_MULHI_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_ASSERT
|
||||||
|
#if R123_USE_CXX11_STATIC_ASSERT
|
||||||
|
#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg)
|
||||||
|
#else
|
||||||
|
/* if msg always_looked_like_this, we could paste it into the name. Worth it? */
|
||||||
|
#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1]
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CONSTEXPR
|
||||||
|
#if R123_USE_CXX11_CONSTEXPR
|
||||||
|
#define R123_CONSTEXPR constexpr
|
||||||
|
#else
|
||||||
|
#define R123_CONSTEXPR
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_64BIT
|
||||||
|
#define R123_USE_64BIT 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_PHILOX_64BIT
|
||||||
|
#define R123_USE_PHILOX_64BIT (R123_USE_64BIT && (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ULONG_LONG
|
||||||
|
#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG
|
||||||
|
/* C++98 doesn't have long long. It doesn't have uint64_t either, but
|
||||||
|
we will have typedef'ed uint64_t to something in the xxxfeatures.h.
|
||||||
|
With luck, it won't elicit complaints from -pedantic. Cross your
|
||||||
|
fingers... */
|
||||||
|
#define R123_ULONG_LONG uint64_t
|
||||||
|
#else
|
||||||
|
#define R123_ULONG_LONG unsigned long long
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* UINT64_C should have been #defined by XXXfeatures.h, either by
|
||||||
|
#include <stdint.h> or through compiler-dependent hacks */
|
||||||
|
#ifndef R123_64BIT
|
||||||
|
#define R123_64BIT(x) UINT64_C(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_THROW
|
||||||
|
#define R123_THROW(x) throw (x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_METAL_THREAD_ADDRESS_SPACE
|
||||||
|
#define R123_METAL_THREAD_ADDRESS_SPACE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
|
||||||
|
#define R123_METAL_CONSTANT_ADDRESS_SPACE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Windows.h (and perhaps other "well-meaning" code define min and
|
||||||
|
* max, so there's a high chance that our definition of min, max
|
||||||
|
* methods or use of std::numeric_limits min and max will cause
|
||||||
|
* complaints in any program that happened to include Windows.h or
|
||||||
|
* suchlike first. We use the null macro below in our own header
|
||||||
|
* files definition or use of min, max to defensively preclude
|
||||||
|
* this problem. It may not be enough; one might need to #define
|
||||||
|
* NOMINMAX before including Windows.h or compile with -DNOMINMAX.
|
||||||
|
*/
|
||||||
|
#define R123_NO_MACRO_SUBST
|
||||||
|
|
||||||
|
/** \endcond */
|
263
external/panphasia_ho/features/gccfeatures.h
vendored
Normal file
263
external/panphasia_ho/features/gccfeatures.h
vendored
Normal file
|
@ -0,0 +1,263 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef __gccfeatures_dot_hpp
|
||||||
|
#define __gccfeatures_dot_hpp
|
||||||
|
|
||||||
|
#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
|
||||||
|
|
||||||
|
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390x__)
|
||||||
|
# error "This code has only been tested on x86, powerpc and a few arm platforms."
|
||||||
|
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
|
||||||
|
{ /* maybe an unbalanced brace will terminate the compilation */
|
||||||
|
/* Feel free to try the Random123 library on other architectures by changing
|
||||||
|
the conditions that reach this error, but you should consider it a
|
||||||
|
porting exercise and expect to encounter bugs and deficiencies.
|
||||||
|
Please let the authors know of any successes (or failures). */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __powerpc__
|
||||||
|
#include <ppu_intrinsics.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_INLINE
|
||||||
|
#define R123_STATIC_INLINE static __inline__
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_FORCE_INLINE
|
||||||
|
#if R123_GNUC_VERSION >= 40000
|
||||||
|
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
|
||||||
|
#else
|
||||||
|
#define R123_FORCE_INLINE(decl) decl
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#include <assert.h>
|
||||||
|
#define R123_ASSERT(x) assert(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* According to the C++0x standard, we should be able to test the numeric
|
||||||
|
value of __cplusplus == 199701L for C++98, __cplusplus == 201103L for C++11
|
||||||
|
But gcc has had an open bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773
|
||||||
|
since early 2001, which was finally fixed in 4.7 (early 2012). For
|
||||||
|
earlier versions, the only way to detect whether --std=c++0x was requested
|
||||||
|
on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.
|
||||||
|
*/
|
||||||
|
#if defined(__GCC_EXPERIMENTAL_CXX0X__)
|
||||||
|
#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 1/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
|
||||||
|
#else
|
||||||
|
#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 0/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
|
||||||
|
#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_STATIC_ASSERT
|
||||||
|
#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_CONSTEXPR
|
||||||
|
#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
|
||||||
|
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_RANDOM
|
||||||
|
#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CXX11_TYPE_TRAITS
|
||||||
|
#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
#ifdef __AES__
|
||||||
|
#define R123_USE_AES_NI 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_AES_NI 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_2
|
||||||
|
#ifdef __SSE4_2__
|
||||||
|
#define R123_USE_SSE4_2 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE4_2 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_1
|
||||||
|
#ifdef __SSE4_1__
|
||||||
|
#define R123_USE_SSE4_1 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE4_1 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE
|
||||||
|
/* There's no point in trying to compile SSE code in Random123
|
||||||
|
unless SSE2 is available. */
|
||||||
|
#ifdef __SSE2__
|
||||||
|
#define R123_USE_SSE 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_OPENSSL
|
||||||
|
/* There isn't really a good way to tell at compile time whether
|
||||||
|
openssl is available. Without a pre-compilation configure-like
|
||||||
|
tool, it's less error-prone to guess that it isn't available. Add
|
||||||
|
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
|
||||||
|
play with openssl */
|
||||||
|
#define R123_USE_AES_OPENSSL 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#if defined(__x86_64__) || defined(__aarch64__)
|
||||||
|
#define R123_USE_GNU_UINT128 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_ASM_GNU
|
||||||
|
#if (defined(__x86_64__)||defined(__i386__))
|
||||||
|
#define R123_USE_ASM_GNU 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_ASM_GNU 1
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CPUID_MSVC
|
||||||
|
#define R123_USE_CPUID_MSVC 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_X86INTRIN_H
|
||||||
|
#if (defined(__x86_64__)||defined(__i386__))
|
||||||
|
#define R123_USE_X86INTRIN_H (1/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402)
|
||||||
|
#else
|
||||||
|
#define R123_USE_X86INTRIN_H (0/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402)
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_IA32INTRIN_H
|
||||||
|
#define R123_USE_IA32INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_XMMINTRIN_H
|
||||||
|
#define R123_USE_XMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_EMMINTRIN_H
|
||||||
|
/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have
|
||||||
|
emmintrin.h in the include search path. This is
|
||||||
|
so broken that I refuse to try to work around it. If this
|
||||||
|
affects you, figure out where your emmintrin.h lives and
|
||||||
|
add an appropriate -I to your CPPFLAGS. Or add -DR123_USE_SSE=0. */
|
||||||
|
#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SMMINTRIN_H
|
||||||
|
#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_WMMINTRIN_H
|
||||||
|
#define R123_USE_WMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_INTRIN_H
|
||||||
|
#define R123_USE_INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_ASM
|
||||||
|
#define R123_USE_MULHILO32_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
|
||||||
|
#if (defined(__powerpc64__))
|
||||||
|
#define R123_USE_MULHILO64_MULHI_INTRIN 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_MULHILO64_MULHI_INTRIN 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_MULHILO64_MULHI_INTRIN
|
||||||
|
#define R123_MULHILO64_MULHI_INTRIN __mulhdu
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
|
||||||
|
#define R123_USE_MULHILO32_MULHI_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_MULHILO32_MULHI_INTRIN
|
||||||
|
#define R123_MULHILO32_MULHI_INTRIN __mulhwu
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __STDC_CONSTANT_MACROS
|
||||||
|
#define __STDC_CONSTANT_MACROS
|
||||||
|
#endif
|
||||||
|
#include <stdint.h>
|
||||||
|
#ifndef UINT64_C
|
||||||
|
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* If you add something, it must go in all the other XXfeatures.hpp
|
||||||
|
and in ../ut_features.cpp */
|
||||||
|
#endif
|
212
external/panphasia_ho/features/iccfeatures.h
vendored
Normal file
212
external/panphasia_ho/features/iccfeatures.h
vendored
Normal file
|
@ -0,0 +1,212 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef __icpcfeatures_dot_hpp
|
||||||
|
#define __icpcfeatures_dot_hpp
|
||||||
|
|
||||||
|
// icc relies on gcc libraries and other toolchain components.
|
||||||
|
#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
|
||||||
|
|
||||||
|
#if !defined(__x86_64__) && !defined(__i386__)
|
||||||
|
# error "This code has only been tested on x86 platforms."
|
||||||
|
{ // maybe an unbalanced brace will terminate the compilation
|
||||||
|
// You are invited to try Easy123 on other architectures, by changing
|
||||||
|
// the conditions that reach this error, but you should consider it a
|
||||||
|
// porting exercise and expect to encounter bugs and deficiencies.
|
||||||
|
// Please let the authors know of any successes (or failures).
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_INLINE
|
||||||
|
#define R123_STATIC_INLINE static inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_FORCE_INLINE
|
||||||
|
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#include <assert.h>
|
||||||
|
#define R123_ASSERT(x) assert(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// The basic idiom is:
|
||||||
|
// #ifndef R123_SOMETHING
|
||||||
|
// #if some condition
|
||||||
|
// #define R123_SOMETHING 1
|
||||||
|
// #else
|
||||||
|
// #define R123_SOMETHING 0
|
||||||
|
// #endif
|
||||||
|
// #endif
|
||||||
|
// This idiom allows an external user to override any decision
|
||||||
|
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
|
||||||
|
|
||||||
|
// An alternative idiom is:
|
||||||
|
// #ifndef R123_SOMETHING
|
||||||
|
// #define R123_SOMETHING (some boolean expression)
|
||||||
|
// #endif
|
||||||
|
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
|
||||||
|
// pp-symbols.
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_2
|
||||||
|
#ifdef __SSE4_2__
|
||||||
|
#define R123_USE_SSE4_2 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE4_2 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_1
|
||||||
|
#ifdef __SSE4_1__
|
||||||
|
#define R123_USE_SSE4_1 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE4_1 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE
|
||||||
|
#ifdef __SSE2__
|
||||||
|
#define R123_USE_SSE 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
// Unlike gcc, icc (version 12) does not pre-define an __AES__
|
||||||
|
// pp-symbol when -maes or -xHost is on the command line. This feels
|
||||||
|
// like a defect in icc (it defines __SSE4_2__ in analogous
|
||||||
|
// circumstances), but until Intel fixes it, we're better off erring
|
||||||
|
// on the side of caution and not generating instructions that are
|
||||||
|
// going to raise SIGILL when executed. To get the AES-NI
|
||||||
|
// instructions with icc, the caller must puts something like
|
||||||
|
// -DR123_USE_AES_NI=1 or -D__AES__ on the command line. FWIW, the
|
||||||
|
// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from
|
||||||
|
// 11.1 onwards.
|
||||||
|
//
|
||||||
|
#if defined(__AES__)
|
||||||
|
#define R123_USE_AES_NI ((__ICC>=1101) && 1/*defined(__AES__)*/)
|
||||||
|
#else
|
||||||
|
#define R123_USE_AES_NI ((__ICC>=1101) && 0/*defined(__AES__)*/)
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_OPENSSL
|
||||||
|
/* There isn't really a good way to tell at compile time whether
|
||||||
|
openssl is available. Without a pre-compilation configure-like
|
||||||
|
tool, it's less error-prone to guess that it isn't available. Add
|
||||||
|
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
|
||||||
|
play with openssl */
|
||||||
|
#define R123_USE_AES_OPENSSL 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_ASM_GNU
|
||||||
|
#define R123_USE_ASM_GNU 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CPUID_MSVC
|
||||||
|
#define R123_USE_CPUID_MSVC 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_X86INTRIN_H
|
||||||
|
#define R123_USE_X86INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_IA32INTRIN_H
|
||||||
|
#define R123_USE_IA32INTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_XMMINTRIN_H
|
||||||
|
#define R123_USE_XMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_EMMINTRIN_H
|
||||||
|
#define R123_USE_EMMINTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SMMINTRIN_H
|
||||||
|
#define R123_USE_SMMINTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_WMMINTRIN_H
|
||||||
|
#define R123_USE_WMMINTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_INTRIN_H
|
||||||
|
#define R123_USE_INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO16_ASM
|
||||||
|
#define R123_USE_MULHILO16_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_ASM
|
||||||
|
#define R123_USE_MULHILO32_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __STDC_CONSTANT_MACROS
|
||||||
|
#define __STDC_CONSTANT_MACROS
|
||||||
|
#endif
|
||||||
|
#include <stdint.h>
|
||||||
|
#ifndef UINT64_C
|
||||||
|
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// If you add something, it must go in all the other XXfeatures.hpp
|
||||||
|
// and in ../ut_features.cpp
|
||||||
|
#endif
|
111
external/panphasia_ho/features/metalfeatures.h
vendored
Normal file
111
external/panphasia_ho/features/metalfeatures.h
vendored
Normal file
|
@ -0,0 +1,111 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Written by Tom Schoonjans <Tom.Schoonjans@me.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __metalfeatures_dot_hpp
|
||||||
|
#define __metalfeatures_dot_hpp
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_INLINE
|
||||||
|
#define R123_STATIC_INLINE inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_FORCE_INLINE
|
||||||
|
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_METAL_THREAD_ADDRESS_SPACE
|
||||||
|
#define R123_METAL_THREAD_ADDRESS_SPACE thread
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
|
||||||
|
#define R123_METAL_CONSTANT_ADDRESS_SPACE constant
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#define R123_ASSERT(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) expr
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
|
||||||
|
#define R123_USE_MULHILO32_MULHI_INTRIN 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if R123_USE_MULHILO32_MULHI_INTRIN
|
||||||
|
#include <metal_integer>
|
||||||
|
#define R123_MULHILO32_MULHI_INTRIN metal::mulhi
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
#define R123_USE_AES_NI 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_64BIT
|
||||||
|
#define R123_USE_64BIT 0 /* Metal currently (Feb 2019, Specification-2) does not support 64-bit variable types */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ULONG_LONG
|
||||||
|
/* the longest integer type in Metal (Feb 2019, Specification-2) is a
|
||||||
|
* 32-bit unsigned int. Let's hope for the best... */
|
||||||
|
#define R123_ULONG_LONG unsigned int
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
200
external/panphasia_ho/features/msvcfeatures.h
vendored
Normal file
200
external/panphasia_ho/features/msvcfeatures.h
vendored
Normal file
|
@ -0,0 +1,200 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef __msvcfeatures_dot_hpp
|
||||||
|
#define __msvcfeatures_dot_hpp
|
||||||
|
|
||||||
|
//#if _MSVC_FULL_VER <= 15
|
||||||
|
//#error "We've only tested MSVC_FULL_VER==15."
|
||||||
|
//#endif
|
||||||
|
|
||||||
|
#if !defined(_M_IX86) && !defined(_M_X64)
|
||||||
|
# error "This code has only been tested on x86 platforms."
|
||||||
|
{ // maybe an unbalanced brace will terminate the compilation
|
||||||
|
// You are invited to try Random123 on other architectures, by changing
|
||||||
|
// the conditions that reach this error, but you should consider it a
|
||||||
|
// porting exercise and expect to encounter bugs and deficiencies.
|
||||||
|
// Please let the authors know of any successes (or failures).
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_INLINE
|
||||||
|
#define R123_STATIC_INLINE static __inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_FORCE_INLINE
|
||||||
|
#define R123_FORCE_INLINE(decl) _forceinline decl
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#include <assert.h>
|
||||||
|
#define R123_ASSERT(x) assert(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) expr
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// The basic idiom is:
|
||||||
|
// #ifndef R123_SOMETHING
|
||||||
|
// #if some condition
|
||||||
|
// #define R123_SOMETHING 1
|
||||||
|
// #else
|
||||||
|
// #define R123_SOMETHING 0
|
||||||
|
// #endif
|
||||||
|
// #endif
|
||||||
|
// This idiom allows an external user to override any decision
|
||||||
|
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
|
||||||
|
|
||||||
|
// An alternative idiom is:
|
||||||
|
// #ifndef R123_SOMETHING
|
||||||
|
// #define R123_SOMETHING (some boolean expression)
|
||||||
|
// #endif
|
||||||
|
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
|
||||||
|
// pp-symbols.
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
#if defined(_M_X64)
|
||||||
|
#define R123_USE_AES_NI 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_AES_NI 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_2
|
||||||
|
#if defined(_M_X64)
|
||||||
|
#define R123_USE_SSE4_2 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE4_2 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_1
|
||||||
|
#if defined(_M_X64)
|
||||||
|
#define R123_USE_SSE4_1 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE4_1 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE
|
||||||
|
#define R123_USE_SSE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_OPENSSL
|
||||||
|
#define R123_USE_AES_OPENSSL 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_ASM_GNU
|
||||||
|
#define R123_USE_ASM_GNU 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CPUID_MSVC
|
||||||
|
#define R123_USE_CPUID_MSVC 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_X86INTRIN_H
|
||||||
|
#define R123_USE_X86INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_IA32INTRIN_H
|
||||||
|
#define R123_USE_IA32INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_XMMINTRIN_H
|
||||||
|
#define R123_USE_XMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_EMMINTRIN_H
|
||||||
|
#define R123_USE_EMMINTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SMMINTRIN_H
|
||||||
|
#define R123_USE_SMMINTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_WMMINTRIN_H
|
||||||
|
#define R123_USE_WMMINTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_INTRIN_H
|
||||||
|
#define R123_USE_INTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO16_ASM
|
||||||
|
#define R123_USE_MULHILO16_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_ASM
|
||||||
|
#define R123_USE_MULHILO32_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||||
|
#if defined(_M_X64)
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __STDC_CONSTANT_MACROS
|
||||||
|
#define __STDC_CONSTANT_MACROS
|
||||||
|
#endif
|
||||||
|
#include <stdint.h>
|
||||||
|
#ifndef UINT64_C
|
||||||
|
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#pragma warning(disable:4244)
|
||||||
|
#pragma warning(disable:4996)
|
||||||
|
|
||||||
|
// If you add something, it must go in all the other XXfeatures.hpp
|
||||||
|
// and in ../ut_features.cpp
|
||||||
|
#endif
|
125
external/panphasia_ho/features/nvccfeatures.h
vendored
Normal file
125
external/panphasia_ho/features/nvccfeatures.h
vendored
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef __r123_nvcc_features_dot_h__
|
||||||
|
#define __r123_nvcc_features_dot_h__
|
||||||
|
|
||||||
|
#if !defined(CUDART_VERSION)
|
||||||
|
#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if CUDART_VERSION < 4010
|
||||||
|
#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h"
|
||||||
|
// This test was added in Random123-1.08 (August, 2013) because we
|
||||||
|
// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
|
||||||
|
// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
|
||||||
|
// earlier. We can't be sure this bug doesn't also affect invocations
|
||||||
|
// of other templated functions, e.g., essentially all of Random123.
|
||||||
|
// Thus, we no longer trust CUDA versions earlier than 4.1 even though
|
||||||
|
// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
|
||||||
|
// If you feel lucky or desperate, you can change #error to #warning, but
|
||||||
|
// please take extra care to be sure that you are getting correct
|
||||||
|
// results.
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// nvcc falls through to gcc or msvc. So first define
|
||||||
|
// a couple of things and then include either gccfeatures.h
|
||||||
|
// or msvcfeatures.h
|
||||||
|
|
||||||
|
//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled
|
||||||
|
//for both device and host functions in CUDA by setting compiler flags
|
||||||
|
//for the device function
|
||||||
|
#ifdef __CUDA_ARCH__
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE __device__
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_THROW
|
||||||
|
// No exceptions in CUDA, at least upto 4.0
|
||||||
|
#define R123_THROW(x) R123_ASSERT(0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#define R123_ASSERT(x) if((x)) ; else asm("trap;")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else // ! __CUDA_ARCH__
|
||||||
|
// If we're using nvcc not compiling for the CUDA architecture,
|
||||||
|
// then we must be compiling for the host. In that case,
|
||||||
|
// tell the philox code to use the mulhilo64 asm because
|
||||||
|
// nvcc doesn't grok uint128_t.
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // __CUDA_ARCH__
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) expr
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
#define R123_USE_AES_NI 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_2
|
||||||
|
#define R123_USE_SSE4_2 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_1
|
||||||
|
#define R123_USE_SSE4_1 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE
|
||||||
|
#define R123_USE_SSE 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ULONG_LONG
|
||||||
|
// uint64_t, which is what we'd get without this, is
|
||||||
|
// not the same as unsigned long long
|
||||||
|
#define R123_ULONG_LONG unsigned long long
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__GNUC__)
|
||||||
|
#include "gccfeatures.h"
|
||||||
|
#elif defined(_MSC_FULL_VER)
|
||||||
|
#include "msvcfeatures.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
50
external/panphasia_ho/features/open64features.h
vendored
Normal file
50
external/panphasia_ho/features/open64features.h
vendored
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef __open64features_dot_hpp
|
||||||
|
#define __open64features_dot_hpp
|
||||||
|
|
||||||
|
/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */
|
||||||
|
|
||||||
|
/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint
|
||||||
|
but produce incorrect code for 64-bit philox. The MULHILO64_ASM
|
||||||
|
seems to work fine */
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "gccfeatures.h"
|
||||||
|
|
||||||
|
#endif
|
89
external/panphasia_ho/features/openclfeatures.h
vendored
Normal file
89
external/panphasia_ho/features/openclfeatures.h
vendored
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef __openclfeatures_dot_hpp
|
||||||
|
#define __openclfeatures_dot_hpp
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_INLINE
|
||||||
|
#define R123_STATIC_INLINE inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_FORCE_INLINE
|
||||||
|
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#define R123_ASSERT(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) expr
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_OPENCL_INTRIN 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
#define R123_USE_AES_NI 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of
|
||||||
|
// ulong to mul_hi. And gets lots of complaints from stdint.h
|
||||||
|
// on some machines.
|
||||||
|
// But these typedefs mean we cannot include stdint.h with
|
||||||
|
// these headers? Do we need R123_64T, R123_32T, R123_8T?
|
||||||
|
typedef ulong uint64_t;
|
||||||
|
typedef uint uint32_t;
|
||||||
|
typedef uchar uint8_t;
|
||||||
|
#define UINT64_C(x) ((ulong)(x##UL))
|
||||||
|
|
||||||
|
#endif
|
194
external/panphasia_ho/features/pgccfeatures.h
vendored
Normal file
194
external/panphasia_ho/features/pgccfeatures.h
vendored
Normal file
|
@ -0,0 +1,194 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
Copyright (c) 2013, Los Alamos National Security, LLC
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Copyright 2013. Los Alamos National Security, LLC. This software was produced
|
||||||
|
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
|
||||||
|
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
|
||||||
|
the U.S. Department of Energy. The U.S. Government has rights to use,
|
||||||
|
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
|
||||||
|
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
|
||||||
|
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
|
||||||
|
to produce derivative works, such modified software should be clearly marked,
|
||||||
|
so as not to confuse it with the version available from LANL.
|
||||||
|
*/
|
||||||
|
#ifndef __pgccfeatures_dot_hpp
|
||||||
|
#define __pgccfeatures_dot_hpp
|
||||||
|
|
||||||
|
#if !defined(__x86_64__) && !defined(__i386__)
|
||||||
|
# error "This code has only been tested on x86 platforms."
|
||||||
|
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
|
||||||
|
{ /* maybe an unbalanced brace will terminate the compilation */
|
||||||
|
/* Feel free to try the Random123 library on other architectures by changing
|
||||||
|
the conditions that reach this error, but you should consider it a
|
||||||
|
porting exercise and expect to encounter bugs and deficiencies.
|
||||||
|
Please let the authors know of any successes (or failures). */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_INLINE
|
||||||
|
#define R123_STATIC_INLINE static inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Found this example in PGI's emmintrin.h. */
|
||||||
|
#ifndef R123_FORCE_INLINE
|
||||||
|
#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#include <assert.h>
|
||||||
|
#define R123_ASSERT(x) assert(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) (expr)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* PGI through 13.2 doesn't appear to support AES-NI. */
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
#define R123_USE_AES_NI 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* PGI through 13.2 appears to support MMX, SSE, SSE3, SSE3, SSSE3, SSE4a, and
|
||||||
|
ABM, but not SSE4.1 or SSE4.2. */
|
||||||
|
#ifndef R123_USE_SSE4_2
|
||||||
|
#define R123_USE_SSE4_2 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_1
|
||||||
|
#define R123_USE_SSE4_1 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE
|
||||||
|
/* There's no point in trying to compile SSE code in Random123
|
||||||
|
unless SSE2 is available. */
|
||||||
|
#ifdef __SSE2__
|
||||||
|
#define R123_USE_SSE 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_SSE 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_OPENSSL
|
||||||
|
/* There isn't really a good way to tell at compile time whether
|
||||||
|
openssl is available. Without a pre-compilation configure-like
|
||||||
|
tool, it's less error-prone to guess that it isn't available. Add
|
||||||
|
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
|
||||||
|
play with openssl */
|
||||||
|
#define R123_USE_AES_OPENSSL 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_ASM_GNU
|
||||||
|
#define R123_USE_ASM_GNU 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CPUID_MSVC
|
||||||
|
#define R123_USE_CPUID_MSVC 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_X86INTRIN_H
|
||||||
|
#define R123_USE_X86INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_IA32INTRIN_H
|
||||||
|
#define R123_USE_IA32INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time
|
||||||
|
about undefined references to _mm_castsi128_ps(__m128i). Why? */
|
||||||
|
#ifndef R123_USE_XMMINTRIN_H
|
||||||
|
#define R123_USE_XMMINTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_EMMINTRIN_H
|
||||||
|
#define R123_USE_EMMINTRIN_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SMMINTRIN_H
|
||||||
|
#define R123_USE_SMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_WMMINTRIN_H
|
||||||
|
#define R123_USE_WMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_INTRIN_H
|
||||||
|
#ifdef __ABM__
|
||||||
|
#define R123_USE_INTRIN_H 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_ASM
|
||||||
|
#define R123_USE_MULHILO32_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MULHI_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __STDC_CONSTANT_MACROS
|
||||||
|
#define __STDC_CONSTANT_MACROS
|
||||||
|
#endif
|
||||||
|
#include <stdint.h>
|
||||||
|
#ifndef UINT64_C
|
||||||
|
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* If you add something, it must go in all the other XXfeatures.hpp
|
||||||
|
and in ../ut_features.cpp */
|
||||||
|
#endif
|
280
external/panphasia_ho/features/sse.h
vendored
Normal file
280
external/panphasia_ho/features/sse.h
vendored
Normal file
|
@ -0,0 +1,280 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef _Random123_sse_dot_h__
|
||||||
|
#define _Random123_sse_dot_h__
|
||||||
|
|
||||||
|
#if R123_USE_SSE
|
||||||
|
|
||||||
|
#if R123_USE_X86INTRIN_H
|
||||||
|
#include <x86intrin.h>
|
||||||
|
#endif
|
||||||
|
#if R123_USE_IA32INTRIN_H
|
||||||
|
#include <ia32intrin.h>
|
||||||
|
#endif
|
||||||
|
#if R123_USE_XMMINTRIN_H
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
#endif
|
||||||
|
#if R123_USE_EMMINTRIN_H
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#endif
|
||||||
|
#if R123_USE_SMMINTRIN_H
|
||||||
|
#include <smmintrin.h>
|
||||||
|
#endif
|
||||||
|
#if R123_USE_WMMINTRIN_H
|
||||||
|
#include <wmmintrin.h>
|
||||||
|
#endif
|
||||||
|
#if R123_USE_INTRIN_H
|
||||||
|
#include <intrin.h>
|
||||||
|
#endif
|
||||||
|
#ifdef __cplusplus
|
||||||
|
#include <iostream>
|
||||||
|
#include <limits>
|
||||||
|
#include <stdexcept>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if R123_USE_ASM_GNU
|
||||||
|
|
||||||
|
/* bit25 of CX tells us whether AES is enabled. */
|
||||||
|
R123_STATIC_INLINE int haveAESNI(){
|
||||||
|
unsigned int eax, ebx, ecx, edx;
|
||||||
|
__asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
|
||||||
|
"a" (1));
|
||||||
|
return (ecx>>25) & 1;
|
||||||
|
}
|
||||||
|
#elif R123_USE_CPUID_MSVC
|
||||||
|
R123_STATIC_INLINE int haveAESNI(){
|
||||||
|
int CPUInfo[4];
|
||||||
|
__cpuid(CPUInfo, 1);
|
||||||
|
return (CPUInfo[2]>>25)&1;
|
||||||
|
}
|
||||||
|
#else /* R123_USE_CPUID_??? */
|
||||||
|
#warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
|
||||||
|
R123_STATIC_INLINE int haveAESNI(){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
|
||||||
|
|
||||||
|
// There is a lot of annoying and inexplicable variation in the
|
||||||
|
// SSE intrinsics available in different compilation environments.
|
||||||
|
// The details seem to depend on the compiler, the version and
|
||||||
|
// the target architecture. Rather than insisting on
|
||||||
|
// R123_USE_feature tests for each of these in each of the
|
||||||
|
// compilerfeatures.h files we just keep the complexity localized
|
||||||
|
// to here...
|
||||||
|
#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
|
||||||
|
/* Is there an intrinsic to assemble an __m128i from two 64-bit words?
|
||||||
|
If not, use the 4x32-bit intrisic instead. N.B. It looks like Intel
|
||||||
|
added _mm_set_epi64x to icc version 12.1 in Jan 2012.
|
||||||
|
*/
|
||||||
|
R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
|
||||||
|
union{
|
||||||
|
uint64_t u64;
|
||||||
|
uint32_t u32[2];
|
||||||
|
} u1, u0;
|
||||||
|
u1.u64 = v1;
|
||||||
|
u0.u64 = v0;
|
||||||
|
return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
|
||||||
|
word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job
|
||||||
|
on 64-bit platforms. Unfortunately, both MSVC and Open64 fail
|
||||||
|
assertions in ut_M128.cpp and ut_carray.cpp when we use the
|
||||||
|
_mm_cvtsi128_si64 intrinsic. (See
|
||||||
|
https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
|
||||||
|
On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
|
||||||
|
Finally, even if the intrinsic exists, it may be spelled with or
|
||||||
|
without the 'x'.
|
||||||
|
*/
|
||||||
|
#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
|
||||||
|
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
|
||||||
|
union{
|
||||||
|
uint64_t u64[2];
|
||||||
|
__m128i m;
|
||||||
|
}u;
|
||||||
|
_mm_store_si128(&u.m, si);
|
||||||
|
return u.u64[0];
|
||||||
|
}
|
||||||
|
#elif defined(__llvm__) || defined(__ICC)
|
||||||
|
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
|
||||||
|
return (uint64_t)_mm_cvtsi128_si64(si);
|
||||||
|
}
|
||||||
|
#else /* GNUC, others */
|
||||||
|
/* FWIW, gcc's emmintrin.h has had the 'x' spelling
|
||||||
|
since at least gcc-3.4.4. The no-'x' spelling showed up
|
||||||
|
around 4.2. */
|
||||||
|
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
|
||||||
|
return (uint64_t)_mm_cvtsi128_si64x(si);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(__GNUC__) && __GNUC__ < 4
|
||||||
|
/* the cast builtins showed up in gcc4. */
|
||||||
|
R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
|
||||||
|
return (__m128)si;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
|
||||||
|
struct r123m128i{
|
||||||
|
__m128i m;
|
||||||
|
#if R123_USE_CXX11_UNRESTRICTED_UNIONS
|
||||||
|
// C++98 forbids a union member from having *any* constructors.
|
||||||
|
// C++11 relaxes this, and allows union members to have constructors
|
||||||
|
// as long as there is a "trivial" default construtor. So in C++11
|
||||||
|
// we can provide a r123m128i constructor with an __m128i argument, and still
|
||||||
|
// have the default (and hence trivial) default constructor.
|
||||||
|
r123m128i() = default;
|
||||||
|
r123m128i(__m128i _m): m(_m){}
|
||||||
|
#endif
|
||||||
|
r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
|
||||||
|
r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
|
||||||
|
#if R123_USE_CXX11_EXPLICIT_CONVERSIONS
|
||||||
|
// With C++11 we can attach explicit to the bool conversion operator
|
||||||
|
// to disambiguate undesired promotions. For g++, this works
|
||||||
|
// only in 4.5 and above.
|
||||||
|
explicit operator bool() const {return _bool();}
|
||||||
|
#else
|
||||||
|
// Pre-C++11, we have to do something else. Google for the "safe bool"
|
||||||
|
// idiom for other ideas...
|
||||||
|
operator const void*() const{return _bool()?this:0;}
|
||||||
|
#endif
|
||||||
|
operator __m128i() const {return m;}
|
||||||
|
|
||||||
|
private:
|
||||||
|
#if R123_USE_SSE4_1
|
||||||
|
bool _bool() const{ return !_mm_testz_si128(m,m); }
|
||||||
|
#else
|
||||||
|
bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
|
||||||
|
__m128i& c = v.m;
|
||||||
|
__m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
|
||||||
|
c = _mm_add_epi64(c, zeroone);
|
||||||
|
//return c;
|
||||||
|
#if R123_USE_SSE4_1
|
||||||
|
__m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
|
||||||
|
if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
|
||||||
|
__m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
|
||||||
|
c = _mm_add_epi64(c, onezero);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
|
||||||
|
// The low two bits of mask are 11 iff the low 64 bits of
|
||||||
|
// c are zero.
|
||||||
|
if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
|
||||||
|
__m128i onezero = _mm_set_epi64x(1,0);
|
||||||
|
c = _mm_add_epi64(c, onezero);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
|
||||||
|
__m128i c = lhs.m;
|
||||||
|
__m128i incr128 = _mm_set_epi64x(0, n);
|
||||||
|
c = _mm_add_epi64(c, incr128);
|
||||||
|
// return c; // NO CARRY!
|
||||||
|
|
||||||
|
int64_t lo64 = _mm_extract_lo64(c);
|
||||||
|
if((uint64_t)lo64 < n)
|
||||||
|
c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
|
||||||
|
lhs.m = c;
|
||||||
|
return lhs;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We need this one because it's present, but never used in r123array1xm128i::incr
|
||||||
|
R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
|
||||||
|
throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
|
||||||
|
|
||||||
|
// The comparisons aren't implemented, but if we leave them out, and
|
||||||
|
// somebody writes, e.g., M1 < M2, the compiler will do an implicit
|
||||||
|
// conversion through void*. Sigh...
|
||||||
|
R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
|
||||||
|
throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
|
||||||
|
R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
|
||||||
|
throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
|
||||||
|
R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
|
||||||
|
throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
|
||||||
|
R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
|
||||||
|
throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
|
||||||
|
|
||||||
|
R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
|
||||||
|
return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
|
||||||
|
R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
|
||||||
|
return !(lhs==rhs);}
|
||||||
|
R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
|
||||||
|
r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
|
||||||
|
R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
|
||||||
|
return !(lhs==rhs);}
|
||||||
|
R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
|
||||||
|
union{
|
||||||
|
uint64_t u64[2];
|
||||||
|
__m128i m;
|
||||||
|
}u;
|
||||||
|
_mm_storeu_si128(&u.m, m.m);
|
||||||
|
return os << u.u64[0] << " " << u.u64[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
|
||||||
|
uint64_t u64[2];
|
||||||
|
is >> u64[0] >> u64[1];
|
||||||
|
m.m = _mm_set_epi64x(u64[1], u64[0]);
|
||||||
|
return is;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
|
||||||
|
r123m128i ret;
|
||||||
|
ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
__m128i m;
|
||||||
|
} r123m128i;
|
||||||
|
|
||||||
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
|
#else /* !R123_USE_SSE */
|
||||||
|
R123_STATIC_INLINE int haveAESNI(){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif /* R123_USE_SSE */
|
||||||
|
|
||||||
|
#endif /* _Random123_sse_dot_h__ */
|
172
external/panphasia_ho/features/sunprofeatures.h
vendored
Normal file
172
external/panphasia_ho/features/sunprofeatures.h
vendored
Normal file
|
@ -0,0 +1,172 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef __sunprofeatures_dot_hpp
|
||||||
|
#define __sunprofeatures_dot_hpp
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_INLINE
|
||||||
|
#define R123_STATIC_INLINE static inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_FORCE_INLINE
|
||||||
|
#define R123_FORCE_INLINE(decl) decl
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#include <assert.h>
|
||||||
|
#define R123_ASSERT(x) assert(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) expr
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// The basic idiom is:
|
||||||
|
// #ifndef R123_SOMETHING
|
||||||
|
// #if some condition
|
||||||
|
// #define R123_SOMETHING 1
|
||||||
|
// #else
|
||||||
|
// #define R123_SOMETHING 0
|
||||||
|
// #endif
|
||||||
|
// #endif
|
||||||
|
// This idiom allows an external user to override any decision
|
||||||
|
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
|
||||||
|
|
||||||
|
// An alternative idiom is:
|
||||||
|
// #ifndef R123_SOMETHING
|
||||||
|
// #define R123_SOMETHING (some boolean expression)
|
||||||
|
// #endif
|
||||||
|
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
|
||||||
|
// pp-symbols.
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
#define R123_USE_AES_NI 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_2
|
||||||
|
#define R123_USE_SSE4_2 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_1
|
||||||
|
#define R123_USE_SSE4_1 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE
|
||||||
|
#define R123_USE_SSE 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_OPENSSL
|
||||||
|
#define R123_USE_AES_OPENSSL 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_ASM_GNU
|
||||||
|
#define R123_USE_ASM_GNU 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CPUID_MSVC
|
||||||
|
#define R123_USE_CPUID_MSVC 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_X86INTRIN_H
|
||||||
|
#define R123_USE_X86INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_IA32INTRIN_H
|
||||||
|
#define R123_USE_IA32INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_XMMINTRIN_H
|
||||||
|
#define R123_USE_XMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_EMMINTRIN_H
|
||||||
|
#define R123_USE_EMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SMMINTRIN_H
|
||||||
|
#define R123_USE_SMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_WMMINTRIN_H
|
||||||
|
#define R123_USE_WMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_INTRIN_H
|
||||||
|
#define R123_USE_INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO16_ASM
|
||||||
|
#define R123_USE_MULHILO16_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_ASM
|
||||||
|
#define R123_USE_MULHILO32_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#define R123_USE_MULHILO64_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_PHILOX_64BIT
|
||||||
|
#define R123_USE_PHILOX_64BIT 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __STDC_CONSTANT_MACROS
|
||||||
|
#define __STDC_CONSTANT_MACROS
|
||||||
|
#endif
|
||||||
|
#include <stdint.h>
|
||||||
|
#ifndef UINT64_C
|
||||||
|
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// If you add something, it must go in all the other XXfeatures.hpp
|
||||||
|
// and in ../ut_features.cpp
|
||||||
|
#endif
|
210
external/panphasia_ho/features/xlcfeatures.h
vendored
Normal file
210
external/panphasia_ho/features/xlcfeatures.h
vendored
Normal file
|
@ -0,0 +1,210 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
Copyright (c) 2013, Los Alamos National Security, LLC
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Copyright 2013. Los Alamos National Security, LLC. This software was produced
|
||||||
|
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
|
||||||
|
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
|
||||||
|
the U.S. Department of Energy. The U.S. Government has rights to use,
|
||||||
|
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
|
||||||
|
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
|
||||||
|
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
|
||||||
|
to produce derivative works, such modified software should be clearly marked,
|
||||||
|
so as not to confuse it with the version available from LANL.
|
||||||
|
*/
|
||||||
|
#ifndef __xlcfeatures_dot_hpp
|
||||||
|
#define __xlcfeatures_dot_hpp
|
||||||
|
|
||||||
|
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
|
||||||
|
# error "This code has only been tested on x86 and PowerPC platforms."
|
||||||
|
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
|
||||||
|
{ /* maybe an unbalanced brace will terminate the compilation */
|
||||||
|
/* Feel free to try the Random123 library on other architectures by changing
|
||||||
|
the conditions that reach this error, but you should consider it a
|
||||||
|
porting exercise and expect to encounter bugs and deficiencies.
|
||||||
|
Please let the authors know of any successes (or failures). */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
/* builtins are automatically available to xlc. To use them with xlc++,
|
||||||
|
one must include builtins.h. c.f
|
||||||
|
http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html
|
||||||
|
*/
|
||||||
|
#include <builtins.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_STATIC_INLINE
|
||||||
|
#define R123_STATIC_INLINE static inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_FORCE_INLINE
|
||||||
|
#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_CUDA_DEVICE
|
||||||
|
#define R123_CUDA_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_ASSERT
|
||||||
|
#include <assert.h>
|
||||||
|
#define R123_ASSERT(x) assert(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_BUILTIN_EXPECT
|
||||||
|
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_NI
|
||||||
|
#define R123_USE_AES_NI 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_2
|
||||||
|
#define R123_USE_SSE4_2 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE4_1
|
||||||
|
#define R123_USE_SSE4_1 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SSE
|
||||||
|
#define R123_USE_SSE 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_AES_OPENSSL
|
||||||
|
/* There isn't really a good way to tell at compile time whether
|
||||||
|
openssl is available. Without a pre-compilation configure-like
|
||||||
|
tool, it's less error-prone to guess that it isn't available. Add
|
||||||
|
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
|
||||||
|
play with openssl */
|
||||||
|
#define R123_USE_AES_OPENSSL 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_GNU_UINT128
|
||||||
|
#define R123_USE_GNU_UINT128 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_ASM_GNU
|
||||||
|
#define R123_USE_ASM_GNU 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_CPUID_MSVC
|
||||||
|
#define R123_USE_CPUID_MSVC 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_X86INTRIN_H
|
||||||
|
#define R123_USE_X86INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_IA32INTRIN_H
|
||||||
|
#define R123_USE_IA32INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_XMMINTRIN_H
|
||||||
|
#define R123_USE_XMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_EMMINTRIN_H
|
||||||
|
#define R123_USE_EMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_SMMINTRIN_H
|
||||||
|
#define R123_USE_SMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_WMMINTRIN_H
|
||||||
|
#define R123_USE_WMMINTRIN_H 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_INTRIN_H
|
||||||
|
#ifdef __ABM__
|
||||||
|
#define R123_USE_INTRIN_H 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_INTRIN_H 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_ASM
|
||||||
|
#define R123_USE_MULHILO32_ASM 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
|
||||||
|
#if (defined(__powerpc64__))
|
||||||
|
#define R123_USE_MULHILO64_MULHI_INTRIN 1
|
||||||
|
#else
|
||||||
|
#define R123_USE_MULHILO64_MULHI_INTRIN 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_MULHILO64_MULHI_INTRIN
|
||||||
|
#define R123_MULHILO64_MULHI_INTRIN __mulhdu
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
|
||||||
|
#define R123_USE_MULHILO32_MULHI_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_MULHILO32_MULHI_INTRIN
|
||||||
|
#define R123_MULHILO32_MULHI_INTRIN __mulhwu
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_ASM
|
||||||
|
#if defined(__powerpc64__)
|
||||||
|
#define R123_USE_MULHILO64_ASM (1 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN))
|
||||||
|
#else
|
||||||
|
#define R123_USE_MULHILO64_ASM (0 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN))
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||||
|
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __STDC_CONSTANT_MACROS
|
||||||
|
#define __STDC_CONSTANT_MACROS
|
||||||
|
#endif
|
||||||
|
#include <stdint.h>
|
||||||
|
#ifndef UINT64_C
|
||||||
|
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* If you add something, it must go in all the other XXfeatures.hpp
|
||||||
|
and in ../ut_features.cpp */
|
||||||
|
#endif
|
1882
external/panphasia_ho/high_order_panphasia_routines.c
vendored
Normal file
1882
external/panphasia_ho/high_order_panphasia_routines.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
147
external/panphasia_ho/main.c
vendored
Normal file
147
external/panphasia_ho/main.c
vendored
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <complex.h>
|
||||||
|
#include <fftw3-mpi.h>
|
||||||
|
|
||||||
|
#include "PAN_FFTW3.h"
|
||||||
|
#include "panphasia_functions.h"
|
||||||
|
|
||||||
|
extern size_t descriptor_base_size;
|
||||||
|
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
#include <omp.h>
|
||||||
|
int threads_ok;
|
||||||
|
int number_omp_threads = 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if 0 // this is now unused since all this has been migrated to plugin!
|
||||||
|
// does the same as the main below, but does not initialise MPI or FFTW (this should be done in MONOFONIC)
|
||||||
|
int PANPHASIA_HO_main(const char *descriptor, size_t *ngrid_load)
|
||||||
|
{
|
||||||
|
int verbose = 0;
|
||||||
|
int error;
|
||||||
|
size_t x0 = 0, y0 = 0, z0 = 0;
|
||||||
|
size_t rel_level;
|
||||||
|
int fdim=1; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid
|
||||||
|
|
||||||
|
//char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
|
||||||
|
|
||||||
|
PANPHASIA_init_descriptor_(descriptor, &verbose);
|
||||||
|
|
||||||
|
printf("Descriptor %s\n ngrid_load %llu\n",descriptor,*ngrid_load);
|
||||||
|
|
||||||
|
// Choose smallest value of level to equal of exceed *ngrid_load)
|
||||||
|
|
||||||
|
for (rel_level=0; fdim*(descriptor_base_size<<(rel_level+1))<=*ngrid_load; rel_level++);
|
||||||
|
|
||||||
|
printf("Setting relative level = %llu\n",rel_level);
|
||||||
|
|
||||||
|
|
||||||
|
if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))
|
||||||
|
{
|
||||||
|
printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
|
||||||
|
abort();
|
||||||
|
};
|
||||||
|
|
||||||
|
//======================= FFTW ==============================
|
||||||
|
|
||||||
|
ptrdiff_t alloc_local, local_n0, local_0_start;
|
||||||
|
|
||||||
|
ptrdiff_t N0 = fdim*(descriptor_base_size << rel_level);
|
||||||
|
|
||||||
|
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 +2 , MPI_COMM_WORLD, &local_n0, &local_0_start);
|
||||||
|
|
||||||
|
FFTW_COMPLEX *Panphasia_White_Noise_Field;
|
||||||
|
|
||||||
|
Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
|
||||||
|
|
||||||
|
if (error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))
|
||||||
|
{
|
||||||
|
printf("Error code from PANPHASIA_compute ... %d\n", error);
|
||||||
|
};
|
||||||
|
|
||||||
|
fftw_free(Panphasia_White_Noise_Field);
|
||||||
|
|
||||||
|
return(0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef STANDALONE_PANPHASIA_HO
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
|
||||||
|
int verbose = 0;
|
||||||
|
int error;
|
||||||
|
size_t x0 = 0, y0 = 0, z0 = 0;
|
||||||
|
size_t rel_level;
|
||||||
|
char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
|
||||||
|
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
omp_set_num_threads(number_omp_threads);
|
||||||
|
int provided;
|
||||||
|
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
|
||||||
|
threads_ok = provided >= MPI_THREAD_FUNNELED;
|
||||||
|
if (threads_ok)
|
||||||
|
threads_ok = fftw_init_threads();
|
||||||
|
fftw_mpi_init();
|
||||||
|
int num_threads = number_omp_threads;
|
||||||
|
if (threads_ok)
|
||||||
|
{
|
||||||
|
fftw_plan_with_nthreads(num_threads);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
printf("Failure to initialise threads ...\n");
|
||||||
|
MPI_Finalize();
|
||||||
|
};
|
||||||
|
|
||||||
|
printf("OpenMP threads enabled with FFTW. Number of threads %d\n", fftw_planner_nthreads());
|
||||||
|
#else
|
||||||
|
MPI_Init(&argc, &argv);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
PANPHASIA_init_descriptor_(descriptor, &verbose);
|
||||||
|
|
||||||
|
rel_level = 6; //Set size of test dataset
|
||||||
|
|
||||||
|
if (error = PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose))
|
||||||
|
{
|
||||||
|
printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
|
||||||
|
abort();
|
||||||
|
};
|
||||||
|
|
||||||
|
//======================= FFTW ==============================
|
||||||
|
|
||||||
|
fftw_mpi_init();
|
||||||
|
|
||||||
|
ptrdiff_t alloc_local, local_n0, local_0_start;
|
||||||
|
|
||||||
|
ptrdiff_t N0 = descriptor_base_size << rel_level;
|
||||||
|
|
||||||
|
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0+2, MPI_COMM_WORLD, &local_n0, &local_0_start);
|
||||||
|
|
||||||
|
FFTW_COMPLEX *Panphasia_White_Noise_Field;
|
||||||
|
|
||||||
|
Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
|
||||||
|
|
||||||
|
if (error = PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field))
|
||||||
|
{
|
||||||
|
printf("Error code from PANPHASIA_compute ... %d\n", error);
|
||||||
|
};
|
||||||
|
|
||||||
|
fftw_free(Panphasia_White_Noise_Field);
|
||||||
|
|
||||||
|
fftw_mpi_cleanup();
|
||||||
|
//==================== End FFTW ===========================
|
||||||
|
|
||||||
|
MPI_Finalize();
|
||||||
|
return(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif // STANDALONE_PANPHASIA_HO
|
22
external/panphasia_ho/makefile
vendored
Normal file
22
external/panphasia_ho/makefile
vendored
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
CC = mpicc -qopenmp
|
||||||
|
G99 = gcc -fopenmp
|
||||||
|
|
||||||
|
GSL_LIBS = -lgsl -lgslcblas
|
||||||
|
|
||||||
|
#CCFLAGS = $(CFLAGS) $(GSL_LIBS) -O3 -qopt-zmm-usage=high -vec-threshold0 -lfftw3 -lfftw3f -lfftw3_omp -lfftw3_mpi -lfftw3f_mpi
|
||||||
|
CCFLAGS = $(CFLAGS) $(GSL_LIBS) -O3 -qopt-zmm-usage=high -vec-threshold0 -lfftw3 -lfftw3f -lfftw3_omp -lfftw3_mpi -lfftw3f_mpi -DUSE_OPENMP
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
pan_fftw3_test_code.x: main.o high_order_panphasia_routines.o pan_mpi_routines.o uniform_rand_threefry4x64.o
|
||||||
|
$(CC) $(CCFLAGS) *.o -o pan_fftw3_test_code.x
|
||||||
|
main.o: main.c
|
||||||
|
$(CC) $(CCFLAGS) -c main.c
|
||||||
|
high_order_panphasia_routines.o: high_order_panphasia_routines.c
|
||||||
|
$(CC) $(CCFLAGS) -c high_order_panphasia_routines.c
|
||||||
|
pan_mpi_routines.o: pan_mpi_routines.c
|
||||||
|
$(CC) $(CCFLAGS) -c pan_mpi_routines.c
|
||||||
|
uniform_rand_threefry4x64.o: uniform_rand_threefry4x64.c
|
||||||
|
$(CC) $(CCFLAGS) -c uniform_rand_threefry4x64.c
|
||||||
|
clean:
|
||||||
|
rm *.o
|
1737
external/panphasia_ho/pan_matrices_order6.h
vendored
Normal file
1737
external/panphasia_ho/pan_matrices_order6.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
533
external/panphasia_ho/pan_mpi_routines.c
vendored
Normal file
533
external/panphasia_ho/pan_mpi_routines.c
vendored
Normal file
|
@ -0,0 +1,533 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <complex.h>
|
||||||
|
#include <fftw3-mpi.h>
|
||||||
|
|
||||||
|
#include "PAN_FFTW3.h"
|
||||||
|
#include "panphasia_functions.h"
|
||||||
|
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
#include <omp.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern const int Nbasis;
|
||||||
|
extern const int irank_p[3][84];
|
||||||
|
|
||||||
|
extern size_t descriptor_order;
|
||||||
|
extern size_t descriptor_kk_limit;
|
||||||
|
extern size_t descriptor_base_size;
|
||||||
|
|
||||||
|
int PANPHASIA_compute_kspace_field_(size_t relative_level, ptrdiff_t N0_fourier_grid,
|
||||||
|
ptrdiff_t local_n0_fourier_return, ptrdiff_t local_0_start_fourier_return,
|
||||||
|
FFTW_COMPLEX *return_field)
|
||||||
|
{
|
||||||
|
|
||||||
|
size_t copy_list[Nbasis];
|
||||||
|
|
||||||
|
int pmax = 6;
|
||||||
|
|
||||||
|
int nsubdivide = 21; //(pmax%2==0)?pmax+1:pmax+2;
|
||||||
|
|
||||||
|
size_t ncopy = (pmax + 1) * (pmax + 2) * (pmax + 3) / 6;
|
||||||
|
|
||||||
|
if (ncopy % nsubdivide != 0)
|
||||||
|
return (100010);
|
||||||
|
int nchunk = ncopy / nsubdivide;
|
||||||
|
|
||||||
|
int verbose = 1;
|
||||||
|
int flag_output_mode = 2;
|
||||||
|
int error;
|
||||||
|
ptrdiff_t size_to_alloc_fourier;
|
||||||
|
ptrdiff_t size_to_alloc_pan;
|
||||||
|
ptrdiff_t local_fourier_x_start, local_fourier_x_end;
|
||||||
|
FFTW_PLAN output_coeff_forward_plan;
|
||||||
|
|
||||||
|
ptrdiff_t N0_pan_grid = descriptor_base_size << relative_level;
|
||||||
|
|
||||||
|
if (N0_fourier_grid % N0_pan_grid != 0)
|
||||||
|
return (100015);
|
||||||
|
|
||||||
|
int fdim = N0_fourier_grid / N0_pan_grid;
|
||||||
|
size_t nfft_dim = N0_fourier_grid;
|
||||||
|
size_t npan_dim = N0_pan_grid;
|
||||||
|
|
||||||
|
int SHARED_FOUR_PAN_SPACE = (nsubdivide == 1) && (fdim == 1) && (sizeof(PAN_REAL) == sizeof(FFTW_REAL));
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
if (pmax > descriptor_order)
|
||||||
|
return (100020);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < Nbasis; i++)
|
||||||
|
copy_list[i] = i;
|
||||||
|
|
||||||
|
//printf("Dimensions of FT (%td,%td,%td)\n",N0_fourier_grid,N0_fourier_grid,N0_fourier_grid);
|
||||||
|
//printf("Dimensions of PG (%td,%td,%td)\n",N0_pan_grid,N0_pan_grid,N0_pan_grid);
|
||||||
|
//printf("local_no %td local_0_start_fourier %td\n",local_n0_fourier_return, local_0_start_fourier_return);
|
||||||
|
|
||||||
|
// Compute 1-D Spherical Bessel coefficients for each order //////////////////
|
||||||
|
// These are needed for the convolutions below //////////////////
|
||||||
|
size_t n4dimen;
|
||||||
|
|
||||||
|
n4dimen = (nfft_dim % 4 == 0) ? 4 * (nfft_dim / 4) + 4 : 4 * (nfft_dim / 4) + 5;
|
||||||
|
|
||||||
|
double complex *sph_bessel_coeff = FFTW_MALLOC(sizeof(double complex) * n4dimen * (pmax + 1));
|
||||||
|
|
||||||
|
if (sph_bessel_coeff == NULL)
|
||||||
|
return (100030);
|
||||||
|
|
||||||
|
compute_sph_bessel_coeffs(nfft_dim, pmax, n4dimen, fdim, sph_bessel_coeff);
|
||||||
|
|
||||||
|
//printf("Reached here! ndimen4 %ld\n",n4dimen);
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Determine sizes of Fourier and Panphasia coefficient 3-D arrays //
|
||||||
|
|
||||||
|
ptrdiff_t local_n0_pan;
|
||||||
|
ptrdiff_t local_0_start_pan;
|
||||||
|
{
|
||||||
|
|
||||||
|
int rank = 3;
|
||||||
|
|
||||||
|
ptrdiff_t local_n0_fourier;
|
||||||
|
ptrdiff_t local_0_start_fourier;
|
||||||
|
const ptrdiff_t ndimens_alloc_fourier[] = {N0_fourier_grid, N0_fourier_grid, N0_fourier_grid + 2}; // Allocated for r2c
|
||||||
|
ptrdiff_t howmany = nchunk;
|
||||||
|
size_to_alloc_fourier = FFTW_MPI_LOCAL_SIZE_MANY(rank, ndimens_alloc_fourier, howmany,
|
||||||
|
FFTW_MPI_DEFAULT_BLOCK, MPI_COMM_WORLD,
|
||||||
|
&local_n0_fourier, &local_0_start_fourier);
|
||||||
|
|
||||||
|
if (local_0_start_fourier != local_0_start_fourier_return)
|
||||||
|
{
|
||||||
|
printf("Error local_0_start_fourier!=local_0_start_fourier_return\n");
|
||||||
|
return (100032);
|
||||||
|
};
|
||||||
|
|
||||||
|
if (local_n0_fourier != local_n0_fourier_return)
|
||||||
|
{
|
||||||
|
printf("Error local_n0_fourier!=local_n0_fourier_return\n");
|
||||||
|
return (100033);
|
||||||
|
};
|
||||||
|
|
||||||
|
ptrdiff_t abs_fourier_x_start, abs_fourier_x_end;
|
||||||
|
|
||||||
|
if (local_0_start_fourier_return % fdim == 0)
|
||||||
|
{
|
||||||
|
abs_fourier_x_start = local_0_start_fourier_return;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
abs_fourier_x_start = local_0_start_fourier_return + fdim - (local_0_start_fourier_return % fdim);
|
||||||
|
};
|
||||||
|
|
||||||
|
if ((local_0_start_fourier_return + local_n0_fourier_return - 1) % fdim == 0)
|
||||||
|
{
|
||||||
|
abs_fourier_x_end = local_0_start_fourier_return + local_n0_fourier_return - 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
abs_fourier_x_end = (local_0_start_fourier_return + local_n0_fourier_return - 1) -
|
||||||
|
((local_0_start_fourier_return + local_n0_fourier_return - 1) % fdim);
|
||||||
|
};
|
||||||
|
|
||||||
|
if ((abs_fourier_x_end - abs_fourier_x_start) % fdim != 0)
|
||||||
|
return (100036);
|
||||||
|
|
||||||
|
local_0_start_pan = abs_fourier_x_start / fdim;
|
||||||
|
local_n0_pan = 1 + (abs_fourier_x_end - abs_fourier_x_start) / fdim;
|
||||||
|
|
||||||
|
local_fourier_x_start = abs_fourier_x_start - local_0_start_fourier_return;
|
||||||
|
local_fourier_x_end = abs_fourier_x_end - local_0_start_fourier_return;
|
||||||
|
|
||||||
|
// const ptrdiff_t ndimens_alloc_pan[] = {N0_pan_grid, N0_pan_grid, N0_pan_grid+2};
|
||||||
|
howmany = ncopy;
|
||||||
|
|
||||||
|
size_to_alloc_pan = howmany * local_n0_pan * N0_pan_grid * (N0_pan_grid + 2);
|
||||||
|
|
||||||
|
//printf("size_to_alloc_fourier = %td\n",size_to_alloc_fourier);
|
||||||
|
//printf("size_to_alloc_pan = %td\n",size_to_alloc_pan);
|
||||||
|
};
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Allocate arrays to store Panphasia coefficients and Fourier information
|
||||||
|
// If nsubdivide==1 then use the same structure to store both.
|
||||||
|
|
||||||
|
void *panphasia_coefficients = FFTW_MALLOC(sizeof(PAN_REAL) * size_to_alloc_pan);
|
||||||
|
void *fourier_grids;
|
||||||
|
|
||||||
|
if (panphasia_coefficients == NULL)
|
||||||
|
return (100040);
|
||||||
|
|
||||||
|
void *mode_weightings = FFTW_MALLOC(sizeof(FFTW_REAL) * size_to_alloc_fourier / nchunk);
|
||||||
|
|
||||||
|
FFTW_REAL *ptr_mode_weightings;
|
||||||
|
ptr_mode_weightings = mode_weightings;
|
||||||
|
memset(ptr_mode_weightings, 0, sizeof(FFTW_REAL) * size_to_alloc_fourier / nchunk);
|
||||||
|
|
||||||
|
FFTW_REAL *ptr_real_fourier_grid;
|
||||||
|
FFTW_REAL *ptr_panphasia_coefficients = panphasia_coefficients;
|
||||||
|
|
||||||
|
FFTW_COMPLEX *ptr_cmplx_fourier_grid;
|
||||||
|
|
||||||
|
if (SHARED_FOUR_PAN_SPACE)
|
||||||
|
{
|
||||||
|
ptr_real_fourier_grid = panphasia_coefficients;
|
||||||
|
ptr_cmplx_fourier_grid = panphasia_coefficients;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
|
||||||
|
fourier_grids = FFTW_MALLOC(sizeof(FFTW_REAL) * size_to_alloc_fourier);
|
||||||
|
|
||||||
|
if (fourier_grids == NULL)
|
||||||
|
return (100041);
|
||||||
|
|
||||||
|
ptr_real_fourier_grid = fourier_grids;
|
||||||
|
ptr_cmplx_fourier_grid = fourier_grids;
|
||||||
|
};
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//////////// Compute the Panphasia coefficients ////////////////////////////////
|
||||||
|
|
||||||
|
{
|
||||||
|
|
||||||
|
size_t xorigin = local_0_start_pan, yorigin = 0, zorigin = 0;
|
||||||
|
|
||||||
|
size_t xextent = local_n0_pan, yextent = N0_pan_grid, zextent = N0_pan_grid;
|
||||||
|
|
||||||
|
if ((error = PANPHASIA_compute_coefficients_(&xorigin, &yorigin, &zorigin, &xextent, &yextent,
|
||||||
|
&zextent, copy_list, &ncopy, panphasia_coefficients,
|
||||||
|
&flag_output_mode, &verbose)))
|
||||||
|
return (100100 + error);
|
||||||
|
};
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
////////// Output diagnostics for small grids only
|
||||||
|
|
||||||
|
{
|
||||||
|
|
||||||
|
if (N0_pan_grid < -1)
|
||||||
|
{
|
||||||
|
|
||||||
|
int rank;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||||
|
char filename[100];
|
||||||
|
sprintf(filename, "output_real_space_field.%d", rank);
|
||||||
|
|
||||||
|
FILE *fp;
|
||||||
|
printf("local_n0_pan %ld npan_dim %ld N0_pan_grid %ld\n", local_n0_pan,
|
||||||
|
npan_dim, N0_pan_grid);
|
||||||
|
|
||||||
|
fp = fopen(filename, "w");
|
||||||
|
|
||||||
|
for (int ix = 0; ix < local_n0_pan; ix++)
|
||||||
|
for (int iy = 0; iy < npan_dim; iy++)
|
||||||
|
for (int iz = 0; iz < npan_dim; iz++)
|
||||||
|
{
|
||||||
|
int index = ix * N0_pan_grid * (N0_pan_grid + 2) + iy * (N0_pan_grid + 2) + iz;
|
||||||
|
fprintf(fp, "%6lu%6d%6d %14.8lf %d\n", ix + local_0_start_pan, iy, iz,
|
||||||
|
ptr_panphasia_coefficients[index], index);
|
||||||
|
};
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
//----------------------------------------------------------------------------------
|
||||||
|
////////////// Set up FTTW plan //////////////////////////////////////////////////////
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// printf("Making the plan ... \n");
|
||||||
|
|
||||||
|
//////////////////// Make plan for ncopy interleaved FTs ///////////////////////////
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
{
|
||||||
|
int rank = 3;
|
||||||
|
const ptrdiff_t ndimens[3] = {N0_fourier_grid, N0_fourier_grid, N0_fourier_grid};
|
||||||
|
ptrdiff_t howmany = nchunk;
|
||||||
|
ptrdiff_t block = FFTW_MPI_DEFAULT_BLOCK;
|
||||||
|
ptrdiff_t tblock = FFTW_MPI_DEFAULT_BLOCK;
|
||||||
|
unsigned flags = FFTW_ESTIMATE;
|
||||||
|
|
||||||
|
output_coeff_forward_plan = FFTW_MPI_PLAN_MANY_DTF_R2C(rank, ndimens,
|
||||||
|
howmany, block, tblock,
|
||||||
|
ptr_real_fourier_grid, ptr_cmplx_fourier_grid,
|
||||||
|
MPI_COMM_WORLD, flags);
|
||||||
|
if (output_coeff_forward_plan == NULL)
|
||||||
|
{
|
||||||
|
printf("Null plan\n");
|
||||||
|
return (100051);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
//printf("Plan completed ... \n");
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
//----------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
memset(return_field, 0, local_n0_fourier_return * N0_fourier_grid * (N0_fourier_grid + 2) * sizeof(FFTW_REAL));
|
||||||
|
|
||||||
|
for (int iter = 0; iter < nsubdivide; iter++)
|
||||||
|
{
|
||||||
|
|
||||||
|
int moffset = iter * nchunk;
|
||||||
|
|
||||||
|
if (!SHARED_FOUR_PAN_SPACE)
|
||||||
|
{
|
||||||
|
|
||||||
|
memset(ptr_real_fourier_grid, 0, sizeof(FFTW_REAL) * size_to_alloc_fourier);
|
||||||
|
|
||||||
|
// Copy Panphasia coefficients to Fourier grid with appropriate stride - fdim
|
||||||
|
|
||||||
|
size_t index_p, index_f;
|
||||||
|
int m;
|
||||||
|
|
||||||
|
for (int ix_f = local_fourier_x_start, ix_p = 0; ix_f <= local_fourier_x_end; ix_f += fdim, ix_p++)
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
#pragma omp parallel for collapse(2) private(index_p, index_f, m)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (int iy_f = 0, iy_p = 0; iy_f < nfft_dim; iy_f += fdim, iy_p++)
|
||||||
|
for (int iz_f = 0, iz_p = 0; iz_f < nfft_dim; iz_f += fdim, iz_p++)
|
||||||
|
{
|
||||||
|
index_p = ix_p * N0_pan_grid * (N0_pan_grid + 2) + iy_p * (N0_pan_grid + 2) + iz_p;
|
||||||
|
index_f = ix_f * N0_fourier_grid * (N0_fourier_grid + 2) + iy_f * (N0_fourier_grid + 2) + iz_f;
|
||||||
|
|
||||||
|
for (m = 0; m < nchunk; m++)
|
||||||
|
{
|
||||||
|
ptr_real_fourier_grid[nchunk * index_f + m] =
|
||||||
|
ptr_panphasia_coefficients[ncopy * index_p + moffset + m];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
FFTW_MPI_EXECUTE_DFT_R2C(output_coeff_forward_plan,
|
||||||
|
ptr_real_fourier_grid, ptr_cmplx_fourier_grid);
|
||||||
|
|
||||||
|
{
|
||||||
|
// Convolve and combine the FT of the Panphasia coefficient field
|
||||||
|
|
||||||
|
size_t index1, index2;
|
||||||
|
complex weight;
|
||||||
|
//int m;
|
||||||
|
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
#pragma omp parallel for collapse(3) private(index1, index2, weight, m)
|
||||||
|
#endif
|
||||||
|
for (int ix = 0; ix < local_n0_fourier_return; ix++)
|
||||||
|
for (int iy = 0; iy < nfft_dim; iy++)
|
||||||
|
for (int iz = 0; iz <= nfft_dim / 2; iz++)
|
||||||
|
{
|
||||||
|
index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
|
||||||
|
for (int m = 0; m < nchunk; m++)
|
||||||
|
{
|
||||||
|
index2 = nchunk * index1 + m;
|
||||||
|
|
||||||
|
weight = sph_bessel_coeff[n4dimen * irank_p[0][m + moffset] + ix + local_0_start_fourier_return] *
|
||||||
|
sph_bessel_coeff[n4dimen * irank_p[1][m + moffset] + iy] *
|
||||||
|
sph_bessel_coeff[n4dimen * irank_p[2][m + moffset] + iz];
|
||||||
|
return_field[index1] += weight * ptr_cmplx_fourier_grid[index2];
|
||||||
|
ptr_mode_weightings[index1] += cabs(weight) * cabs(weight);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
}; // End loop over iter
|
||||||
|
|
||||||
|
// printf("Reached here 10!\n");
|
||||||
|
|
||||||
|
//Add phase shift and normalise field
|
||||||
|
{
|
||||||
|
|
||||||
|
double complex phase_shift_and_scale;
|
||||||
|
int kx, ky, kz;
|
||||||
|
const double pi = 4.0 * atan(1.0);
|
||||||
|
size_t index1;
|
||||||
|
|
||||||
|
double min_weight = 100.0;
|
||||||
|
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
#pragma omp parallel for collapse(3) private(index1, kx, ky, kz, phase_shift_and_scale)
|
||||||
|
#endif
|
||||||
|
for (int ix = 0; ix < local_n0_fourier_return; ix++)
|
||||||
|
for (int iy = 0; iy < nfft_dim; iy++)
|
||||||
|
for (int iz = 0; iz <= nfft_dim / 2; iz++)
|
||||||
|
{
|
||||||
|
index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
|
||||||
|
kx = (ix + local_0_start_fourier_return > nfft_dim / 2) ? ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return;
|
||||||
|
ky = (iy > nfft_dim / 2) ? iy - nfft_dim : iy;
|
||||||
|
kz = (iz > nfft_dim / 2) ? iz - nfft_dim : iz;
|
||||||
|
|
||||||
|
if ((kx == nfft_dim / 2) || (ky == nfft_dim / 2) || (kz == nfft_dim / 2))
|
||||||
|
{
|
||||||
|
// Set Nyquist modes to zero - not used by IC_Gen anyway.
|
||||||
|
phase_shift_and_scale = 0.0; //1.0/pow((double)nfft_dim,1.5); // No phase shift
|
||||||
|
ptr_mode_weightings[index1] = 0.0; // Set squared weight to zero
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
phase_shift_and_scale = sqrt((double)(fdim * fdim * fdim)) *
|
||||||
|
cexp((double)fdim * (-I) * pi * (double)(kx + ky + kz) / sqrt(ptr_mode_weightings[index1]) /
|
||||||
|
(double)nfft_dim) /
|
||||||
|
pow((double)nfft_dim, 1.5);
|
||||||
|
};
|
||||||
|
|
||||||
|
return_field[index1] *= phase_shift_and_scale;
|
||||||
|
if (ptr_mode_weightings[index1] < min_weight)
|
||||||
|
min_weight = ptr_mode_weightings[index1];
|
||||||
|
};
|
||||||
|
|
||||||
|
// printf("Minimum weight %lf\n",sqrt(min_weight));
|
||||||
|
};
|
||||||
|
|
||||||
|
// printf("Reached here 11!\n");
|
||||||
|
|
||||||
|
// Rescale selected Fourier modes to unit amplitude.
|
||||||
|
// By default this part is not executed.
|
||||||
|
|
||||||
|
if (descriptor_kk_limit > 0)
|
||||||
|
{
|
||||||
|
size_t index1;
|
||||||
|
complex weight;
|
||||||
|
size_t ksquared;
|
||||||
|
int kx, ky, kz;
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
#pragma omp parallel for collapse(3) private(index1, kx, ky, kz, ksquared, weight)
|
||||||
|
#endif
|
||||||
|
for (int ix = 0; ix < local_n0_fourier_return; ix++)
|
||||||
|
for (int iy = 0; iy < nfft_dim; iy++)
|
||||||
|
for (int iz = 0; iz <= nfft_dim / 2; iz++)
|
||||||
|
{
|
||||||
|
kx = (ix + local_0_start_fourier_return > nfft_dim / 2) ? ix + local_0_start_fourier_return - nfft_dim : ix + local_0_start_fourier_return;
|
||||||
|
ky = (iy > nfft_dim / 2) ? iy - nfft_dim : iy;
|
||||||
|
kz = (iz > nfft_dim / 2) ? iz - nfft_dim : iz;
|
||||||
|
ksquared = kx * kx + ky * ky + kz * kz;
|
||||||
|
if ((kx != nfft_dim / 2) && (ky != nfft_dim / 2) && (kz != nfft_dim / 2))
|
||||||
|
{ //Omit Nyquist modes
|
||||||
|
|
||||||
|
if ((ksquared <= descriptor_kk_limit) && (ksquared != 0))
|
||||||
|
{
|
||||||
|
index1 = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
|
||||||
|
weight = cabs(return_field[index1]);
|
||||||
|
return_field[index1] /= weight;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
//printf("Reached here 12!\n");
|
||||||
|
|
||||||
|
int rank;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||||
|
char filename[100];
|
||||||
|
sprintf(filename, "output_k_space_alt.%d", rank);
|
||||||
|
FILE *fp;
|
||||||
|
|
||||||
|
if (nfft_dim < -1)
|
||||||
|
{
|
||||||
|
|
||||||
|
FILE *fp;
|
||||||
|
|
||||||
|
fp = fopen(filename, "w");
|
||||||
|
|
||||||
|
for (int ix = 0; ix < local_n0_fourier_return; ix++)
|
||||||
|
for (int iy = 0; iy < nfft_dim; iy++)
|
||||||
|
for (int iz = 0; iz <= nfft_dim / 2; iz++)
|
||||||
|
{
|
||||||
|
|
||||||
|
int index = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
|
||||||
|
fprintf(fp, "%6lu%6d%6d %14.8lf %14.8lf %14.8lf \n", ix + local_0_start_fourier_return, iy, iz,
|
||||||
|
creal(return_field[index]), cimag(return_field[index]), sqrt(ptr_mode_weightings[index]));
|
||||||
|
// ptr_mode_weightings[index]);
|
||||||
|
};
|
||||||
|
fclose(fp);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
|
||||||
|
fp = fopen(filename, "w");
|
||||||
|
|
||||||
|
for (int ix = 0; ix < local_n0_fourier_return; ix++)
|
||||||
|
for (int iy = 0; iy < nfft_dim; iy++)
|
||||||
|
for (int iz = 0; iz <= nfft_dim / 2; iz++)
|
||||||
|
{
|
||||||
|
if (ix + iy + iz + local_0_start_fourier_return < 100)
|
||||||
|
{
|
||||||
|
int index = ix * N0_fourier_grid * (N0_fourier_grid / 2 + 1) + iy * (N0_fourier_grid / 2 + 1) + iz;
|
||||||
|
fprintf(fp, "%6lu%6d%6d %14.8lf %14.8lf %14.8lf \n", ix + local_0_start_fourier_return, iy, iz,
|
||||||
|
creal(return_field[index]), cimag(return_field[index]), sqrt(ptr_mode_weightings[index]));
|
||||||
|
// ptr_mode_weightings[index]);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
fclose(fp);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Transpose output field
|
||||||
|
|
||||||
|
{
|
||||||
|
|
||||||
|
FFTW_PLAN transpose_output_plan;
|
||||||
|
unsigned flags = FFTW_ESTIMATE;
|
||||||
|
|
||||||
|
void *ptr_inter = return_field;
|
||||||
|
|
||||||
|
FFTW_REAL *ptr_return_as_real_field;
|
||||||
|
ptr_return_as_real_field = ptr_inter;
|
||||||
|
|
||||||
|
// int rank = 2;
|
||||||
|
|
||||||
|
// ptrdiff_t size_to_transpose;
|
||||||
|
ptrdiff_t howmany = N0_fourier_grid + 2;
|
||||||
|
// ptrdiff_t local_n0, local_0_start;
|
||||||
|
// ptrdiff_t local_n1, local_1_start;
|
||||||
|
|
||||||
|
// const ptrdiff_t ndimens[] = {N0_fourier_grid, N0_fourier_grid};
|
||||||
|
|
||||||
|
// size_to_transpose = FFTW_MPI_LOCAL_SIZE_MANY_TRANSPOSED(rank, ndimens, howmany,
|
||||||
|
// FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
|
||||||
|
// MPI_COMM_WORLD,&local_n0, &local_0_start,&local_n1, &local_1_start);
|
||||||
|
|
||||||
|
// printf("size_to_transpose = %td\n",size_to_transpose);
|
||||||
|
|
||||||
|
transpose_output_plan = FFTW_MPI_PLAN_MANY_TRANSPOSE(N0_fourier_grid, N0_fourier_grid,
|
||||||
|
howmany, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
|
||||||
|
ptr_return_as_real_field, ptr_return_as_real_field,
|
||||||
|
MPI_COMM_WORLD, flags);
|
||||||
|
|
||||||
|
//printf("Transpose plan completed.\n");
|
||||||
|
|
||||||
|
FFTW_MPI_EXECUTE_R2R(transpose_output_plan, ptr_return_as_real_field, ptr_return_as_real_field);
|
||||||
|
|
||||||
|
//printf("Transpose completed.\n");
|
||||||
|
};
|
||||||
|
|
||||||
|
// Free all memory assigned by FFTW_MALLOC
|
||||||
|
FFTW_FREE(mode_weightings);
|
||||||
|
FFTW_FREE(sph_bessel_coeff);
|
||||||
|
FFTW_FREE(panphasia_coefficients);
|
||||||
|
|
||||||
|
if (!SHARED_FOUR_PAN_SPACE)
|
||||||
|
FFTW_FREE(fourier_grids);
|
||||||
|
|
||||||
|
FFTW_DESTROY_PLAN(output_coeff_forward_plan);
|
||||||
|
|
||||||
|
//printf("Reached end of PANPHASIA_compute_kspace_field_\n");
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
//==========================================================================================
|
||||||
|
//==========================================================================================
|
99
external/panphasia_ho/panphasia_functions.h
vendored
Normal file
99
external/panphasia_ho/panphasia_functions.h
vendored
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// By default Panphasia is computed at single
|
||||||
|
// precision. To override this define PAN_DOUBLE
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#ifndef USE_PRECISION_FLOAT
|
||||||
|
#define PAN_DOUBLE_PRECISION 8
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef PAN_DOUBLE_PRECISION
|
||||||
|
|
||||||
|
#define PAN_REAL float
|
||||||
|
#define PAN_COMPLEX float complex
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define PAN_REAL double
|
||||||
|
#define PAN_COMPLEX double complex
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#include "PAN_FFTW3.h"
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void return_uniform_pseudo_rands_threefry4x64_(size_t l,size_t j1,size_t j2,size_t j3,
|
||||||
|
PAN_REAL *panphasia_randoms, size_t seed_value,
|
||||||
|
size_t allow_non_zero_seed_safety_catch);
|
||||||
|
|
||||||
|
void box_muller_(PAN_REAL *unif_real,PAN_REAL *gvar);
|
||||||
|
|
||||||
|
void solve_panphasia_cell_(PAN_REAL *input_vec_parent, PAN_REAL *input_vec_children, PAN_REAL *output_cell_vec, int control_flag);
|
||||||
|
|
||||||
|
void threefry4x64_test_(int verbose);
|
||||||
|
void inverse_threefry4x64_test_(int verbose);
|
||||||
|
void set_panphasia_key_(int verbose);
|
||||||
|
void check_panphasia_key_(int verbose);
|
||||||
|
|
||||||
|
void PANPHASIA_init_descriptor_checks();
|
||||||
|
|
||||||
|
void speed_test_();
|
||||||
|
void speed_test2_();
|
||||||
|
void check_randoms_();
|
||||||
|
void test_random_dist_(size_t shift);
|
||||||
|
void compute_all_properties_of_a_panphasia_cell_(size_t *level, size_t *j1, size_t *j2, size_t *j3,
|
||||||
|
PAN_REAL *gauss_rand_parent, PAN_REAL *legendre_rand);
|
||||||
|
void return_root_legendre_coefficients_(PAN_REAL *root);
|
||||||
|
|
||||||
|
|
||||||
|
int parse_and_validate_descriptor_(const char *, int *);
|
||||||
|
int demo_descriptor_();
|
||||||
|
long long int compute_check_digit_();
|
||||||
|
int PANPHASIA_init_descriptor_(const char *descriptor, int *verbose);
|
||||||
|
int PANPHASIA_init_level_(size_t *oct_level, size_t *rel_orig_x, size_t *rel_orig_y,size_t *rel_orig_z,int *verbose);
|
||||||
|
|
||||||
|
|
||||||
|
int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t*zstart,
|
||||||
|
size_t *xextent, size_t *yextent, size_t *zextend,
|
||||||
|
size_t *copy_list,
|
||||||
|
size_t *ncopy, void *output_values, int *flag_output_mode, int *verbose);
|
||||||
|
|
||||||
|
void test_moments_();
|
||||||
|
void test_propogation_of_moments_(int iterations);
|
||||||
|
void test_cell_moments(const char*,size_t, size_t, size_t, size_t, size_t, double *);
|
||||||
|
|
||||||
|
void spherical_bessel_(int *, double *, double *);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel,size_t *xabs, size_t *yabs,size_t *zabs);
|
||||||
|
|
||||||
|
int cell_information(size_t cell_id, size_t *cumulative_cell_index, size_t *cuboid_x_dimen,
|
||||||
|
size_t *cuboid_y_dimen,size_t *cuboid_z_dimen, size_t *cell_lev,
|
||||||
|
size_t *cell_x, size_t *cell_y, size_t *cell_z, size_t number_children,
|
||||||
|
size_t *child_cell_indices);
|
||||||
|
|
||||||
|
int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinates,
|
||||||
|
size_t extent, size_t *return_tree_list_coordinates,
|
||||||
|
size_t nreturn,
|
||||||
|
long long int *child_pointer, size_t *level_count, size_t *level_begin, size_t *index_perm);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
void compute_sph_bessel_coeffs(int, int, int, int, std::complex<double>* *);
|
||||||
|
#else
|
||||||
|
void compute_sph_bessel_coeffs(int, int, int, int, double complex *);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int PANPHASIA_compute_kspace_field_(size_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, FFTW_COMPLEX *);
|
874
external/panphasia_ho/threefry.h
vendored
Normal file
874
external/panphasia_ho/threefry.h
vendored
Normal file
|
@ -0,0 +1,874 @@
|
||||||
|
/*
|
||||||
|
Copyright 2010-2011, D. E. Shaw Research.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of D. E. Shaw Research nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#ifndef _threefry_dot_h_
|
||||||
|
#define _threefry_dot_h_
|
||||||
|
#include "features/compilerfeatures.h"
|
||||||
|
#include "array.h"
|
||||||
|
|
||||||
|
/** \cond HIDDEN_FROM_DOXYGEN */
|
||||||
|
/* Significant parts of this file were copied from
|
||||||
|
from:
|
||||||
|
Skein_FinalRnd/ReferenceImplementation/skein.h
|
||||||
|
Skein_FinalRnd/ReferenceImplementation/skein_block.c
|
||||||
|
|
||||||
|
in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
|
||||||
|
|
||||||
|
This file has been modified so that it may no longer perform its originally
|
||||||
|
intended function. If you're looking for a Skein or Threefish source code,
|
||||||
|
please consult the original file.
|
||||||
|
|
||||||
|
The original file had the following header:
|
||||||
|
**************************************************************************
|
||||||
|
**
|
||||||
|
** Interface declarations and internal definitions for Skein hashing.
|
||||||
|
**
|
||||||
|
** Source code author: Doug Whiting, 2008.
|
||||||
|
**
|
||||||
|
** This algorithm and source code is released to the public domain.
|
||||||
|
**
|
||||||
|
***************************************************************************
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* See comment at the top of philox.h for the macro pre-process
|
||||||
|
strategy. */
|
||||||
|
|
||||||
|
/* Rotation constants: */
|
||||||
|
enum r123_enum_threefry64x4 {
|
||||||
|
/* These are the R_256 constants from the Threefish reference sources
|
||||||
|
with names changed to R_64x4... */
|
||||||
|
R_64x4_0_0=14, R_64x4_0_1=16,
|
||||||
|
R_64x4_1_0=52, R_64x4_1_1=57,
|
||||||
|
R_64x4_2_0=23, R_64x4_2_1=40,
|
||||||
|
R_64x4_3_0= 5, R_64x4_3_1=37,
|
||||||
|
R_64x4_4_0=25, R_64x4_4_1=33,
|
||||||
|
R_64x4_5_0=46, R_64x4_5_1=12,
|
||||||
|
R_64x4_6_0=58, R_64x4_6_1=22,
|
||||||
|
R_64x4_7_0=32, R_64x4_7_1=32
|
||||||
|
};
|
||||||
|
|
||||||
|
enum r123_enum_threefry64x2 {
|
||||||
|
/*
|
||||||
|
// Output from skein_rot_search: (srs64_B64-X1000)
|
||||||
|
// Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
|
||||||
|
// Start: Tue Mar 1 10:07:48 2011
|
||||||
|
// rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
|
||||||
|
*/
|
||||||
|
R_64x2_0_0=16,
|
||||||
|
R_64x2_1_0=42,
|
||||||
|
R_64x2_2_0=12,
|
||||||
|
R_64x2_3_0=31,
|
||||||
|
R_64x2_4_0=16,
|
||||||
|
R_64x2_5_0=32,
|
||||||
|
R_64x2_6_0=24,
|
||||||
|
R_64x2_7_0=21
|
||||||
|
/* 4 rounds: minHW = 4 [ 4 4 4 4 ]
|
||||||
|
// 5 rounds: minHW = 8 [ 8 8 8 8 ]
|
||||||
|
// 6 rounds: minHW = 16 [ 16 16 16 16 ]
|
||||||
|
// 7 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||||
|
// 8 rounds: minHW = 64 [ 64 64 64 64 ]
|
||||||
|
// 9 rounds: minHW = 64 [ 64 64 64 64 ]
|
||||||
|
//10 rounds: minHW = 64 [ 64 64 64 64 ]
|
||||||
|
//11 rounds: minHW = 64 [ 64 64 64 64 ] */
|
||||||
|
};
|
||||||
|
|
||||||
|
enum r123_enum_threefry32x4 {
|
||||||
|
/* Output from skein_rot_search: (srs-B128-X5000.out)
|
||||||
|
// Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
|
||||||
|
// Start: Mon Aug 24 22:41:36 2009
|
||||||
|
// ...
|
||||||
|
// rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
|
||||||
|
R_32x4_0_0=10, R_32x4_0_1=26,
|
||||||
|
R_32x4_1_0=11, R_32x4_1_1=21,
|
||||||
|
R_32x4_2_0=13, R_32x4_2_1=27,
|
||||||
|
R_32x4_3_0=23, R_32x4_3_1= 5,
|
||||||
|
R_32x4_4_0= 6, R_32x4_4_1=20,
|
||||||
|
R_32x4_5_0=17, R_32x4_5_1=11,
|
||||||
|
R_32x4_6_0=25, R_32x4_6_1=10,
|
||||||
|
R_32x4_7_0=18, R_32x4_7_1=20
|
||||||
|
|
||||||
|
/* 4 rounds: minHW = 3 [ 3 3 3 3 ]
|
||||||
|
// 5 rounds: minHW = 7 [ 7 7 7 7 ]
|
||||||
|
// 6 rounds: minHW = 12 [ 13 12 13 12 ]
|
||||||
|
// 7 rounds: minHW = 22 [ 22 23 22 23 ]
|
||||||
|
// 8 rounds: minHW = 31 [ 31 31 31 31 ]
|
||||||
|
// 9 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||||
|
//10 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||||
|
//11 rounds: minHW = 32 [ 32 32 32 32 ] */
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
enum r123_enum_threefry32x2 {
|
||||||
|
/* Output from skein_rot_search (srs32x2-X5000.out)
|
||||||
|
// Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
|
||||||
|
// Start: Tue Jul 12 11:11:33 2011
|
||||||
|
// rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
|
||||||
|
R_32x2_0_0=13,
|
||||||
|
R_32x2_1_0=15,
|
||||||
|
R_32x2_2_0=26,
|
||||||
|
R_32x2_3_0= 6,
|
||||||
|
R_32x2_4_0=17,
|
||||||
|
R_32x2_5_0=29,
|
||||||
|
R_32x2_6_0=16,
|
||||||
|
R_32x2_7_0=24
|
||||||
|
|
||||||
|
/* 4 rounds: minHW = 4 [ 4 4 4 4 ]
|
||||||
|
// 5 rounds: minHW = 6 [ 6 8 6 8 ]
|
||||||
|
// 6 rounds: minHW = 9 [ 9 12 9 12 ]
|
||||||
|
// 7 rounds: minHW = 16 [ 16 24 16 24 ]
|
||||||
|
// 8 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||||
|
// 9 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||||
|
//10 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||||
|
//11 rounds: minHW = 32 [ 32 32 32 32 ] */
|
||||||
|
};
|
||||||
|
|
||||||
|
enum r123_enum_threefry_wcnt {
|
||||||
|
WCNT2=2,
|
||||||
|
WCNT4=4
|
||||||
|
};
|
||||||
|
|
||||||
|
#if R123_USE_64BIT
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
|
||||||
|
{
|
||||||
|
return (x << (N & 63)) | (x >> ((64-N) & 63));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
|
||||||
|
{
|
||||||
|
return (x << (N & 31)) | (x >> ((32-N) & 31));
|
||||||
|
}
|
||||||
|
|
||||||
|
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
|
||||||
|
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
|
||||||
|
#define SKEIN_KS_PARITY32 0x1BD11BDA
|
||||||
|
|
||||||
|
/** \endcond */
|
||||||
|
|
||||||
|
#ifndef THREEFRY2x32_DEFAULT_ROUNDS
|
||||||
|
#define THREEFRY2x32_DEFAULT_ROUNDS 20
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef THREEFRY2x64_DEFAULT_ROUNDS
|
||||||
|
#define THREEFRY2x64_DEFAULT_ROUNDS 20
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef THREEFRY4x32_DEFAULT_ROUNDS
|
||||||
|
#define THREEFRY4x32_DEFAULT_ROUNDS 20
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef THREEFRY4x64_DEFAULT_ROUNDS
|
||||||
|
#define THREEFRY4x64_DEFAULT_ROUNDS 20
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define _threefry2x_tpl(W) \
|
||||||
|
typedef struct r123array2x##W threefry2x##W##_ctr_t; \
|
||||||
|
typedef struct r123array2x##W threefry2x##W##_key_t; \
|
||||||
|
typedef struct r123array2x##W threefry2x##W##_ukey_t; \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE \
|
||||||
|
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
|
||||||
|
uint##W##_t X0,X1; \
|
||||||
|
uint##W##_t ks0, ks1, ks2; \
|
||||||
|
R123_ASSERT(Nrounds<=32); \
|
||||||
|
ks2 = SKEIN_KS_PARITY##W; \
|
||||||
|
ks0 = k.v[0]; \
|
||||||
|
X0 = in.v[0] + ks0; \
|
||||||
|
ks2 ^= ks0; \
|
||||||
|
\
|
||||||
|
ks1 = k.v[1]; \
|
||||||
|
X1 = in.v[1] + ks1; \
|
||||||
|
ks2 ^= ks1; \
|
||||||
|
\
|
||||||
|
if(Nrounds>0){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>1){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>2){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>3){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>3){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks1; X1 += ks2; \
|
||||||
|
X1 += 1; /* X.v[2-1] += r */ \
|
||||||
|
} \
|
||||||
|
if(Nrounds>4){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>5){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>6){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>7){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>7){ \
|
||||||
|
/* InjectKey(r=2) */ \
|
||||||
|
X0 += ks2; X1 += ks0; \
|
||||||
|
X1 += 2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>8){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>9){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>10){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>11){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>11){ \
|
||||||
|
/* InjectKey(r=3) */ \
|
||||||
|
X0 += ks0; X1 += ks1; \
|
||||||
|
X1 += 3; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>12){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>13){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>14){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>15){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>15){ \
|
||||||
|
/* InjectKey(r=4) */ \
|
||||||
|
X0 += ks1; X1 += ks2; \
|
||||||
|
X1 += 4; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>16){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>17){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>18){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>19){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>19){ \
|
||||||
|
/* InjectKey(r=5) */ \
|
||||||
|
X0 += ks2; X1 += ks0; \
|
||||||
|
X1 += 5; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>20){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>21){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>22){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>23){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>23){ \
|
||||||
|
/* InjectKey(r=6) */ \
|
||||||
|
X0 += ks0; X1 += ks1; \
|
||||||
|
X1 += 6; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>24){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>25){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>26){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>27){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>27){ \
|
||||||
|
/* InjectKey(r=7) */ \
|
||||||
|
X0 += ks1; X1 += ks2; \
|
||||||
|
X1 += 7; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>28){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>29){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>30){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>31){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
|
||||||
|
if(Nrounds>31){ \
|
||||||
|
/* InjectKey(r=8) */ \
|
||||||
|
X0 += ks2; X1 += ks0; \
|
||||||
|
X1 += 8; \
|
||||||
|
} \
|
||||||
|
threefry2x##W##_ctr_t ret={{X0, X1}}; \
|
||||||
|
return ret; \
|
||||||
|
} \
|
||||||
|
/** @ingroup ThreefryNxW */ \
|
||||||
|
enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE \
|
||||||
|
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
|
||||||
|
return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define _threefry4x_tpl(W) \
|
||||||
|
typedef struct r123array4x##W threefry4x##W##_ctr_t; \
|
||||||
|
typedef struct r123array4x##W threefry4x##W##_key_t; \
|
||||||
|
typedef struct r123array4x##W threefry4x##W##_ukey_t; \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE \
|
||||||
|
threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
|
||||||
|
uint##W##_t X0, X1, X2, X3; \
|
||||||
|
uint##W##_t ks0, ks1, ks2, ks3, ks4; \
|
||||||
|
R123_ASSERT(Nrounds<=72); \
|
||||||
|
ks4 = SKEIN_KS_PARITY##W; \
|
||||||
|
ks0 = k.v[0]; \
|
||||||
|
X0 = in.v[0] + ks0; \
|
||||||
|
ks4 ^= ks0; \
|
||||||
|
\
|
||||||
|
ks1 = k.v[1]; \
|
||||||
|
X1 = in.v[1] + ks1; \
|
||||||
|
ks4 ^= ks1; \
|
||||||
|
\
|
||||||
|
ks2 = k.v[2]; \
|
||||||
|
X2 = in.v[2] + ks2; \
|
||||||
|
ks4 ^= ks2; \
|
||||||
|
\
|
||||||
|
ks3 = k.v[3]; \
|
||||||
|
X3 = in.v[3] + ks3; \
|
||||||
|
ks4 ^= ks3; \
|
||||||
|
\
|
||||||
|
if(Nrounds>0){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>1){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>2){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>3){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>3){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
|
||||||
|
X3 += 1; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>4){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>5){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>6){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>7){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>7){ \
|
||||||
|
/* InjectKey(r=2) */ \
|
||||||
|
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
|
||||||
|
X3 += 2; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>8){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>9){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>10){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>11){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>11){ \
|
||||||
|
/* InjectKey(r=3) */ \
|
||||||
|
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
|
||||||
|
X3 += 3; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>12){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>13){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>14){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>15){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>15){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
|
||||||
|
X3 += 4; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>16){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>17){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>18){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>19){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>19){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
|
||||||
|
X3 += 5; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>20){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>21){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>22){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>23){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>23){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
|
||||||
|
X3 += 6; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>24){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>25){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>26){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>27){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>27){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
|
||||||
|
X3 += 7; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>28){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>29){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>30){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>31){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>31){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
|
||||||
|
X3 += 8; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>32){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>33){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>34){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>35){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>35){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
|
||||||
|
X3 += 9; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>36){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>37){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>38){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>39){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>39){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
|
||||||
|
X3 += 10; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>40){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>41){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>42){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>43){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>43){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
|
||||||
|
X3 += 11; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>44){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>45){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>46){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>47){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>47){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
|
||||||
|
X3 += 12; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>48){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>49){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>50){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>51){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>51){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
|
||||||
|
X3 += 13; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>52){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>53){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>54){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>55){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>55){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
|
||||||
|
X3 += 14; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>56){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>57){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>58){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>59){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>59){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
|
||||||
|
X3 += 15; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>60){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>61){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>62){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>63){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>63){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
|
||||||
|
X3 += 16; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>64){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>65){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>66){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>67){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>67){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
|
||||||
|
X3 += 17; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if(Nrounds>68){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>69){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>70){ \
|
||||||
|
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||||
|
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>71){ \
|
||||||
|
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||||
|
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||||
|
} \
|
||||||
|
if(Nrounds>71){ \
|
||||||
|
/* InjectKey(r=1) */ \
|
||||||
|
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
|
||||||
|
X3 += 18; /* XWCNT4-1 += r */ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
|
||||||
|
return ret; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/** @ingroup ThreefryNxW */ \
|
||||||
|
enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
|
||||||
|
R123_CUDA_DEVICE R123_STATIC_INLINE \
|
||||||
|
threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
|
||||||
|
return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#if R123_USE_64BIT
|
||||||
|
_threefry2x_tpl(64)
|
||||||
|
_threefry4x_tpl(64)
|
||||||
|
#endif
|
||||||
|
_threefry2x_tpl(32)
|
||||||
|
_threefry4x_tpl(32)
|
||||||
|
|
||||||
|
/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
|
||||||
|
than a static inline function. Why? */
|
||||||
|
#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
|
||||||
|
#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
|
||||||
|
#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
|
||||||
|
#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
#define _threefryNxWclass_tpl(NxW) \
|
||||||
|
namespace r123{ \
|
||||||
|
template<unsigned int ROUNDS> \
|
||||||
|
struct Threefry##NxW##_R{ \
|
||||||
|
typedef threefry##NxW##_ctr_t ctr_type; \
|
||||||
|
typedef threefry##NxW##_key_t key_type; \
|
||||||
|
typedef threefry##NxW##_key_t ukey_type; \
|
||||||
|
static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
|
||||||
|
inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
|
||||||
|
R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \
|
||||||
|
return threefry##NxW##_R(ROUNDS, ctr, key); \
|
||||||
|
} \
|
||||||
|
}; \
|
||||||
|
typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
|
||||||
|
} // namespace r123
|
||||||
|
|
||||||
|
_threefryNxWclass_tpl(2x32)
|
||||||
|
_threefryNxWclass_tpl(4x32)
|
||||||
|
#if R123_USE_64BIT
|
||||||
|
_threefryNxWclass_tpl(2x64)
|
||||||
|
_threefryNxWclass_tpl(4x64)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* The _tpl macros don't quite work to do string-pasting inside comments.
|
||||||
|
so we just write out the boilerplate documentation four times... */
|
||||||
|
|
||||||
|
/**
|
||||||
|
@defgroup ThreefryNxW Threefry Classes and Typedefs
|
||||||
|
|
||||||
|
The ThreefryNxW classes export the member functions, typedefs and
|
||||||
|
operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||||
|
|
||||||
|
As described in
|
||||||
|
<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>,
|
||||||
|
the Threefry family is closely related to the Threefish block cipher from
|
||||||
|
<a href="http://www.skein-hash.info/"> Skein Hash Function</a>.
|
||||||
|
Threefry is \b not suitable for cryptographic use.
|
||||||
|
|
||||||
|
Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
|
||||||
|
|
||||||
|
@class r123::Threefry2x32_R
|
||||||
|
@ingroup ThreefryNxW
|
||||||
|
|
||||||
|
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||||
|
|
||||||
|
The template argument, ROUNDS, is the number of times the Threefry round
|
||||||
|
function will be applied.
|
||||||
|
|
||||||
|
As of September 2011, the authors know of no statistical flaws with
|
||||||
|
ROUNDS=13 or more for Threefry2x32.
|
||||||
|
|
||||||
|
@typedef r123::Threefry2x32
|
||||||
|
@ingroup ThreefryNxW
|
||||||
|
Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds,
|
||||||
|
Threefry2x32 has a considerable safety margin over the minimum number
|
||||||
|
of rounds with no known statistical flaws, but still has excellent
|
||||||
|
performance.
|
||||||
|
|
||||||
|
@class r123::Threefry2x64_R
|
||||||
|
@ingroup ThreefryNxW
|
||||||
|
|
||||||
|
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||||
|
|
||||||
|
The template argument, ROUNDS, is the number of times the Threefry round
|
||||||
|
function will be applied.
|
||||||
|
|
||||||
|
In November 2011, the authors discovered that 13 rounds of
|
||||||
|
Threefry2x64 sequenced by strided, interleaved key and counter
|
||||||
|
increments failed a very long (longer than the default BigCrush
|
||||||
|
length) WeightDistrub test. At the same time, it was confirmed that
|
||||||
|
14 rounds passes much longer tests (up to 5x10^12 samples) of a
|
||||||
|
similar nature. The authors know of no statistical flaws with
|
||||||
|
ROUNDS=14 or more for Threefry2x64.
|
||||||
|
|
||||||
|
@typedef r123::Threefry2x64
|
||||||
|
@ingroup ThreefryNxW
|
||||||
|
Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds,
|
||||||
|
Threefry2x64 has a considerable safety margin over the minimum number
|
||||||
|
of rounds with no known statistical flaws, but still has excellent
|
||||||
|
performance.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@class r123::Threefry4x32_R
|
||||||
|
@ingroup ThreefryNxW
|
||||||
|
|
||||||
|
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||||
|
|
||||||
|
The template argument, ROUNDS, is the number of times the Threefry round
|
||||||
|
function will be applied.
|
||||||
|
|
||||||
|
As of September 2011, the authors know of no statistical flaws with
|
||||||
|
ROUNDS=12 or more for Threefry4x32.
|
||||||
|
|
||||||
|
@typedef r123::Threefry4x32
|
||||||
|
@ingroup ThreefryNxW
|
||||||
|
Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds,
|
||||||
|
Threefry4x32 has a considerable safety margin over the minimum number
|
||||||
|
of rounds with no known statistical flaws, but still has excellent
|
||||||
|
performance.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@class r123::Threefry4x64_R
|
||||||
|
@ingroup ThreefryNxW
|
||||||
|
|
||||||
|
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||||
|
|
||||||
|
The template argument, ROUNDS, is the number of times the Threefry round
|
||||||
|
function will be applied.
|
||||||
|
|
||||||
|
As of September 2011, the authors know of no statistical flaws with
|
||||||
|
ROUNDS=12 or more for Threefry4x64.
|
||||||
|
|
||||||
|
@typedef r123::Threefry4x64
|
||||||
|
@ingroup ThreefryNxW
|
||||||
|
Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds,
|
||||||
|
Threefry4x64 has a considerable safety margin over the minimum number
|
||||||
|
of rounds with no known statistical flaws, but still has excellent
|
||||||
|
performance.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
1305
external/panphasia_ho/uniform_rand_threefry4x64.c
vendored
Normal file
1305
external/panphasia_ho/uniform_rand_threefry4x64.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
|
@ -136,16 +136,14 @@ inline void HDFGetDatasetExtent( const std::string Filename, const std::string O
|
||||||
|
|
||||||
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
|
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
|
||||||
|
|
||||||
hsize_t *dimsize = new hsize_t[ndims];
|
std::vector<hsize_t> dimsize(ndims,0);
|
||||||
|
|
||||||
H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
|
H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );
|
||||||
|
|
||||||
Extent.clear();
|
Extent.clear();
|
||||||
for(int i=0; i<ndims; ++i )
|
for(int i=0; i<ndims; ++i )
|
||||||
Extent.push_back( dimsize[i] );
|
Extent.push_back( dimsize[i] );
|
||||||
|
|
||||||
delete[] dimsize;
|
|
||||||
|
|
||||||
H5Sclose( HDF_DataspaceID );
|
H5Sclose( HDF_DataspaceID );
|
||||||
H5Dclose( HDF_DatasetID );
|
H5Dclose( HDF_DatasetID );
|
||||||
H5Fclose( HDF_FileID );
|
H5Fclose( HDF_FileID );
|
||||||
|
@ -323,8 +321,8 @@ inline void HDFReadVectorSelect( const std::string Filename, const std::string O
|
||||||
//... get space associated with dataset and its extensions
|
//... get space associated with dataset and its extensions
|
||||||
HDF_DataspaceID = H5Dget_space( HDF_DatasetID );
|
HDF_DataspaceID = H5Dget_space( HDF_DatasetID );
|
||||||
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
|
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
|
||||||
hsize_t dimsize[ndims];
|
std::vector<hsize_t> dimsize(ndims,0);
|
||||||
H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
|
H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );
|
||||||
|
|
||||||
hsize_t block[2];
|
hsize_t block[2];
|
||||||
block[0] = ii.size();
|
block[0] = ii.size();
|
||||||
|
@ -601,9 +599,9 @@ inline void HDFReadGroupAttribute( const std::string Filename, const std::string
|
||||||
|
|
||||||
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
|
int ndims = H5Sget_simple_extent_ndims( HDF_DataspaceID );
|
||||||
|
|
||||||
hsize_t dimsize[ndims];
|
std::vector<hsize_t> dimsize(ndims,0);
|
||||||
|
|
||||||
H5Sget_simple_extent_dims( HDF_DataspaceID, dimsize, NULL );
|
H5Sget_simple_extent_dims( HDF_DataspaceID, &dimsize[0], NULL );
|
||||||
|
|
||||||
HDF_StorageSize = 1;
|
HDF_StorageSize = 1;
|
||||||
for(int i=0; i<ndims; ++i )
|
for(int i=0; i<ndims; ++i )
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
|
|
||||||
|
|
||||||
#define USE_PRECISION_${CODE_PRECISION}
|
#define USE_PRECISION_${CODE_PRECISION}
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
|
||||||
#if defined(USE_PRECISION_FLOAT)
|
#if defined(USE_PRECISION_FLOAT)
|
||||||
constexpr char CMAKE_PRECISION_STR[] = "single";
|
constexpr char CMAKE_PRECISION_STR[] = "single";
|
||||||
#elif defined(USE_PRECISION_DOUBLE)
|
#elif defined(USE_PRECISION_DOUBLE)
|
||||||
|
@ -31,4 +33,6 @@ extern "C"
|
||||||
extern const char *GIT_TAG;
|
extern const char *GIT_TAG;
|
||||||
extern const char *GIT_REV;
|
extern const char *GIT_REV;
|
||||||
extern const char *GIT_BRANCH;
|
extern const char *GIT_BRANCH;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif // __cplusplus
|
|
@ -216,6 +216,10 @@ namespace particle
|
||||||
const size_t num_p_in_load = field.local_size();
|
const size_t num_p_in_load = field.local_size();
|
||||||
const real_t pmeanmass = munit / real_t(field.global_size()* overload);
|
const real_t pmeanmass = munit / real_t(field.global_size()* overload);
|
||||||
|
|
||||||
|
bool bmass_negative = false;
|
||||||
|
auto mean_pm = field.mean() * pmeanmass;
|
||||||
|
auto std_pm = field.std() * pmeanmass;
|
||||||
|
|
||||||
for (int ishift = 0; ishift < (1 << lattice_type); ++ishift)
|
for (int ishift = 0; ishift < (1 << lattice_type); ++ishift)
|
||||||
{
|
{
|
||||||
// if we are dealing with the secondary lattice, apply a global shift
|
// if we are dealing with the secondary lattice, apply a global shift
|
||||||
|
@ -237,18 +241,25 @@ namespace particle
|
||||||
{
|
{
|
||||||
for (size_t k = 0; k < field.size(2); ++k)
|
for (size_t k = 0; k < field.size(2); ++k)
|
||||||
{
|
{
|
||||||
if (b64reals)
|
// get
|
||||||
{
|
const auto pmass = pmeanmass * field.relem(i, j, k);
|
||||||
particles_.set_mass64(ipcount++, pmeanmass * field.relem(i, j, k));
|
|
||||||
}
|
// check for negative mass
|
||||||
else
|
bmass_negative |= pmass<0.0;
|
||||||
{
|
|
||||||
particles_.set_mass32(ipcount++, pmeanmass * field.relem(i, j, k));
|
// set
|
||||||
}
|
if (b64reals) particles_.set_mass64(ipcount++, pmass);
|
||||||
|
else particles_.set_mass32(ipcount++, pmass);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// diagnostics
|
||||||
|
music::ilog << "Particle Mass : mean/munit = " << mean_pm/munit << " ; fractional RMS = " << std_pm / mean_pm * 100.0 << "%" << std::endl;
|
||||||
|
if(std_pm / mean_pm > 0.1 ) music::wlog << "Particle mass perturbation larger than 10%, consider decreasing \n\t the starting redshift or disabling baryon decaying modes." << std::endl;
|
||||||
|
if(bmass_negative) music::elog << "Negative particle mass produced! Decrease the starting \n\t redshift or disable baryon decaying modes!" << std::endl;
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
// should not happen
|
// should not happen
|
||||||
music::elog << "Cannot have individual particle masses for glasses!" << std::endl;
|
music::elog << "Cannot have individual particle masses for glasses!" << std::endl;
|
||||||
|
|
|
@ -591,6 +591,10 @@ int run( config_file& the_config )
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#if defined(USE_MPI)
|
||||||
|
real_t local_maxdphi = maxdphi;
|
||||||
|
MPI_Allreduce( &local_maxdphi, &maxdphi, 1, MPI::get_datatype<real_t>(), MPI_MAX, MPI_COMM_WORLD );
|
||||||
|
#endif
|
||||||
const real_t hbar_safefac = 1.01;
|
const real_t hbar_safefac = 1.01;
|
||||||
const real_t hbar = maxdphi / M_PI / Dplus0 * hbar_safefac;
|
const real_t hbar = maxdphi / M_PI / Dplus0 * hbar_safefac;
|
||||||
music::ilog << "Semiclassical PT : hbar = " << hbar << " (limited by initial potential, safety=" << hbar_safefac << ")." << std::endl;
|
music::ilog << "Semiclassical PT : hbar = " << hbar << " (limited by initial potential, safety=" << hbar_safefac << ")." << std::endl;
|
||||||
|
@ -832,6 +836,9 @@ int run( config_file& the_config )
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
music::ilog << "-------------------------------------------------------------------------------" << std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
498
src/plugins/PANPHASIA.hh
Normal file
498
src/plugins/PANPHASIA.hh
Normal file
|
@ -0,0 +1,498 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
namespace PANPHASIA1
|
||||||
|
{
|
||||||
|
const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim;
|
||||||
|
typedef int rand_offset_[5];
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
int state[133]; // Nstore = Nstate (=5) + Nbatch (=128)
|
||||||
|
int need_fill;
|
||||||
|
int pos;
|
||||||
|
} rand_state_;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
int base_state[5], base_lev_start[5][maxdim + 1];
|
||||||
|
rand_offset_ poweroffset[maxpow + 1], superjump;
|
||||||
|
rand_state_ current_state[maxpow + 2];
|
||||||
|
|
||||||
|
int layer_min, layer_max, indep_field;
|
||||||
|
|
||||||
|
long long xorigin_store[2][2][2], yorigin_store[2][2][2], zorigin_store[2][2][2];
|
||||||
|
int lev_common, layer_min_store, layer_max_store;
|
||||||
|
long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store,
|
||||||
|
iy_rel_store, iz_rel_store;
|
||||||
|
double exp_coeffs[8][8][maxdim + 2];
|
||||||
|
long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1];
|
||||||
|
int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2];
|
||||||
|
|
||||||
|
double cell_data[9][8];
|
||||||
|
int ixh_last, iyh_last, izh_last;
|
||||||
|
int init;
|
||||||
|
|
||||||
|
int init_cell_props;
|
||||||
|
int init_lecuyer_state;
|
||||||
|
long long p_xcursor[62], p_ycursor[62], p_zcursor[62];
|
||||||
|
|
||||||
|
} pan_state_;
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
{
|
||||||
|
void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose);
|
||||||
|
|
||||||
|
void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1,
|
||||||
|
int16_t *side2, int16_t *side3, int32_t *check_int, char *name);
|
||||||
|
|
||||||
|
void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop);
|
||||||
|
|
||||||
|
void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min,
|
||||||
|
int *layer_max, int *indep_field, double *cell_prop);
|
||||||
|
|
||||||
|
void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel,
|
||||||
|
long long *iy_rel, long long *iz_rel, int *VERBOSE);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct RNG
|
||||||
|
{
|
||||||
|
|
||||||
|
config_file *pcf_;
|
||||||
|
|
||||||
|
struct panphasia_descriptor
|
||||||
|
{
|
||||||
|
int16_t wn_level_base;
|
||||||
|
int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base;
|
||||||
|
int16_t i_base, i_base_y, i_base_z;
|
||||||
|
int32_t check_rand;
|
||||||
|
std::string name;
|
||||||
|
|
||||||
|
explicit panphasia_descriptor(std::string dstring)
|
||||||
|
{
|
||||||
|
char tmp[100];
|
||||||
|
std::memset(tmp, ' ', 100);
|
||||||
|
parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, &i_yorigin_base, &i_zorigin_base, &i_base,
|
||||||
|
&i_base_y, &i_base_z, &check_rand, tmp);
|
||||||
|
for (int i = 0; i < 100; i++)
|
||||||
|
if (tmp[i] == ' ')
|
||||||
|
{
|
||||||
|
tmp[i] = '\0';
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
name = tmp;
|
||||||
|
name.erase(std::remove(name.begin(), name.end(), ' '), name.end());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// greatest common divisor
|
||||||
|
int gcd(int a, int b)
|
||||||
|
{
|
||||||
|
if (b == 0)
|
||||||
|
return a;
|
||||||
|
return gcd(b, a % b);
|
||||||
|
}
|
||||||
|
|
||||||
|
// least common multiple
|
||||||
|
int lcm(int a, int b) { return abs(a * b) / gcd(a, b); }
|
||||||
|
|
||||||
|
// Two or largest power of 2 less than the argument
|
||||||
|
int largest_power_two_lte(int b)
|
||||||
|
{
|
||||||
|
int a = 1;
|
||||||
|
if (b <= a)
|
||||||
|
return a;
|
||||||
|
while (2 * a < b)
|
||||||
|
a = 2 * a;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string descriptor_string_;
|
||||||
|
int num_threads_;
|
||||||
|
int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_;
|
||||||
|
bool incongruent_fields_;
|
||||||
|
double inter_grid_phase_adjustment_;
|
||||||
|
// double translation_phase_;
|
||||||
|
pan_state_ *lstate;
|
||||||
|
int grid_p_;
|
||||||
|
int coordinate_system_shift_[3];
|
||||||
|
int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
|
||||||
|
|
||||||
|
void clear_panphasia_thread_states(void)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < num_threads_; ++i)
|
||||||
|
{
|
||||||
|
lstate[i].init = 0;
|
||||||
|
lstate[i].init_cell_props = 0;
|
||||||
|
lstate[i].init_lecuyer_state = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void initialize_for_grid_structure(void)
|
||||||
|
{
|
||||||
|
// if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
|
||||||
|
ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
|
||||||
|
int ngridminsize_panphasia = pcf_->get_value_safe<int>("random", "PanphasiaMinRootResolution", 512);
|
||||||
|
|
||||||
|
grid_p_ = pdescriptor_->i_base;
|
||||||
|
|
||||||
|
lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0);
|
||||||
|
// lmin
|
||||||
|
|
||||||
|
ngrid_panphasia_ = (1 << lextra_) * grid_p_;
|
||||||
|
|
||||||
|
while (ngrid_panphasia_ < ngridminsize_panphasia)
|
||||||
|
{
|
||||||
|
lextra_++;
|
||||||
|
ngrid_panphasia_ *= 2;
|
||||||
|
}
|
||||||
|
assert(ngrid_panphasia_ >= ngridminsize_panphasia);
|
||||||
|
|
||||||
|
clear_panphasia_thread_states();
|
||||||
|
|
||||||
|
music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)", ngrid_panphasia_, lextra_);
|
||||||
|
if (ngridminsize_panphasia < 512)
|
||||||
|
music::ilog.Print("PANPHASIA WARNING: PanphasiaMinRootResolution = %d below minimum recommended of 512", ngridminsize_panphasia);
|
||||||
|
music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, ngrid_panphasia_);
|
||||||
|
|
||||||
|
coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
|
||||||
|
coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
|
||||||
|
coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<panphasia_descriptor> pdescriptor_;
|
||||||
|
|
||||||
|
RNG(config_file *pcf)
|
||||||
|
: pcf_(pcf)
|
||||||
|
{
|
||||||
|
descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
|
||||||
|
|
||||||
|
#ifdef _OPENMP
|
||||||
|
num_threads_ = omp_get_max_threads();
|
||||||
|
#else
|
||||||
|
num_threads_ = 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// create independent state descriptions for each thread
|
||||||
|
lstate = new pan_state_[num_threads_];
|
||||||
|
|
||||||
|
// parse the descriptor for its properties
|
||||||
|
pdescriptor_ = std::make_unique<panphasia_descriptor>(descriptor_string_);
|
||||||
|
|
||||||
|
music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base);
|
||||||
|
|
||||||
|
// write panphasia base size into config file for the grid construction
|
||||||
|
// as the gridding unit we use the least common multiple of 2 and i_base
|
||||||
|
std::stringstream ss;
|
||||||
|
//ARJ ss << lcm(2, pdescriptor_->i_base);
|
||||||
|
//ss << two_or_largest_power_two_less_than(pdescriptor_->i_base);//ARJ
|
||||||
|
ss << 2; //ARJ - set gridding unit to two
|
||||||
|
pcf_->insert_value("setup", "gridding_unit", ss.str());
|
||||||
|
ss.str(std::string());
|
||||||
|
ss << pdescriptor_->i_base;
|
||||||
|
pcf_->insert_value("random", "base_unit", ss.str());
|
||||||
|
|
||||||
|
this->initialize_for_grid_structure();
|
||||||
|
}
|
||||||
|
|
||||||
|
~RNG() { delete[] lstate; }
|
||||||
|
|
||||||
|
void Fill(Grid_FFT<real_t> &g)
|
||||||
|
{
|
||||||
|
auto sinc = [](real_t x)
|
||||||
|
{ return (std::fabs(x) > 1e-16) ? std::sin(x) / x : 1.0; };
|
||||||
|
auto dsinc = [](real_t x)
|
||||||
|
{ return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
|
||||||
|
const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
|
||||||
|
|
||||||
|
// we will overwrite 'g', we can deallocate it while we prepare the panphasia field
|
||||||
|
g.reset();
|
||||||
|
|
||||||
|
clear_panphasia_thread_states();
|
||||||
|
|
||||||
|
// temporaries
|
||||||
|
Grid_FFT<real_t> g0({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
|
||||||
|
Grid_FFT<real_t> g1({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
|
||||||
|
Grid_FFT<real_t> g2({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
|
||||||
|
Grid_FFT<real_t> g3({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
|
||||||
|
Grid_FFT<real_t> g4({size_t(ngrid_panphasia_), size_t(ngrid_panphasia_), size_t(ngrid_panphasia_)}, g.length_);
|
||||||
|
|
||||||
|
double t1 = get_wtime();
|
||||||
|
// double tp = t1;
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
#ifdef _OPENMP
|
||||||
|
const int mythread = omp_get_thread_num();
|
||||||
|
#else
|
||||||
|
const int mythread = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//int odd_x, odd_y, odd_z;
|
||||||
|
//int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level
|
||||||
|
|
||||||
|
int verbosity = (mythread == 0);
|
||||||
|
char descriptor[100];
|
||||||
|
std::memset(descriptor, 0, 100);
|
||||||
|
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
|
||||||
|
|
||||||
|
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
|
||||||
|
|
||||||
|
{
|
||||||
|
panphasia_descriptor d(descriptor_string_);
|
||||||
|
|
||||||
|
int level_p = d.wn_level_base + lextra_;
|
||||||
|
|
||||||
|
lstate[mythread].layer_min = 0;
|
||||||
|
lstate[mythread].layer_max = level_p;
|
||||||
|
lstate[mythread].indep_field = 1;
|
||||||
|
|
||||||
|
long long ix_rel[3];
|
||||||
|
ix_rel[0] = 0; //ileft_corner_p[0];
|
||||||
|
ix_rel[1] = 0; //ileft_corner_p[1];
|
||||||
|
ix_rel[2] = 0; //ileft_corner_p[2];
|
||||||
|
|
||||||
|
set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
|
||||||
|
&verbosity);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbosity)
|
||||||
|
t1 = get_wtime();
|
||||||
|
|
||||||
|
std::array<double, 9> cell_prop;
|
||||||
|
pan_state_ *ps = &lstate[mythread];
|
||||||
|
|
||||||
|
#pragma omp for //nowait
|
||||||
|
for (size_t i = 0; i < g0.size(0); i += 2)
|
||||||
|
{
|
||||||
|
const int ixmax(std::min<int>(2, g0.size(0) - i));
|
||||||
|
for (size_t j = 0; j < g0.size(1); j += 2)
|
||||||
|
{
|
||||||
|
const int iymax(std::min<int>(2, g0.size(1) - j));
|
||||||
|
for (size_t k = 0; k < g0.size(2); k += 2)
|
||||||
|
{
|
||||||
|
const int izmax(std::min<int>(2, g0.size(2) - k));
|
||||||
|
|
||||||
|
// ARJ - added inner set of loops to speed up evaluation of Panphasia
|
||||||
|
for (int ix = 0; ix < ixmax; ++ix)
|
||||||
|
{
|
||||||
|
for (int iy = 0; iy < iymax; ++iy)
|
||||||
|
{
|
||||||
|
for (int iz = 0; iz < izmax; ++iz)
|
||||||
|
{
|
||||||
|
int ilocal = i + ix;
|
||||||
|
int jlocal = j + iy;
|
||||||
|
int klocal = k + iz;
|
||||||
|
|
||||||
|
int iglobal = ilocal + g0.local_0_start_;
|
||||||
|
int jglobal = jlocal;
|
||||||
|
int kglobal = klocal;
|
||||||
|
|
||||||
|
adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
|
||||||
|
&ps->layer_max, &ps->indep_field, &cell_prop[0]);
|
||||||
|
|
||||||
|
g0.relem(ilocal, jlocal, klocal) = cell_prop[0];
|
||||||
|
g1.relem(ilocal, jlocal, klocal) = cell_prop[4];
|
||||||
|
g2.relem(ilocal, jlocal, klocal) = cell_prop[2];
|
||||||
|
g3.relem(ilocal, jlocal, klocal) = cell_prop[1];
|
||||||
|
g4.relem(ilocal, jlocal, klocal) = cell_prop[8];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // end omp parallel region
|
||||||
|
|
||||||
|
g0.FourierTransformForward();
|
||||||
|
g1.FourierTransformForward();
|
||||||
|
g2.FourierTransformForward();
|
||||||
|
g3.FourierTransformForward();
|
||||||
|
g4.FourierTransformForward();
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (size_t i = 0; i < g0.size(0); i++)
|
||||||
|
{
|
||||||
|
for (size_t j = 0; j < g0.size(1); j++)
|
||||||
|
{
|
||||||
|
for (size_t k = 0; k < g0.size(2); k++)
|
||||||
|
{
|
||||||
|
if (!g0.is_nyquist_mode(i, j, k))
|
||||||
|
{
|
||||||
|
auto kvec = g0.get_k<real_t>(i, j, k);
|
||||||
|
|
||||||
|
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
|
||||||
|
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
|
||||||
|
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
|
||||||
|
|
||||||
|
auto fx = real_t(sinc(argx));
|
||||||
|
auto gx = ccomplex_t(0.0, dsinc(argx));
|
||||||
|
auto fy = real_t(sinc(argy));
|
||||||
|
auto gy = ccomplex_t(0.0, dsinc(argy));
|
||||||
|
auto fz = real_t(sinc(argz));
|
||||||
|
auto gz = ccomplex_t(0.0, dsinc(argz));
|
||||||
|
|
||||||
|
auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz);
|
||||||
|
auto magnitude = real_t(std::sqrt(1.0 - std::fabs(temp * temp)));
|
||||||
|
|
||||||
|
auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
|
||||||
|
|
||||||
|
g0.kelem(i, j, k) = y0 * fx * fy * fz + sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz) + y4 * magnitude;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
g0.kelem(i, j, k) = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp);
|
||||||
|
// tp = get_wtime();
|
||||||
|
|
||||||
|
g1.FourierTransformBackward(false);
|
||||||
|
g2.FourierTransformBackward(false);
|
||||||
|
g3.FourierTransformBackward(false);
|
||||||
|
g4.FourierTransformBackward(false);
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
#ifdef _OPENMP
|
||||||
|
const int mythread = omp_get_thread_num();
|
||||||
|
#else
|
||||||
|
const int mythread = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// int odd_x, odd_y, odd_z;
|
||||||
|
int verbosity = (mythread == 0);
|
||||||
|
char descriptor[100];
|
||||||
|
std::memset(descriptor, 0, 100);
|
||||||
|
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
|
||||||
|
|
||||||
|
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
|
||||||
|
|
||||||
|
{
|
||||||
|
panphasia_descriptor d(descriptor_string_);
|
||||||
|
|
||||||
|
int level_p = d.wn_level_base + lextra_;
|
||||||
|
|
||||||
|
lstate[mythread].layer_min = 0;
|
||||||
|
lstate[mythread].layer_max = level_p;
|
||||||
|
lstate[mythread].indep_field = 1;
|
||||||
|
|
||||||
|
long long ix_rel[3];
|
||||||
|
ix_rel[0] = 0; //ileft_corner_p[0];
|
||||||
|
ix_rel[1] = 0; //ileft_corner_p[1];
|
||||||
|
ix_rel[2] = 0; //ileft_corner_p[2];
|
||||||
|
|
||||||
|
set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
|
||||||
|
&verbosity);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbosity)
|
||||||
|
t1 = get_wtime();
|
||||||
|
|
||||||
|
//***************************************************************
|
||||||
|
// Process Panphasia values: p110, p011, p101, p111
|
||||||
|
//****************************************************************
|
||||||
|
std::array<double, 9> cell_prop;
|
||||||
|
pan_state_ *ps = &lstate[mythread];
|
||||||
|
|
||||||
|
#pragma omp for //nowait
|
||||||
|
for (size_t i = 0; i < g1.size(0); i += 2)
|
||||||
|
{
|
||||||
|
const int ixmax(std::min<int>(2, g1.size(0) - i));
|
||||||
|
for (size_t j = 0; j < g1.size(1); j += 2)
|
||||||
|
{
|
||||||
|
const int iymax(std::min<int>(2, g1.size(1) - j));
|
||||||
|
for (size_t k = 0; k < g1.size(2); k += 2)
|
||||||
|
{
|
||||||
|
const int izmax(std::min<int>(2, g1.size(2) - k));
|
||||||
|
|
||||||
|
// ARJ - added inner set of loops to speed up evaluation of Panphasia
|
||||||
|
for (int ix = 0; ix < ixmax; ++ix)
|
||||||
|
{
|
||||||
|
for (int iy = 0; iy < iymax; ++iy)
|
||||||
|
{
|
||||||
|
for (int iz = 0; iz < izmax; ++iz)
|
||||||
|
{
|
||||||
|
int ilocal = i + ix;
|
||||||
|
int jlocal = j + iy;
|
||||||
|
int klocal = k + iz;
|
||||||
|
|
||||||
|
int iglobal = ilocal + g1.local_0_start_;
|
||||||
|
int jglobal = jlocal;
|
||||||
|
int kglobal = klocal;
|
||||||
|
|
||||||
|
adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
|
||||||
|
&ps->layer_max, &ps->indep_field, &cell_prop[0]);
|
||||||
|
|
||||||
|
g1.relem(ilocal, jlocal, klocal) = cell_prop[6];
|
||||||
|
g2.relem(ilocal, jlocal, klocal) = cell_prop[3];
|
||||||
|
g3.relem(ilocal, jlocal, klocal) = cell_prop[5];
|
||||||
|
g4.relem(ilocal, jlocal, klocal) = cell_prop[7];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // end omp parallel region
|
||||||
|
|
||||||
|
// music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp);
|
||||||
|
// tp = get_wtime();
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////
|
||||||
|
// transform and convolve with Legendres
|
||||||
|
g1.FourierTransformForward();
|
||||||
|
g2.FourierTransformForward();
|
||||||
|
g3.FourierTransformForward();
|
||||||
|
g4.FourierTransformForward();
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (size_t i = 0; i < g0.size(0); i++)
|
||||||
|
{
|
||||||
|
for (size_t j = 0; j < g0.size(1); j++)
|
||||||
|
{
|
||||||
|
for (size_t k = 0; k < g0.size(2); k++)
|
||||||
|
{
|
||||||
|
if (!g0.is_nyquist_mode(i, j, k))
|
||||||
|
{
|
||||||
|
auto kvec = g0.get_k<real_t>(i, j, k);
|
||||||
|
|
||||||
|
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
|
||||||
|
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
|
||||||
|
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
|
||||||
|
|
||||||
|
auto fx = real_t(sinc(argx));
|
||||||
|
auto gx = ccomplex_t(0.0, dsinc(argx));
|
||||||
|
auto fy = real_t(sinc(argy));
|
||||||
|
auto gy = ccomplex_t(0.0, dsinc(argy));
|
||||||
|
auto fz = real_t(sinc(argz));
|
||||||
|
auto gz = ccomplex_t(0.0, dsinc(argz));
|
||||||
|
|
||||||
|
auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
|
||||||
|
|
||||||
|
g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
|
||||||
|
|
||||||
|
// do final phase shift to account for corner centered coordinates vs. cell centers
|
||||||
|
auto phase_shift = -0.5 * M_PI * (kvec[0] / g0.kny_[0] + kvec[1] / g0.kny_[1] + kvec[2] / g0.kny_[2]);
|
||||||
|
|
||||||
|
g0.kelem(i, j, k) *= std::exp(ccomplex_t(0, phase_shift));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
g1.reset();
|
||||||
|
g2.reset();
|
||||||
|
g3.reset();
|
||||||
|
g4.reset();
|
||||||
|
|
||||||
|
g.allocate();
|
||||||
|
g0.FourierInterpolateCopyTo(g);
|
||||||
|
|
||||||
|
music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
|
||||||
|
1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
|
||||||
|
music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
|
@ -51,7 +51,7 @@ protected:
|
||||||
real_t lunit_, vunit_, munit_, omegab_;
|
real_t lunit_, vunit_, munit_, omegab_;
|
||||||
uint32_t levelmin_;
|
uint32_t levelmin_;
|
||||||
bool bhavebaryons_;
|
bool bhavebaryons_;
|
||||||
std::vector<float> data_buf_;
|
std::vector<float> data_buf_, data_buf_write_;
|
||||||
std::string dirname_;
|
std::string dirname_;
|
||||||
bool bUseSPT_;
|
bool bUseSPT_;
|
||||||
|
|
||||||
|
@ -207,23 +207,21 @@ void grafic2_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos
|
||||||
std::string file_name = this->get_file_name(s, c);
|
std::string file_name = this->get_file_name(s, c);
|
||||||
|
|
||||||
// serialize parallel write
|
// serialize parallel write
|
||||||
for (int write_rank = 0; write_rank < CONFIG::MPI_task_size; ++write_rank)
|
if (CONFIG::MPI_task_rank == 0)
|
||||||
{
|
|
||||||
if (write_rank == CONFIG::MPI_task_rank)
|
|
||||||
{
|
|
||||||
if (write_rank == 0)
|
|
||||||
{
|
{
|
||||||
unlink(file_name.c_str());
|
unlink(file_name.c_str());
|
||||||
}
|
}
|
||||||
std::ofstream ofs(file_name.c_str(), std::ios::binary|std::ios::app);
|
|
||||||
|
std::ofstream *pofs;
|
||||||
|
|
||||||
// write header or seek to end of file
|
// write header or seek to end of file
|
||||||
if (write_rank == 0)
|
if (CONFIG::MPI_task_rank == 0)
|
||||||
{
|
{
|
||||||
|
pofs = new std::ofstream(file_name.c_str(), std::ios::binary|std::ios::app);
|
||||||
uint32_t blocksz = sizeof(header);
|
uint32_t blocksz = sizeof(header);
|
||||||
ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
|
pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
|
||||||
ofs.write(reinterpret_cast<const char *>(&header_), blocksz);
|
pofs->write(reinterpret_cast<const char *>(&header_), blocksz);
|
||||||
ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
|
pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
// check field size against buffer size...
|
// check field size against buffer size...
|
||||||
|
@ -231,29 +229,38 @@ void grafic2_output_plugin::write_grid_data(const Grid_FFT<real_t> &g, const cos
|
||||||
assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid);
|
assert( g.global_size(0) == ngrid && g.global_size(1) == ngrid && g.global_size(2) == ngrid);
|
||||||
assert( g.size(1) == ngrid && g.size(2) == ngrid);
|
assert( g.size(1) == ngrid && g.size(2) == ngrid);
|
||||||
// write actual field slice by slice
|
// write actual field slice by slice
|
||||||
|
// std::cerr << write_rank << ">" << g.size(0) << " " << g.size(1) << " " << g.size(2) << std::endl;
|
||||||
for (size_t i = 0; i < g.size(2); ++i)
|
for (size_t i = 0; i < g.size(2); ++i)
|
||||||
{
|
{
|
||||||
|
data_buf_.assign(ngrid * ngrid, 0.0f);
|
||||||
|
|
||||||
for (unsigned j = 0; j < g.size(1); ++j)
|
for (unsigned j = 0; j < g.size(1); ++j)
|
||||||
{
|
{
|
||||||
for (unsigned k = 0; k < g.size(0); ++k)
|
for (unsigned k = 0; k < g.size(0); ++k)
|
||||||
{
|
{
|
||||||
data_buf_[j * ngrid + k] = g.relem(k, j, i);
|
data_buf_[j * ngrid + (k+g.local_0_start_)] = g.relem(k, j, i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#if defined(USE_MPI)
|
||||||
|
if( CONFIG::MPI_task_rank == 0 ) data_buf_write_.assign(ngrid*ngrid,0.0f);
|
||||||
|
MPI_Reduce( &data_buf_[0], &data_buf_write_[0], ngrid*ngrid, MPI::get_datatype<float>(), MPI_SUM, 0, MPI_COMM_WORLD );
|
||||||
|
if( CONFIG::MPI_task_rank == 0 ) data_buf_.swap(data_buf_write_);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if( CONFIG::MPI_task_rank == 0 )
|
||||||
|
{
|
||||||
uint32_t blocksz = ngrid * ngrid * sizeof(float);
|
uint32_t blocksz = ngrid * ngrid * sizeof(float);
|
||||||
ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
|
pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
|
||||||
ofs.write(reinterpret_cast<const char *>(&data_buf_[0]), blocksz);
|
pofs->write(reinterpret_cast<const char *>(&data_buf_[0]), blocksz);
|
||||||
ofs.write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
|
pofs->write(reinterpret_cast<const char *>(&blocksz), sizeof(uint32_t));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ofs.close();
|
if( CONFIG::MPI_task_rank == 0 ){
|
||||||
|
pofs->close();
|
||||||
|
delete pofs;
|
||||||
}
|
}
|
||||||
|
|
||||||
multitask_sync_barrier();
|
|
||||||
|
|
||||||
} // end loop over write_rank
|
|
||||||
|
|
||||||
music::ilog << interface_name_ << " : Wrote field to file \'" << file_name << "\'" << std::endl;
|
music::ilog << interface_name_ << " : Wrote field to file \'" << file_name << "\'" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
// This file is part of monofonIC (MUSIC2)
|
// This file is part of monofonIC (MUSIC2)
|
||||||
// A software package to generate ICs for cosmological simulations
|
// A software package to generate ICs for cosmological simulations
|
||||||
// Copyright (C) 2020 by Oliver Hahn and Adrian Jenkins (this file)
|
// Copyright (C) 2021 by Oliver Hahn and Adrian Jenkins (this file)
|
||||||
// but see distinct licensing for PANPHASIA below
|
// but see distinct licensing for PANPHASIA below
|
||||||
//
|
//
|
||||||
// monofonIC is free software: you can redistribute it and/or modify
|
// monofonIC is free software: you can redistribute it and/or modify
|
||||||
|
@ -39,108 +39,16 @@
|
||||||
|
|
||||||
#include <grid_fft.hh>
|
#include <grid_fft.hh>
|
||||||
|
|
||||||
const int maxdim = 60, maxlev = 50, maxpow = 3 * maxdim;
|
namespace PANPHASIA2
|
||||||
typedef int rand_offset_[5];
|
|
||||||
typedef struct
|
|
||||||
{
|
{
|
||||||
int state[133]; // Nstore = Nstate (=5) + Nbatch (=128)
|
extern "C"
|
||||||
int need_fill;
|
|
||||||
int pos;
|
|
||||||
} rand_state_;
|
|
||||||
|
|
||||||
/* pan_state_ struct -- corresponds to respective fortran module in panphasia_routines.f
|
|
||||||
* data structure that contains all panphasia state variables
|
|
||||||
* it needs to get passed between the fortran routines to enable
|
|
||||||
* thread-safe execution.
|
|
||||||
*/
|
|
||||||
typedef struct
|
|
||||||
{
|
|
||||||
int base_state[5], base_lev_start[5][maxdim + 1];
|
|
||||||
rand_offset_ poweroffset[maxpow + 1], superjump;
|
|
||||||
rand_state_ current_state[maxpow + 2];
|
|
||||||
|
|
||||||
int layer_min, layer_max, indep_field;
|
|
||||||
|
|
||||||
long long xorigin_store[2][2][2], yorigin_store[2][2][2], zorigin_store[2][2][2];
|
|
||||||
int lev_common, layer_min_store, layer_max_store;
|
|
||||||
long long ix_abs_store, iy_abs_store, iz_abs_store, ix_per_store, iy_per_store, iz_per_store, ix_rel_store,
|
|
||||||
iy_rel_store, iz_rel_store;
|
|
||||||
double exp_coeffs[8][8][maxdim + 2];
|
|
||||||
long long xcursor[maxdim + 1], ycursor[maxdim + 1], zcursor[maxdim + 1];
|
|
||||||
int ixshift[2][2][2], iyshift[2][2][2], izshift[2][2][2];
|
|
||||||
|
|
||||||
double cell_data[9][8];
|
|
||||||
int ixh_last, iyh_last, izh_last;
|
|
||||||
int init;
|
|
||||||
|
|
||||||
int init_cell_props;
|
|
||||||
int init_lecuyer_state;
|
|
||||||
long long p_xcursor[62], p_ycursor[62], p_zcursor[62];
|
|
||||||
|
|
||||||
} pan_state_;
|
|
||||||
|
|
||||||
extern "C"
|
|
||||||
{
|
|
||||||
void start_panphasia_(pan_state_ *lstate, const char *descriptor, int *ngrid, int *bverbose);
|
|
||||||
|
|
||||||
void parse_descriptor_(const char *descriptor, int16_t *l, int32_t *ix, int32_t *iy, int32_t *iz, int16_t *side1,
|
|
||||||
int16_t *side2, int16_t *side3, int32_t *check_int, char *name);
|
|
||||||
|
|
||||||
void panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, double *cell_prop);
|
|
||||||
|
|
||||||
void adv_panphasia_cell_properties_(pan_state_ *lstate, int *ixcell, int *iycell, int *izcell, int *layer_min,
|
|
||||||
int *layer_max, int *indep_field, double *cell_prop);
|
|
||||||
|
|
||||||
void set_phases_and_rel_origin_(pan_state_ *lstate, const char *descriptor, int *lev, long long *ix_rel,
|
|
||||||
long long *iy_rel, long long *iz_rel, int *VERBOSE);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct panphasia_descriptor
|
|
||||||
{
|
|
||||||
int16_t wn_level_base;
|
|
||||||
int32_t i_xorigin_base, i_yorigin_base, i_zorigin_base;
|
|
||||||
int16_t i_base, i_base_y, i_base_z;
|
|
||||||
int32_t check_rand;
|
|
||||||
std::string name;
|
|
||||||
|
|
||||||
explicit panphasia_descriptor(std::string dstring)
|
|
||||||
{
|
{
|
||||||
char tmp[100];
|
#include <panphasia_ho/panphasia_functions.h>
|
||||||
std::memset(tmp, ' ', 100);
|
extern size_t descriptor_base_size;
|
||||||
parse_descriptor_(dstring.c_str(), &wn_level_base, &i_xorigin_base, &i_yorigin_base, &i_zorigin_base, &i_base,
|
|
||||||
&i_base_y, &i_base_z, &check_rand, tmp);
|
|
||||||
for (int i = 0; i < 100; i++)
|
|
||||||
if (tmp[i] == ' ')
|
|
||||||
{
|
|
||||||
tmp[i] = '\0';
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
name = tmp;
|
|
||||||
name.erase(std::remove(name.begin(), name.end(), ' '), name.end());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// greatest common divisor
|
|
||||||
int gcd(int a, int b)
|
|
||||||
{
|
|
||||||
if (b == 0)
|
|
||||||
return a;
|
|
||||||
return gcd(b, a % b);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// least common multiple
|
#include "PANPHASIA.hh"
|
||||||
int lcm(int a, int b) { return abs(a * b) / gcd(a, b); }
|
|
||||||
|
|
||||||
// Two or largest power of 2 less than the argument
|
|
||||||
int largest_power_two_lte(int b)
|
|
||||||
{
|
|
||||||
int a = 1;
|
|
||||||
if (b <= a)
|
|
||||||
return a;
|
|
||||||
while (2 * a < b)
|
|
||||||
a = 2 * a;
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
class RNG_panphasia : public RNG_plugin
|
class RNG_panphasia : public RNG_plugin
|
||||||
{
|
{
|
||||||
|
@ -148,63 +56,15 @@ private:
|
||||||
protected:
|
protected:
|
||||||
std::string descriptor_string_;
|
std::string descriptor_string_;
|
||||||
int num_threads_;
|
int num_threads_;
|
||||||
int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_;
|
int panphasia_mode_;
|
||||||
bool incongruent_fields_;
|
size_t grid_res_;
|
||||||
double inter_grid_phase_adjustment_;
|
real_t boxlength_;
|
||||||
// double translation_phase_;
|
|
||||||
pan_state_ *lstate;
|
|
||||||
int grid_p_;
|
|
||||||
int coordinate_system_shift_[3];
|
|
||||||
int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
|
|
||||||
|
|
||||||
void clear_panphasia_thread_states(void)
|
PANPHASIA1::RNG *ppan1_rng_;
|
||||||
{
|
|
||||||
for (int i = 0; i < num_threads_; ++i)
|
|
||||||
{
|
|
||||||
lstate[i].init = 0;
|
|
||||||
lstate[i].init_cell_props = 0;
|
|
||||||
lstate[i].init_lecuyer_state = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void initialize_for_grid_structure(void)
|
|
||||||
{
|
|
||||||
// if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
|
|
||||||
ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
|
|
||||||
int ngridminsize_panphasia = pcf_->get_value_safe<int>("random", "PanphasiaMinRootResolution",512);
|
|
||||||
|
|
||||||
grid_p_ = pdescriptor_->i_base;
|
|
||||||
|
|
||||||
lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0);
|
|
||||||
// lmin
|
|
||||||
|
|
||||||
ngrid_panphasia_ = (1 << lextra_) * grid_p_;
|
|
||||||
|
|
||||||
while( ngrid_panphasia_ < ngridminsize_panphasia ){
|
|
||||||
lextra_++;
|
|
||||||
ngrid_panphasia_*=2;
|
|
||||||
}
|
|
||||||
assert( ngrid_panphasia_ >= ngridminsize_panphasia);
|
|
||||||
|
|
||||||
|
|
||||||
clear_panphasia_thread_states();
|
|
||||||
|
|
||||||
music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)",ngrid_panphasia_, lextra_);
|
|
||||||
if (ngridminsize_panphasia<512)
|
|
||||||
music::ilog.Print("PANPHASIA WARNING: PanphasiaMinRootResolution = %d below minimum recommended of 512",ngridminsize_panphasia);
|
|
||||||
music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, ngrid_panphasia_ );
|
|
||||||
|
|
||||||
coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
|
|
||||||
coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
|
|
||||||
coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_ptr<panphasia_descriptor> pdescriptor_;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit RNG_panphasia(config_file &cf) : RNG_plugin(cf)
|
explicit RNG_panphasia(config_file &cf) : RNG_plugin(cf)
|
||||||
{
|
{
|
||||||
descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
|
|
||||||
|
|
||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
num_threads_ = omp_get_max_threads();
|
num_threads_ = omp_get_max_threads();
|
||||||
|
@ -212,334 +72,118 @@ public:
|
||||||
num_threads_ = 1;
|
num_threads_ = 1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// create independent state descriptions for each thread
|
descriptor_string_ = pcf_->get_value<std::string>("random", "descriptor");
|
||||||
lstate = new pan_state_[num_threads_];
|
grid_res_ = pcf_->get_value<size_t>("setup", "GridRes");
|
||||||
|
boxlength_ = pcf_->get_value<real_t>("setup", "BoxLength");
|
||||||
|
|
||||||
// parse the descriptor for its properties
|
if( pcf_->get_value<bool>("setup", "DoFixing") ){
|
||||||
pdescriptor_ = std::make_unique<panphasia_descriptor>(descriptor_string_);
|
music::flog << "Fixing all the modes to the mean power negates any advantage of using the Panphasia field.\n";
|
||||||
|
music::flog << "With the new panphasia ho it is possible by choosing the descriptor to fix the largest modes without losing the ability to resimulate to much higher resolution.\n";
|
||||||
music::ilog.Print("PANPHASIA: descriptor \'%s\' is base %d,", pdescriptor_->name.c_str(), pdescriptor_->i_base);
|
throw std::runtime_error("PANPHASIA: incompatible parameter.");
|
||||||
|
|
||||||
// write panphasia base size into config file for the grid construction
|
|
||||||
// as the gridding unit we use the least common multiple of 2 and i_base
|
|
||||||
std::stringstream ss;
|
|
||||||
//ARJ ss << lcm(2, pdescriptor_->i_base);
|
|
||||||
//ss << two_or_largest_power_two_less_than(pdescriptor_->i_base);//ARJ
|
|
||||||
ss << 2; //ARJ - set gridding unit to two
|
|
||||||
pcf_->insert_value("setup", "gridding_unit", ss.str());
|
|
||||||
ss.str(std::string());
|
|
||||||
ss << pdescriptor_->i_base;
|
|
||||||
pcf_->insert_value("random", "base_unit", ss.str());
|
|
||||||
|
|
||||||
this->initialize_for_grid_structure();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
~RNG_panphasia() { delete[] lstate; }
|
panphasia_mode_ = 0;
|
||||||
|
PANPHASIA2::parse_and_validate_descriptor_(descriptor_string_.c_str(), &panphasia_mode_);
|
||||||
|
|
||||||
|
if (panphasia_mode_ == 0)
|
||||||
|
{
|
||||||
|
ppan1_rng_ = new PANPHASIA1::RNG(&cf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~RNG_panphasia()
|
||||||
|
{
|
||||||
|
if (panphasia_mode_ == 0)
|
||||||
|
{
|
||||||
|
delete ppan1_rng_;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool isMultiscale() const { return true; }
|
bool isMultiscale() const { return true; }
|
||||||
|
|
||||||
|
void Run_Panphasia_Highorder(Grid_FFT<real_t> &g);
|
||||||
|
|
||||||
void Fill_Grid(Grid_FFT<real_t> &g)
|
void Fill_Grid(Grid_FFT<real_t> &g)
|
||||||
{
|
{
|
||||||
auto sinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? std::sin(x) / x : 1.0; };
|
switch (panphasia_mode_)
|
||||||
auto dsinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
|
|
||||||
const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
|
|
||||||
|
|
||||||
// we will overwrite 'g', we can deallocate it while we prepare the panphasia field
|
|
||||||
g.reset();
|
|
||||||
|
|
||||||
clear_panphasia_thread_states();
|
|
||||||
|
|
||||||
// temporaries
|
|
||||||
Grid_FFT<real_t> g0({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
|
||||||
Grid_FFT<real_t> g1({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
|
||||||
Grid_FFT<real_t> g2({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
|
||||||
Grid_FFT<real_t> g3({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
|
||||||
Grid_FFT<real_t> g4({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
|
||||||
|
|
||||||
double t1 = get_wtime();
|
|
||||||
// double tp = t1;
|
|
||||||
|
|
||||||
#pragma omp parallel
|
|
||||||
{
|
{
|
||||||
#ifdef _OPENMP
|
|
||||||
const int mythread = omp_get_thread_num();
|
|
||||||
#else
|
|
||||||
const int mythread = 0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//int odd_x, odd_y, odd_z;
|
case 0: // old mode
|
||||||
//int ng_level = ngrid_ * (1 << (level - levelmin_)); // full resolution of current level
|
music::ilog << "PANPHASIA: Old descriptor" << std::endl;
|
||||||
|
ppan1_rng_->Fill(g);
|
||||||
|
break;
|
||||||
|
|
||||||
int verbosity = (mythread == 0);
|
case 1: // PANPHASIA HO descriptor
|
||||||
char descriptor[100];
|
music::ilog << "PANPHASIA: New descriptor" << std::endl;
|
||||||
std::memset(descriptor, 0, 100);
|
this->Run_Panphasia_Highorder(g);
|
||||||
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
|
break;
|
||||||
|
|
||||||
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
|
default: // unknown PANPHASIA mode
|
||||||
|
music::elog << "PANPHASIA: Something went wrong with descriptor" << std::endl;
|
||||||
{
|
abort();
|
||||||
panphasia_descriptor d(descriptor_string_);
|
break;
|
||||||
|
|
||||||
int level_p = d.wn_level_base + lextra_;
|
|
||||||
|
|
||||||
lstate[mythread].layer_min = 0;
|
|
||||||
lstate[mythread].layer_max = level_p;
|
|
||||||
lstate[mythread].indep_field = 1;
|
|
||||||
|
|
||||||
long long ix_rel[3];
|
|
||||||
ix_rel[0] = 0; //ileft_corner_p[0];
|
|
||||||
ix_rel[1] = 0; //ileft_corner_p[1];
|
|
||||||
ix_rel[2] = 0; //ileft_corner_p[2];
|
|
||||||
|
|
||||||
set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
|
|
||||||
&verbosity);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbosity)
|
|
||||||
t1 = get_wtime();
|
|
||||||
|
|
||||||
std::array<double, 9> cell_prop;
|
|
||||||
pan_state_ *ps = &lstate[mythread];
|
|
||||||
|
|
||||||
#pragma omp for //nowait
|
|
||||||
for (size_t i = 0; i < g0.size(0); i += 2)
|
|
||||||
{
|
|
||||||
const int ixmax(std::min<int>(2,g0.size(0)-i));
|
|
||||||
for (size_t j = 0; j < g0.size(1); j += 2)
|
|
||||||
{
|
|
||||||
const int iymax(std::min<int>(2,g0.size(1)-j));
|
|
||||||
for (size_t k = 0; k < g0.size(2); k += 2)
|
|
||||||
{
|
|
||||||
const int izmax(std::min<int>(2,g0.size(2)-k));
|
|
||||||
|
|
||||||
// ARJ - added inner set of loops to speed up evaluation of Panphasia
|
|
||||||
for (int ix = 0; ix < ixmax; ++ix)
|
|
||||||
{
|
|
||||||
for (int iy = 0; iy < iymax; ++iy)
|
|
||||||
{
|
|
||||||
for (int iz = 0; iz < izmax; ++iz)
|
|
||||||
{
|
|
||||||
int ilocal = i + ix;
|
|
||||||
int jlocal = j + iy;
|
|
||||||
int klocal = k + iz;
|
|
||||||
|
|
||||||
int iglobal = ilocal + g0.local_0_start_;
|
|
||||||
int jglobal = jlocal;
|
|
||||||
int kglobal = klocal;
|
|
||||||
|
|
||||||
adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
|
|
||||||
&ps->layer_max, &ps->indep_field, &cell_prop[0]);
|
|
||||||
|
|
||||||
g0.relem(ilocal, jlocal, klocal) = cell_prop[0];
|
|
||||||
g1.relem(ilocal, jlocal, klocal) = cell_prop[4];
|
|
||||||
g2.relem(ilocal, jlocal, klocal) = cell_prop[2];
|
|
||||||
g3.relem(ilocal, jlocal, klocal) = cell_prop[1];
|
|
||||||
g4.relem(ilocal, jlocal, klocal) = cell_prop[8];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // end omp parallel region
|
|
||||||
|
|
||||||
g0.FourierTransformForward();
|
|
||||||
g1.FourierTransformForward();
|
|
||||||
g2.FourierTransformForward();
|
|
||||||
g3.FourierTransformForward();
|
|
||||||
g4.FourierTransformForward();
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (size_t i = 0; i < g0.size(0); i++)
|
|
||||||
{
|
|
||||||
for (size_t j = 0; j < g0.size(1); j++)
|
|
||||||
{
|
|
||||||
for (size_t k = 0; k < g0.size(2); k++)
|
|
||||||
{
|
|
||||||
if (!g0.is_nyquist_mode(i, j, k))
|
|
||||||
{
|
|
||||||
auto kvec = g0.get_k<real_t>(i, j, k);
|
|
||||||
|
|
||||||
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
|
|
||||||
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
|
|
||||||
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
|
|
||||||
|
|
||||||
auto fx = real_t(sinc(argx));
|
|
||||||
auto gx = ccomplex_t(0.0, dsinc(argx));
|
|
||||||
auto fy = real_t(sinc(argy));
|
|
||||||
auto gy = ccomplex_t(0.0, dsinc(argy));
|
|
||||||
auto fz = real_t(sinc(argz));
|
|
||||||
auto gz = ccomplex_t(0.0, dsinc(argz));
|
|
||||||
|
|
||||||
auto temp = (fx + sqrt3 * gx) * (fy + sqrt3 * gy) * (fz + sqrt3 * gz);
|
|
||||||
auto magnitude = real_t(std::sqrt(1.0 - std::fabs(temp * temp)));
|
|
||||||
|
|
||||||
auto y0(g0.kelem(i, j, k)), y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
|
|
||||||
|
|
||||||
g0.kelem(i, j, k) = y0 * fx * fy * fz
|
|
||||||
+ sqrt3 * (y1 * gx * fy * fz + y2 * fx * gy * fz + y3 * fx * fy * gz)
|
|
||||||
+ y4 * magnitude;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
g0.kelem(i, j, k) = 0.0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// music::ilog.Print("\033[31mtiming [build panphasia field]: %f s\033[0m", get_wtime() - tp);
|
|
||||||
// tp = get_wtime();
|
|
||||||
|
|
||||||
g1.FourierTransformBackward(false);
|
|
||||||
g2.FourierTransformBackward(false);
|
|
||||||
g3.FourierTransformBackward(false);
|
|
||||||
g4.FourierTransformBackward(false);
|
|
||||||
|
|
||||||
#pragma omp parallel
|
|
||||||
{
|
|
||||||
#ifdef _OPENMP
|
|
||||||
const int mythread = omp_get_thread_num();
|
|
||||||
#else
|
|
||||||
const int mythread = 0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// int odd_x, odd_y, odd_z;
|
|
||||||
int verbosity = (mythread == 0);
|
|
||||||
char descriptor[100];
|
|
||||||
std::memset(descriptor, 0, 100);
|
|
||||||
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
|
|
||||||
|
|
||||||
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
|
|
||||||
|
|
||||||
{
|
|
||||||
panphasia_descriptor d(descriptor_string_);
|
|
||||||
|
|
||||||
int level_p = d.wn_level_base + lextra_;
|
|
||||||
|
|
||||||
lstate[mythread].layer_min = 0;
|
|
||||||
lstate[mythread].layer_max = level_p;
|
|
||||||
lstate[mythread].indep_field = 1;
|
|
||||||
|
|
||||||
long long ix_rel[3];
|
|
||||||
ix_rel[0] = 0; //ileft_corner_p[0];
|
|
||||||
ix_rel[1] = 0; //ileft_corner_p[1];
|
|
||||||
ix_rel[2] = 0; //ileft_corner_p[2];
|
|
||||||
|
|
||||||
set_phases_and_rel_origin_(&lstate[mythread], descriptor, &level_p, &ix_rel[0], &ix_rel[1], &ix_rel[2],
|
|
||||||
&verbosity);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (verbosity)
|
|
||||||
t1 = get_wtime();
|
|
||||||
|
|
||||||
//***************************************************************
|
|
||||||
// Process Panphasia values: p110, p011, p101, p111
|
|
||||||
//****************************************************************
|
|
||||||
std::array<double,9> cell_prop;
|
|
||||||
pan_state_ *ps = &lstate[mythread];
|
|
||||||
|
|
||||||
#pragma omp for //nowait
|
|
||||||
for (size_t i = 0; i < g1.size(0); i += 2)
|
|
||||||
{
|
|
||||||
const int ixmax(std::min<int>(2,g1.size(0)-i));
|
|
||||||
for (size_t j = 0; j < g1.size(1); j += 2)
|
|
||||||
{
|
|
||||||
const int iymax(std::min<int>(2,g1.size(1)-j));
|
|
||||||
for (size_t k = 0; k < g1.size(2); k += 2)
|
|
||||||
{
|
|
||||||
const int izmax(std::min<int>(2,g1.size(2)-k));
|
|
||||||
|
|
||||||
// ARJ - added inner set of loops to speed up evaluation of Panphasia
|
|
||||||
for (int ix = 0; ix < ixmax; ++ix)
|
|
||||||
{
|
|
||||||
for (int iy = 0; iy < iymax; ++iy)
|
|
||||||
{
|
|
||||||
for (int iz = 0; iz < izmax; ++iz)
|
|
||||||
{
|
|
||||||
int ilocal = i + ix;
|
|
||||||
int jlocal = j + iy;
|
|
||||||
int klocal = k + iz;
|
|
||||||
|
|
||||||
int iglobal = ilocal + g1.local_0_start_;
|
|
||||||
int jglobal = jlocal;
|
|
||||||
int kglobal = klocal;
|
|
||||||
|
|
||||||
adv_panphasia_cell_properties_(ps, &iglobal, &jglobal, &kglobal, &ps->layer_min,
|
|
||||||
&ps->layer_max, &ps->indep_field, &cell_prop[0]);
|
|
||||||
|
|
||||||
g1.relem(ilocal, jlocal, klocal) = cell_prop[6];
|
|
||||||
g2.relem(ilocal, jlocal, klocal) = cell_prop[3];
|
|
||||||
g3.relem(ilocal, jlocal, klocal) = cell_prop[5];
|
|
||||||
g4.relem(ilocal, jlocal, klocal) = cell_prop[7];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // end omp parallel region
|
|
||||||
|
|
||||||
// music::ilog.Print("\033[31mtiming [adv_panphasia_cell_properties2]: %f s \033[0m", get_wtime() - tp);
|
|
||||||
// tp = get_wtime();
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////
|
|
||||||
// transform and convolve with Legendres
|
|
||||||
g1.FourierTransformForward();
|
|
||||||
g2.FourierTransformForward();
|
|
||||||
g3.FourierTransformForward();
|
|
||||||
g4.FourierTransformForward();
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (size_t i = 0; i < g0.size(0); i++)
|
|
||||||
{
|
|
||||||
for (size_t j = 0; j < g0.size(1); j++)
|
|
||||||
{
|
|
||||||
for (size_t k = 0; k < g0.size(2); k++)
|
|
||||||
{
|
|
||||||
if (!g0.is_nyquist_mode(i, j, k))
|
|
||||||
{
|
|
||||||
auto kvec = g0.get_k<real_t>(i, j, k);
|
|
||||||
|
|
||||||
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
|
|
||||||
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
|
|
||||||
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
|
|
||||||
|
|
||||||
auto fx = real_t(sinc(argx));
|
|
||||||
auto gx = ccomplex_t(0.0, dsinc(argx));
|
|
||||||
auto fy = real_t(sinc(argy));
|
|
||||||
auto gy = ccomplex_t(0.0, dsinc(argy));
|
|
||||||
auto fz = real_t(sinc(argz));
|
|
||||||
auto gz = ccomplex_t(0.0, dsinc(argz));
|
|
||||||
|
|
||||||
auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
|
|
||||||
|
|
||||||
g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
|
|
||||||
|
|
||||||
// do final phase shift to account for corner centered coordinates vs. cell centers
|
|
||||||
auto phase_shift = - 0.5 * M_PI * ( kvec[0] / g0.kny_[0]
|
|
||||||
+ kvec[1] /g0.kny_[1] + kvec[2] / g0.kny_[2]);
|
|
||||||
|
|
||||||
g0.kelem(i, j, k) *= std::exp( ccomplex_t(0,phase_shift) );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
g1.reset();
|
|
||||||
g2.reset();
|
|
||||||
g3.reset();
|
|
||||||
g4.reset();
|
|
||||||
|
|
||||||
g.allocate();
|
|
||||||
g0.FourierInterpolateCopyTo( g );
|
|
||||||
|
|
||||||
music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
|
|
||||||
1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
|
|
||||||
music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std());
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void RNG_panphasia::Run_Panphasia_Highorder(Grid_FFT<real_t> &g)
|
||||||
|
{
|
||||||
|
int verbose = 0;
|
||||||
|
int error;
|
||||||
|
size_t x0 = 0, y0 = 0, z0 = 0;
|
||||||
|
size_t rel_level;
|
||||||
|
int fdim = 2; //Option to scale Fourier grid dimension relative to Panphasia coefficient grid
|
||||||
|
|
||||||
|
//char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
|
||||||
|
|
||||||
|
PANPHASIA2::PANPHASIA_init_descriptor_(descriptor_string_.c_str(), &verbose);
|
||||||
|
|
||||||
|
printf("Descriptor %s\n ngrid_load %lu\n", descriptor_string_.c_str(), grid_res_);
|
||||||
|
|
||||||
|
// Choose smallest value of level to equal of exceed grid_res_)
|
||||||
|
|
||||||
|
for (rel_level = 0; fdim * (PANPHASIA2::descriptor_base_size <<rel_level) < grid_res_; rel_level++);
|
||||||
|
|
||||||
|
printf("Setting relative level = %lu\n", rel_level);
|
||||||
|
|
||||||
|
if ((error = PANPHASIA2::PANPHASIA_init_level_(&rel_level, &x0, &y0, &z0, &verbose)))
|
||||||
|
{
|
||||||
|
printf("Abort: PANPHASIA_init_level_ :error code %d\n", error);
|
||||||
|
abort();
|
||||||
|
};
|
||||||
|
|
||||||
|
//======================= FFTW ==============================
|
||||||
|
|
||||||
|
ptrdiff_t alloc_local, local_n0, local_0_start;
|
||||||
|
|
||||||
|
size_t N0 = fdim * (PANPHASIA2::descriptor_base_size << rel_level);
|
||||||
|
|
||||||
|
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0, N0, N0 + 2, MPI_COMM_WORLD, &local_n0, &local_0_start);
|
||||||
|
|
||||||
|
Grid_FFT<real_t> pan_grid({{N0, N0, N0}}, {{boxlength_, boxlength_, boxlength_}});
|
||||||
|
|
||||||
|
_unused(alloc_local);
|
||||||
|
|
||||||
|
assert(pan_grid.n_[0] == N0);
|
||||||
|
assert(pan_grid.n_[1] == N0);
|
||||||
|
assert(pan_grid.n_[2] == N0);
|
||||||
|
assert(pan_grid.local_0_start_ == local_0_start);
|
||||||
|
assert(pan_grid.local_0_size_ == local_n0);
|
||||||
|
assert(pan_grid.ntot_ == size_t(alloc_local));
|
||||||
|
|
||||||
|
pan_grid.FourierTransformForward(false);
|
||||||
|
|
||||||
|
FFTW_COMPLEX *Panphasia_White_Noise_Field = reinterpret_cast<FFTW_COMPLEX *>(&pan_grid.data_[0]);
|
||||||
|
|
||||||
|
// Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
|
||||||
|
|
||||||
|
if ((error = PANPHASIA2::PANPHASIA_compute_kspace_field_(rel_level, N0, local_n0, local_0_start, Panphasia_White_Noise_Field)))
|
||||||
|
{
|
||||||
|
music::elog << "Error code from PANPHASIA_compute ... (ErrCode = " << error << ")" << std::endl;
|
||||||
|
};
|
||||||
|
|
||||||
|
pan_grid.FourierInterpolateCopyTo(g);
|
||||||
|
}
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
RNG_plugin_creator_concrete<RNG_panphasia> creator("PANPHASIA");
|
RNG_plugin_creator_concrete<RNG_panphasia> creator("PANPHASIA");
|
||||||
|
|
Loading…
Reference in a new issue