mirror of
https://github.com/cosmo-sims/monofonIC.git
synced 2024-09-19 17:03:45 +02:00
first commit PANPHASIA_HO
This commit is contained in:
parent
1ef2b417e3
commit
b567ba8e68
25 changed files with 8951 additions and 0 deletions
44
external/panphasia_ho/PAN_FFTW3.h
vendored
Normal file
44
external/panphasia_ho/PAN_FFTW3.h
vendored
Normal file
|
@ -0,0 +1,44 @@
|
|||
// Define macros for FFTW3 to allow swapping
// between single/double precision FTs
//
// Every FFTW_* name below expands to the FFTW3 symbol of the selected
// precision: the fftw_ family when FOURIER_DOUBLE is defined, the fftwf_
// family otherwise.  Nothing here includes fftw3.h; the including file is
// expected to do that itself.

#ifndef PAN_FFTW3_H
#define PAN_FFTW3_H

// Double precision is the shipped default.  The guard lets a build that
// already passes -DFOURIER_DOUBLE include this header without triggering a
// macro-redefinition diagnostic.  Remove these three lines to select the
// single-precision (fftwf_) branch.
#ifndef FOURIER_DOUBLE
#define FOURIER_DOUBLE
#endif

#ifdef FOURIER_DOUBLE

#define FFTW_REAL double
#define FFTW_COMPLEX fftw_complex
#define FFTW_PLAN fftw_plan

#define FFTW_MALLOC fftw_malloc
#define FFTW_ALLOC_COMPLEX fftw_alloc_complex
#define FFTW_FREE fftw_free

#define FFTW_PLAN_DFT_1D fftw_plan_dft_1d
#define FFTW_PLAN_dft_3D fftw_plan_dft_3d
#define FFTW_PLAN_MANY_DFT fftw_plan_many_dft
#define FFTW_EXECUTE fftw_execute
#define FFTW_DESTROY_PLAN fftw_destroy_plan

#define FFTW_MPI_LOCAL_SIZE_MANY fftw_mpi_local_size_many
#define FFTW_MPI_LOCAL_SIZE_3D fftw_mpi_local_size_3d
#define FFTW_MPI_PLAN_MANY_DTF fftw_mpi_plan_many_dft
#define FFTW_MPI_PLAN_MANY_DTF_R2C fftw_mpi_plan_many_dft_r2c
#define FFTW_MPI_EXECUTE_DFT fftw_mpi_execute_dft
#define FFTW_MPI_EXECUTE_DFT_R2C fftw_mpi_execute_dft_r2c

#else

#define FFTW_REAL float
#define FFTW_COMPLEX fftwf_complex
#define FFTW_PLAN fftwf_plan

#define FFTW_MALLOC fftwf_malloc
#define FFTW_ALLOC_COMPLEX fftwf_alloc_complex
#define FFTW_FREE fftwf_free

#define FFTW_PLAN_DFT_1D fftwf_plan_dft_1d
#define FFTW_PLAN_dft_3D fftwf_plan_dft_3d
#define FFTW_PLAN_MANY_DFT fftwf_plan_many_dft
#define FFTW_EXECUTE fftwf_execute
#define FFTW_DESTROY_PLAN fftwf_destroy_plan

#define FFTW_MPI_LOCAL_SIZE_MANY fftwf_mpi_local_size_many
#define FFTW_MPI_LOCAL_SIZE_3D fftwf_mpi_local_size_3d
#define FFTW_MPI_PLAN_MANY_DTF fftwf_mpi_plan_many_dft
#define FFTW_MPI_PLAN_MANY_DTF_R2C fftwf_mpi_plan_many_dft_r2c
#define FFTW_MPI_EXECUTE_DFT fftwf_mpi_execute_dft
#define FFTW_MPI_EXECUTE_DFT_R2C fftwf_mpi_execute_dft_r2c

#endif

// Consistently spelled aliases.  The historical names above (mixed-case
// "dft_3D", and "DTF" where "DFT" was meant) are kept so existing callers
// continue to compile; new code can use these spellings instead.
#define FFTW_PLAN_DFT_3D FFTW_PLAN_dft_3D
#define FFTW_MPI_PLAN_MANY_DFT FFTW_MPI_PLAN_MANY_DTF
#define FFTW_MPI_PLAN_MANY_DFT_R2C FFTW_MPI_PLAN_MANY_DTF_R2C

#endif // PAN_FFTW3_H
|
143
external/panphasia_ho/README
vendored
Normal file
143
external/panphasia_ho/README
vendored
Normal file
|
@ -0,0 +1,143 @@
|
|||
|
||||
|
||||
modules on COSMA7
|
||||
|
||||
intel_comp/2018 fftw/3.3.9cosma7
|
||||
intel_mpi/2018 gsl/2.5
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
The code calls a function to generate the k-space modes for
|
||||
a portion of the Panphasia field given an input descriptor.
|
||||
|
||||
|
||||
|
||||
|
||||
Should be called early before significant memory is allocated. It
|
||||
uses quite a bit of memory itself, but tidies up afterwards.
|
||||
|
||||
Has OpenMP - -DUSE_OPENMP in the makefile
|
||||
|
||||
|
||||
|
||||
The routines support both single and double precision
|
||||
calculations in two senses.
|
||||
|
||||
The Fourier computations can be single or double precision
|
||||
|
||||
MACROs FFTW_REAL/FFTW_COMPLEX used to define 'Fourier' precision types
|
||||
float or double.
|
||||
|
||||
The Panphasia coefficients can be single or double precision
|
||||
|
||||
MACROs PAN_REAL/PAN_COMPLEX define the Panphasia precision - either
|
||||
float or double.
|
||||
|
||||
|
||||
To change the Fourier precision edit PAN_FFTW3.h - by default
|
||||
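single precision unless 'FOURIER_DOUBLE' is defined. Note that PAN_FFTW3.h
as distributed defines 'FOURIER_DOUBLE' itself, so the Fourier transforms are
double precision unless that define is removed.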
|
||||
|
||||
To change the Panphasia precision edit panphasia_functions.h - by default
|
||||
single precision unless 'PAN_DOUBLE_PRECISION' is defined.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Code description
|
||||
-------------------
|
||||
|
||||
makefile
|
||||
|
||||
CODE
|
||||
----
|
||||
|
||||
|
||||
main.c - demo program only
|
||||
|
||||
pan_mpi_routines.c - contains MPI calls
|
||||
|
||||
|
||||
|
||||
high_order_panphasia_routines.c - serial - contains some OpenMP
|
||||
uniform_rand_threefry4x64.c - serial - random generator and tests
|
||||
|
||||
Include files
|
||||
--------------
|
||||
|
||||
panphasia_functions.h
|
||||
|
||||
PAN_FFTW3.h - MACROS for single/double precision FTs
|
||||
|
||||
pan_matrices_order6.h - matrix coefficients for 6th order scheme.
|
||||
|
||||
|
||||
threefry.h - Random generator
|
||||
array.h + features array .h files
|
||||
|
||||
|
||||
|
||||
|
||||
Development notes:
|
||||
|
||||
----------------------------------------------------
|
||||
|
||||
14th April 2021
|
||||
|
||||
|
||||
|
||||
Found a bug in the OpenMP version. Different numbers
|
||||
of threads led to a subset of Fourier modes having
|
||||
different values. The precise differences changed
|
||||
each time the code was run.
|
||||
|
||||
Debugged by turning off OpenMP section by section.
|
||||
The section which uses the spherical bessel functions
|
||||
turned out to be responsible.
|
||||
|
||||
The faulty version collapsed all 4 loops over
|
||||
multipole,x,y,z. Changing this to a loop
|
||||
over multipoles, and collapsing 3 coordinate
|
||||
loops solved the problem.
|
||||
|
||||
The variable index1 of the return field
|
||||
does not depend on the multipole, while
|
||||
index2 does. Both index1 and index2 are
|
||||
private. This means the return array
|
||||
(index 1) is updated several times.
|
||||
Presumably as these updates occur
|
||||
in parallel with the 4 loop collapsed
|
||||
version the return array was being
|
||||
corrupted sometimes.
|
||||
|
||||
|
||||
15th April
|
||||
------------
|
||||
|
||||
This version supersedes the version given to
|
||||
Oliver to add to MonofonIC clone.
|
||||
|
||||
Main difference is additional OpenMP
|
||||
statements and the ability to specify
|
||||
in the descriptor that modes less
|
||||
than or equal to some dimensionless
|
||||
integer wavenumber squared are set
|
||||
to the mean power.
|
||||
|
||||
Tested output on 1 core - with/without
|
||||
OpenMP. Not tested with more than
|
||||
1 MPI rank.
|
||||
|
||||
|
||||
|
||||
|
||||
|
326
external/panphasia_ho/array.h
vendored
Normal file
326
external/panphasia_ho/array.h
vendored
Normal file
|
@ -0,0 +1,326 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef _r123array_dot_h__
|
||||
#define _r123array_dot_h__
|
||||
#include "features/compilerfeatures.h"
|
||||
#include "features/sse.h"
|
||||
|
||||
#ifndef __cplusplus
|
||||
#define CXXMETHODS(_N, W, T)
|
||||
#define CXXOVERLOADS(_N, W, T)
|
||||
#else
|
||||
|
||||
#include <stddef.h>
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
|
||||
/** @defgroup arrayNxW The r123arrayNxW classes
|
||||
|
||||
Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers.
|
||||
It is functionally equivalent to the C++0x std::array<uintW_t, N>,
|
||||
but does not require C++0x features or libraries.
|
||||
|
||||
In addition to meeting most of the requirements of a Container,
|
||||
it also has a member function, incr(), which increments the zero-th
|
||||
element and carries overflows into higher indexed elements. Thus,
|
||||
by using incr(), sequences of up to 2^(N*W) distinct values
|
||||
can be produced.
|
||||
|
||||
If SSE is supported by the compiler, then the class
|
||||
r123array1xm128i is also defined, in which the data member is an
|
||||
array of one r123128i object.
|
||||
|
||||
@cond HIDDEN_FROM_DOXYGEN
|
||||
*/
|
||||
|
||||
template <typename value_type>
|
||||
inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
|
||||
value_type v=0;
|
||||
for(size_t i=0; i<(3+sizeof(value_type))/4; ++i)
|
||||
v |= ((value_type)(*p32++)) << (32*i);
|
||||
return v;
|
||||
}
|
||||
|
||||
// Work-alike methods and typedefs modeled on std::array.
//
// CXXMETHODS(_N, W, T) is expanded inside the body of struct r123array<_N>x<W>
// and relies on that struct having a data member `T v[_N]`.
//   _N - number of elements
//   W  - width tag used only to paste the struct name together
//   T  - element type
// It provides the usual container typedefs and accessors, element-wise
// ==/!=, fill(), swap(), a multi-word counter increment incr(), and a static
// seed() that fills an array from a SeedSeq-like object.
// NOTE: the expansion ends inside a `protected:` section (incr_carefully is
// protected), so any member declared after CXXMETHODS in the struct is
// protected as well.
#define CXXMETHODS(_N, W, T) \
    typedef T value_type; \
    typedef T* iterator; \
    typedef const T* const_iterator; \
    typedef value_type& reference; \
    typedef const value_type& const_reference; \
    typedef size_t size_type; \
    typedef ptrdiff_t difference_type; \
    typedef T* pointer; \
    typedef const T* const_pointer; \
    typedef std::reverse_iterator<iterator> reverse_iterator; \
    typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \
    /* Boost.array has static_size.  C++11 specializes tuple_size */ \
    enum {static_size = _N}; \
    R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \
    R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \
    R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
    R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
    R123_CUDA_DEVICE size_type size() const { return _N; } \
    R123_CUDA_DEVICE size_type max_size() const { return _N; } \
    R123_CUDA_DEVICE bool empty() const { return _N==0; } \
    R123_CUDA_DEVICE iterator begin() { return &v[0]; } \
    R123_CUDA_DEVICE iterator end() { return &v[_N]; } \
    R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \
    R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \
    R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \
    R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \
    R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \
    R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \
    R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \
    R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \
    R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \
    R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \
    R123_CUDA_DEVICE pointer data(){ return &v[0]; } \
    R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \
    R123_CUDA_DEVICE reference front(){ return v[0]; } \
    R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \
    R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \
    R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \
    R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \
        /* CUDA3 does not have std::equal */ \
        for (size_t i = 0; i < _N; ++i) \
            if (v[i] != rhs.v[i]) return false; \
        return true; \
    } \
    R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \
    /* CUDA3 does not have std::fill_n */ \
    R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \
    R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \
        /* CUDA3 does not have std::swap_ranges */ \
        for (size_t i = 0; i < _N; ++i) { \
            T tmp = v[i]; \
            v[i] = rhs.v[i]; \
            rhs.v[i] = tmp; \
        } \
    } \
    /* incr(n): add n to the array viewed as one multi-word counter with */ \
    /* v[0] as the least significant word; carries go to higher indices. */ \
    R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \
        /* This test is tricky because we're trying to avoid spurious   \
           complaints about illegal shifts, yet still be compile-time   \
           evaluated. */ \
        if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \
            return incr_carefully(n); \
        if(n==1){ \
            ++v[0]; \
            if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this; \
        }else{ \
            v[0] += n; \
            if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this; \
        } \
        /* We expect that the N==?? tests will be \
           constant-folded/optimized away by the compiler, so only the \
           overflow tests (!!v[i]) remain to be done at runtime.  For  \
           small values of N, it would be better to do this as an \
           unconditional sequence of adc.  An experiment/optimization \
           for another day...                                \
           N.B.  The weird subscripting: v[_N>3?3:0] is to silence \
           a spurious error from icpc \
           */ \
        ++v[_N>1?1:0]; \
        if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \
        ++v[_N>2?2:0]; \
        if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \
        ++v[_N>3?3:0]; \
        for(size_t i=4; i<_N; ++i){ \
            if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \
            ++v[i]; \
        } \
        return *this; \
    } \
    /* seed(SeedSeq) would be a constructor if having a constructor */ \
    /* didn't cause headaches with defaults */ \
    template <typename SeedSeq> \
    R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \
        r123array##_N##x##W ret; \
        const size_t Ngen = _N*((3+sizeof(value_type))/4); \
        uint32_t u32[Ngen]; \
        uint32_t *p32 = &u32[0]; \
        ss.generate(&u32[0], &u32[Ngen]); \
        for(size_t i=0; i<_N; ++i){ \
            ret.v[i] = assemble_from_u32<value_type>(p32); \
            p32 += (3+sizeof(value_type))/4; \
        } \
        return ret; \
    } \
protected: \
    R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \
        /* n may be greater than the maximum value of a single value_type */ \
        value_type vtn; \
        vtn = n; \
        v[0] += n; \
        const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \
        for(size_t i=1; i<_N; ++i){ \
            if(rshift){ \
                n >>= rshift; \
            }else{ \
                n=0; \
            } \
            /* v[i-1] ended up below what was added to it: it wrapped */ \
            if( v[i-1] < vtn ) \
                ++n; \
            if( n==0 ) break; \
            vtn = n; \
            v[i] += n; \
        } \
        return *this; \
    }
|
||||
|
||||
|
||||
// There are several tricky considerations for the insertion and extraction
|
||||
// operators:
|
||||
// - we would like to be able to print r123array16x8 as a sequence of 16 integers,
|
||||
// not as 16 bytes.
|
||||
// - we would like to be able to print r123array1xm128i.
|
||||
// - we do not want an int conversion operator in r123m128i because it causes
|
||||
// lots of ambiguity problems with automatic promotions.
|
||||
// Solution: r123arrayinsertable and r123arrayextractable
|
||||
|
||||
// Thin wrapper holding a const reference to one array element so that the
// element can be streamed with operator<<.  The generic form forwards to the
// element's own operator<<.
template<typename T>
struct r123arrayinsertable{
    const T& v;
    r123arrayinsertable(const T& elem) : v(elem) {}
    friend std::ostream& operator<<(std::ostream& out, const r123arrayinsertable<T>& w){
        out << w.v;
        return out;
    }
};

// uint8_t elements are widened to int before insertion so they are written
// as numbers rather than as single characters.
template<>
struct r123arrayinsertable<uint8_t>{
    const uint8_t& v;
    r123arrayinsertable(const uint8_t& elem) : v(elem) {}
    friend std::ostream& operator<<(std::ostream& out, const r123arrayinsertable<uint8_t>& w){
        const int widened = (int)w.v;
        out << widened;
        return out;
    }
};
|
||||
|
||||
// Thin wrapper holding a mutable reference to one array element so that the
// element can be read with operator>>.  The generic form forwards to the
// element's own operator>>.
template<typename T>
struct r123arrayextractable{
    T& v;
    r123arrayextractable(T& t_) : v(t_) {}
    friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){
        return is >> t.v;
    }
};

// uint8_t elements are read through an int so that the input is parsed as a
// number rather than as a single character.
template<>
struct r123arrayextractable<uint8_t>{
    uint8_t& v;
    r123arrayextractable(uint8_t& t_) : v(t_) {}
    friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){
        /* Start from 0: with pre-C++11 stream semantics a failed extraction
           leaves the target untouched, and the assignment below would then
           read an indeterminate value.  C++11 streams store 0 on failure,
           so the result is the same on every standard level. */
        int i = 0;
        is >> i;
        t.v = i;
        return is;
    }
};
|
||||
|
||||
// CXXOVERLOADS(_N, W, T) emits, for struct r123array<_N>x<W>:
//   - operator<< : writes the _N elements separated by single spaces,
//                  each through r123arrayinsertable<T>;
//   - operator>> : reads the _N elements in index order, each through
//                  r123arrayextractable<T>;
//   - r123::Array<_N>x<W>, a typedef for the struct.
#define CXXOVERLOADS(_N, W, T) \
    inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \
        os << r123arrayinsertable<T>(a.v[0]); \
        size_t k = 1; \
        while(k<_N){ \
            os << " "; \
            os << r123arrayinsertable<T>(a.v[k]); \
            ++k; \
        } \
        return os; \
    } \
    inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \
        size_t k = 0; \
        while(k<_N){ \
            r123arrayextractable<T> slot(a.v[k]); \
            is >> slot; \
            ++k; \
        } \
        return is; \
    } \
    namespace r123{ \
        typedef r123array##_N##x##W Array##_N##x##W; \
    }
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* _r123array_tpl expands to a declaration of struct r123arrayNxW.
|
||||
|
||||
In C, it's nothing more than a struct containing an array of N
|
||||
objects of type T.
|
||||
|
||||
In C++ it's the same, but endowed with an assortment of member
|
||||
functions, typedefs and friends. In C++, r123arrayNxW looks a lot
|
||||
like std::array<T,N>, has most of the capabilities of a container,
|
||||
and satisfies the requirements outlined in compat/Engine.hpp for
|
||||
counter and key types. ArrayNxW, in the r123 namespace is
|
||||
a typedef equivalent to r123arrayNxW.
|
||||
*/
|
||||
|
||||
/* _r123array_tpl(_N, W, T) declares struct r123array<_N>x<W> holding
   `T v[_N]`, then expands CXXOVERLOADS for it.

   The data member is declared ahead of CXXMETHODS on purpose: in C++ that
   macro finishes inside a `protected:` section, while the stream operators
   produced by CXXOVERLOADS read `a.v` from outside the struct. */
#define _r123array_tpl(_N, W, T) \
    /** @ingroup arrayNxW */ \
    /** @see arrayNxW */ \
    struct r123array##_N##x##W{ \
        T v[_N]; \
        CXXMETHODS(_N, W, T) \
    }; \
    CXXOVERLOADS(_N, W, T)
|
||||
|
||||
/** @endcond */
|
||||
|
||||
_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */
|
||||
_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */
|
||||
_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */
|
||||
_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */
|
||||
|
||||
_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */
|
||||
_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */
|
||||
_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */
|
||||
|
||||
_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */
|
||||
|
||||
#if R123_USE_SSE
|
||||
_r123array_tpl(1, m128i, r123m128i) /* r123array1xm128i for ARSni, AESni */
|
||||
#endif
|
||||
|
||||
/* In C++, it's natural to use sizeof(a::value_type), but in C it's
|
||||
pretty convoluted to figure out the width of the value_type of an
|
||||
r123arrayNxW:
|
||||
*/
|
||||
/* R123_W(a): bit width of one element of array type `a`, taken from the size
   of a->v[0] via a null pointer that is only used inside sizeof (never
   dereferenced).  Counts 8 bits per byte. */
#define R123_W(a) (sizeof(*((a *)0)->v) * 8)
|
||||
|
||||
/** @namespace r123
|
||||
Most of the Random123 C++ API is contained in the r123 namespace.
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
33
external/panphasia_ho/c7_script_threaded
vendored
Executable file
33
external/panphasia_ho/c7_script_threaded
vendored
Executable file
|
@ -0,0 +1,33 @@
|
|||
#!/bin/bash -l
#
# SLURM batch script: runs ./pan_fftw3_test_code.x under Intel MPI on the
# cosma7 partition, with one MPI rank per task requested below.

#SBATCH --ntasks 5
#SBATCH -J Test_MPI_FFTW
#SBATCH -o standard_output_file.%J.out
#SBATCH -e standard_error_file.%J.err
#SBATCH -p cosma7
#SBATCH -A dp004
#SBATCH --exclusive
#SBATCH -t 00:05:00
#SBATCH --mail-type=END # notifications for job
#SBATCH --mail-user=a.r.jenkins@durham.ac.uk

module purge
module load intel_comp/2018 intel_mpi/2018 fftw/3.3.9cosma7 gsl/2.5 hdf5/1.8.20

# Run the program.
# SLURM_NTASKS is set by SLURM inside a job.  Quote it, and stop with a clear
# message if it is missing (e.g. the script was run directly instead of via
# sbatch); otherwise mpirun would take the program name as the -n argument.
mpirun -l -env I_MPI_PIN=1 -env I_MPI_PIN_PROCESSOR_LIST=allcores \
    -n "${SLURM_NTASKS:?SLURM_NTASKS is not set - submit this script with sbatch}" \
    ./pan_fftw3_test_code.x
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
93
external/panphasia_ho/features/clangfeatures.h
vendored
Normal file
93
external/panphasia_ho/features/clangfeatures.h
vendored
Normal file
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
Copyright 2010-2016, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef __clangfeatures_dot_hpp
|
||||
#define __clangfeatures_dot_hpp
|
||||
|
||||
#ifndef R123_USE_X86INTRIN_H
|
||||
#if (defined(__x86_64__)||defined(__i386__))
|
||||
#define R123_USE_X86INTRIN_H 1
|
||||
#else
|
||||
#define R123_USE_X86INTRIN_H 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
|
||||
#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_STATIC_ASSERT
|
||||
#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert)
|
||||
#endif
|
||||
|
||||
// With clang-3.6, -Wall warns about unused-local-typedefs.
|
||||
// The "obvious" thing to do is to ignore -Wunused-local-typedefs,
|
||||
// but that doesn't work because earlier versions of clang blow
|
||||
// up on an 'unknown warning group'. So we briefly ignore -Wall...
|
||||
// It's tempting to just give up on static assertions in pre-c++11 code.
|
||||
/* Fallback R123_STATIC_ASSERT, used when the C++11 static_assert path is off
   and no definition has been supplied already.  It declares a char-array
   typedef whose length is 1 when expr is true and -1 (ill-formed) when it is
   false; msg is not used.  The typedef is wrapped in clang diagnostic
   push/ignored "-Wall"/pop pragmas.  The caller supplies the terminating
   semicolon. */
#if !(R123_USE_CXX11_STATIC_ASSERT || defined(R123_STATIC_ASSERT))
#define R123_STATIC_ASSERT(expr, msg) \
    _Pragma("clang diagnostic push") \
    _Pragma("clang diagnostic ignored \"-Wall\"") \
    typedef char static_assertion[(expr) ? 1 : -1] \
    _Pragma("clang diagnostic pop")
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_CONSTEXPR
|
||||
#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
|
||||
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions)
|
||||
#endif
|
||||
|
||||
// With clang-3.0, the apparently simpler:
|
||||
// #define R123_USE_CXX11_RANDOM __has_include(<random>)
|
||||
// dumps core.
|
||||
#ifndef R123_USE_CXX11_RANDOM
|
||||
#if __cplusplus>=201103L && __has_include(<random>)
|
||||
#define R123_USE_CXX11_RANDOM 1
|
||||
#else
|
||||
#define R123_USE_CXX11_RANDOM 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_TYPE_TRAITS
|
||||
#if __cplusplus>=201103L && __has_include(<type_traits>)
|
||||
#define R123_USE_CXX11_TYPE_TRAITS 1
|
||||
#else
|
||||
#define R123_USE_CXX11_TYPE_TRAITS 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "gccfeatures.h"
|
||||
|
||||
#endif
|
343
external/panphasia_ho/features/compilerfeatures.h
vendored
Normal file
343
external/panphasia_ho/features/compilerfeatures.h
vendored
Normal file
|
@ -0,0 +1,343 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/**
|
||||
|
||||
@page porting Preprocessor symbols for porting Random123 to different platforms.
|
||||
|
||||
The Random123 library is portable across C, C++, CUDA, OpenCL environments,
|
||||
and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris).
|
||||
This level of portability requires the abstraction of some features
|
||||
and idioms that are either not standardized (e.g., asm statements), or for which
|
||||
different vendors have their own standards (e.g., SSE intrinsics) or for
|
||||
which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>).
|
||||
|
||||
Random123/features/compilerfeatures.h
|
||||
conditionally includes a compiler-or-OS-specific Random123/features/XXXfeatures.h file which
|
||||
defines appropriate values for the preprocessor symbols which can be used with
|
||||
a specific compiler or OS. Those symbols will then
|
||||
be used by other header files and source files in the Random123
|
||||
library (and may be used by applications) to control what actually
|
||||
gets presented to the compiler.
|
||||
|
||||
Most of the symbols are boolean valued. In general, they will
|
||||
\b always be defined with value either 1 or 0, so do
|
||||
\b NOT use \#ifdef. Use \#if R123_USE_SOMETHING instead.
|
||||
|
||||
Library users can override any value by defining the pp-symbol with a compiler option,
|
||||
e.g.,
|
||||
|
||||
cc -DR123_USE_MULHILO64_C99
|
||||
|
||||
will use a strictly c99 version of the full-width 64x64->128-bit multiplication
|
||||
function, even if it would be disabled by default.
|
||||
|
||||
All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_
|
||||
@verbatim
|
||||
AES_NI
|
||||
AES_OPENSSL
|
||||
SSE4_2
|
||||
SSE4_1
|
||||
SSE
|
||||
|
||||
STD_RANDOM
|
||||
|
||||
GNU_UINT128
|
||||
ASM_GNU
|
||||
ASM_MSASM
|
||||
|
||||
CPUID_MSVC
|
||||
|
||||
CXX11_RANDOM
|
||||
CXX11_TYPE_TRAITS
|
||||
CXX11_STATIC_ASSERT
|
||||
CXX11_CONSTEXPR
|
||||
CXX11_UNRESTRICTED_UNIONS
|
||||
CXX11_EXPLICIT_CONVERSIONS
|
||||
CXX11_LONG_LONG
|
||||
CXX11_STD_ARRAY
|
||||
CXX11
|
||||
|
||||
X86INTRIN_H
|
||||
IA32INTRIN_H
|
||||
XMMINTRIN_H
|
||||
EMMINTRIN_H
|
||||
SMMINTRIN_H
|
||||
WMMINTRIN_H
|
||||
INTRIN_H
|
||||
|
||||
MULHILO32_ASM
|
||||
MULHILO64_ASM
|
||||
MULHILO64_MSVC_INTRIN
|
||||
MULHILO64_CUDA_INTRIN
|
||||
MULHILO64_OPENCL_INTRIN
|
||||
MULHILO64_C99
|
||||
|
||||
U01_DOUBLE
|
||||
|
||||
@endverbatim
|
||||
Most have obvious meanings. Some non-obvious ones:
|
||||
|
||||
AES_NI and AES_OPENSSL are not mutually exclusive. You can have one,
|
||||
both or neither.
|
||||
|
||||
GNU_UINT128 says that it's safe to use __uint128_t, but it
|
||||
does not require its use. In particular, it should be
|
||||
used in mulhilo<uint64_t> only if MULHILO64_ASM is unset.
|
||||
|
||||
If the XXXINTRIN_H macros are true, then one should
|
||||
@code
|
||||
#include <xxxintrin.h>
|
||||
@endcode
|
||||
to gain access to compiler intrinsics.
|
||||
|
||||
The CXX11_SOME_FEATURE macros allow the code to use specific
|
||||
features of the C++11 language and library.
|
||||
In the absence of a specific CXX11_SOME_FEATURE, the feature
|
||||
is controlled by the catch-all R123_USE_CXX11 macro.
|
||||
|
||||
U01_DOUBLE defaults on, and can be turned off (set to 0)
|
||||
if one does not want the utility functions that convert to double
|
||||
(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension.
|
||||
|
||||
There are a number of invariants that are always true. Application code may
|
||||
choose to rely on these:
|
||||
|
||||
<ul>
|
||||
<li>ASM_GNU and ASM_MSASM are mutually exclusive
|
||||
<li>The "higher" SSE values imply the lower ones.
|
||||
</ul>
|
||||
|
||||
There are also non-boolean valued symbols:
|
||||
|
||||
<ul>
|
||||
<li>R123_STATIC_INLINE -
|
||||
According to both C99 and GNU99, the 'static inline' declaration allows
|
||||
the compiler to not emit code if the function is not used.
|
||||
Note that the semantics of 'inline', 'static' and 'extern' in
|
||||
gcc have changed over time and are subject to modification by
|
||||
command line options, e.g., -std=gnu89, -fgnu-inline.
|
||||
Nevertheless, it appears that the meaning of 'static inline'
|
||||
has not changed over time and (with a little luck) the use of 'static inline'
|
||||
here will be portable between versions of gcc and to other C99
|
||||
compilers.
|
||||
See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html
|
||||
http://www.greenend.org.uk/rjk/2003/03/inline.html
|
||||
|
||||
<li>R123_FORCE_INLINE(decl) -
|
||||
which expands to 'decl', adorned with the compiler-specific
|
||||
embellishments to strongly encourage that the declared function be
|
||||
inlined. If there is no such compiler-specific magic, it should
|
||||
expand to decl, unadorned.
|
||||
|
||||
<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with
|
||||
sufficiently similar semantics) when CUDA is in use, and expands
|
||||
to nothing in other cases.
|
||||
|
||||
<li>R123_METAL_THREAD_ADDRESS_SPACE - which expands to 'thread' (or
|
||||
something else with sufficiently similar semantics) when compiling a
|
||||
Metal kernel, and expands to nothing in other cases.
|
||||
|
||||
<li>R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at
|
||||
all if we're in an environment so feature-poor that you can't even
|
||||
call assert (I'm looking at you, CUDA and OpenCL), or even include
|
||||
assert.h safely (OpenCL).
|
||||
|
||||
<li>R123_STATIC_ASSERT(expr,msg) - which expands to
|
||||
static_assert(expr,msg), or to an expression that
|
||||
will raise a compile-time exception if expr is not true.
|
||||
|
||||
<li>R123_ULONG_LONG - which expands to a declaration of the longest available
|
||||
unsigned integer.
|
||||
|
||||
<li>R123_64BIT(x) - expands to something equivalent to
|
||||
UINT64_C(x) from <stdint.h>, even in environments where <stdint.h>
|
||||
is not available, e.g., MSVC and OpenCL.
|
||||
|
||||
<li>R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with
|
||||
the semantics of gcc's __builtin_expect(expr,likely_value). If
|
||||
the environment has nothing like __builtin_expect, it should expand
|
||||
to just expr.
|
||||
</ul>
|
||||
|
||||
|
||||
\cond HIDDEN_FROM_DOXYGEN
|
||||
*/
|
||||
|
||||
/*
|
||||
N.B. When something is added to the list of features, it should be
|
||||
added to each of the *features.h files, AND to examples/ut_features.cpp.
|
||||
*/
|
||||
|
||||
/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */
|
||||
#if defined(__METAL_MACOS__)
|
||||
#include "metalfeatures.h"
|
||||
#elif defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
|
||||
#include "openclfeatures.h"
|
||||
#elif defined(__CUDACC__)
|
||||
#include "nvccfeatures.h"
|
||||
#elif defined(__ICC)
|
||||
#include "iccfeatures.h"
|
||||
#elif defined(__xlC__) || defined(__ibmxl__)
|
||||
#include "xlcfeatures.h"
|
||||
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
|
||||
#include "sunprofeatures.h"
|
||||
#elif defined(__OPEN64__)
|
||||
#include "open64features.h"
|
||||
#elif defined(__clang__)
|
||||
#include "clangfeatures.h"
|
||||
#elif defined(__GNUC__)
|
||||
#include "gccfeatures.h"
|
||||
#elif defined(__PGI)
|
||||
#include "pgccfeatures.h"
|
||||
#elif defined(_MSC_FULL_VER)
|
||||
#include "msvcfeatures.h"
|
||||
#else
|
||||
#error "Can't identify compiler. You'll need to add a new xxfeatures.hpp"
|
||||
{ /* maybe an unbalanced brace will terminate the compilation */
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11
|
||||
#define R123_USE_CXX11 (__cplusplus >= 201103L)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
|
||||
#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_STATIC_ASSERT
|
||||
#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_CONSTEXPR
|
||||
#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
|
||||
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_RANDOM
|
||||
#define R123_USE_CXX11_RANDOM R123_USE_CXX11
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_TYPE_TRAITS
|
||||
#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_LONG_LONG
|
||||
#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_STD_ARRAY
|
||||
#define R123_USE_CXX11_STD_ARRAY R123_USE_CXX11
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_C99
|
||||
#define R123_USE_MULHILO64_C99 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
|
||||
#define R123_USE_MULHILO64_MULHI_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
|
||||
#define R123_USE_MULHILO32_MULHI_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_STATIC_ASSERT
|
||||
#if R123_USE_CXX11_STATIC_ASSERT
|
||||
#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg)
|
||||
#else
|
||||
/* if msg always_looked_like_this, we could paste it into the name. Worth it? */
|
||||
/* Fallback without static_assert: a false expr gives the array typedef a size of -1, which does not compile. msg is unused. */
#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(expr) ? 1 : -1]
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_CONSTEXPR
|
||||
#if R123_USE_CXX11_CONSTEXPR
|
||||
#define R123_CONSTEXPR constexpr
|
||||
#else
|
||||
#define R123_CONSTEXPR
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_64BIT
|
||||
#define R123_USE_64BIT 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_PHILOX_64BIT
|
||||
#define R123_USE_PHILOX_64BIT (R123_USE_64BIT && (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN))
|
||||
#endif
|
||||
|
||||
#ifndef R123_ULONG_LONG
|
||||
#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG
|
||||
/* C++98 doesn't have long long. It doesn't have uint64_t either, but
|
||||
we will have typedef'ed uint64_t to something in the xxxfeatures.h.
|
||||
With luck, it won't elicit complaints from -pedantic. Cross your
|
||||
fingers... */
|
||||
#define R123_ULONG_LONG uint64_t
|
||||
#else
|
||||
#define R123_ULONG_LONG unsigned long long
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* UINT64_C should have been #defined by XXXfeatures.h, either by
|
||||
#include <stdint.h> or through compiler-dependent hacks */
|
||||
#ifndef R123_64BIT
|
||||
#define R123_64BIT(x) UINT64_C(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_THROW
|
||||
#define R123_THROW(x) throw (x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_METAL_THREAD_ADDRESS_SPACE
|
||||
#define R123_METAL_THREAD_ADDRESS_SPACE
|
||||
#endif
|
||||
|
||||
#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
|
||||
#define R123_METAL_CONSTANT_ADDRESS_SPACE
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Windows.h (and perhaps other "well-meaning" code) define min and
|
||||
* max, so there's a high chance that our definition of min, max
|
||||
* methods or use of std::numeric_limits min and max will cause
|
||||
* complaints in any program that happened to include Windows.h or
|
||||
* suchlike first. We use the null macro below in our own header
|
||||
* files definition or use of min, max to defensively preclude
|
||||
* this problem. It may not be enough; one might need to #define
|
||||
* NOMINMAX before including Windows.h or compile with -DNOMINMAX.
|
||||
*/
|
||||
#define R123_NO_MACRO_SUBST
|
||||
|
||||
/** \endcond */
|
263
external/panphasia_ho/features/gccfeatures.h
vendored
Normal file
263
external/panphasia_ho/features/gccfeatures.h
vendored
Normal file
|
@ -0,0 +1,263 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef __gccfeatures_dot_hpp
|
||||
#define __gccfeatures_dot_hpp
|
||||
|
||||
#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
|
||||
|
||||
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390x__)
|
||||
# error "This code has only been tested on x86, powerpc and a few arm platforms."
|
||||
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
|
||||
{ /* maybe an unbalanced brace will terminate the compilation */
|
||||
/* Feel free to try the Random123 library on other architectures by changing
|
||||
the conditions that reach this error, but you should consider it a
|
||||
porting exercise and expect to encounter bugs and deficiencies.
|
||||
Please let the authors know of any successes (or failures). */
|
||||
#endif
|
||||
|
||||
#ifdef __powerpc__
|
||||
#include <ppu_intrinsics.h>
|
||||
#endif
|
||||
|
||||
#ifndef R123_STATIC_INLINE
|
||||
#define R123_STATIC_INLINE static __inline__
|
||||
#endif
|
||||
|
||||
#ifndef R123_FORCE_INLINE
|
||||
#if R123_GNUC_VERSION >= 40000
|
||||
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
|
||||
#else
|
||||
#define R123_FORCE_INLINE(decl) decl
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#include <assert.h>
|
||||
#define R123_ASSERT(x) assert(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
|
||||
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
|
||||
#endif
|
||||
|
||||
/* According to the C++0x standard, we should be able to test the numeric
|
||||
value of __cplusplus == 199711L for C++98, __cplusplus == 201103L for C++11
|
||||
But gcc has had an open bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773
|
||||
since early 2001, which was finally fixed in 4.7 (early 2012). For
|
||||
earlier versions, the only way to detect whether --std=c++0x was requested
|
||||
on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.
|
||||
*/
|
||||
#if defined(__GCC_EXPERIMENTAL_CXX0X__)
|
||||
#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 1/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
|
||||
#else
|
||||
#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && 0/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
|
||||
#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_STATIC_ASSERT
|
||||
#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_CONSTEXPR
|
||||
#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
|
||||
#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_RANDOM
|
||||
#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CXX11_TYPE_TRAITS
|
||||
#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_NI
|
||||
#ifdef __AES__
|
||||
#define R123_USE_AES_NI 1
|
||||
#else
|
||||
#define R123_USE_AES_NI 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_2
|
||||
#ifdef __SSE4_2__
|
||||
#define R123_USE_SSE4_2 1
|
||||
#else
|
||||
#define R123_USE_SSE4_2 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_1
|
||||
#ifdef __SSE4_1__
|
||||
#define R123_USE_SSE4_1 1
|
||||
#else
|
||||
#define R123_USE_SSE4_1 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE
|
||||
/* There's no point in trying to compile SSE code in Random123
|
||||
unless SSE2 is available. */
|
||||
#ifdef __SSE2__
|
||||
#define R123_USE_SSE 1
|
||||
#else
|
||||
#define R123_USE_SSE 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_OPENSSL
|
||||
/* There isn't really a good way to tell at compile time whether
|
||||
openssl is available. Without a pre-compilation configure-like
|
||||
tool, it's less error-prone to guess that it isn't available. Add
|
||||
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
|
||||
play with openssl */
|
||||
#define R123_USE_AES_OPENSSL 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#if defined(__x86_64__) || defined(__aarch64__)
|
||||
#define R123_USE_GNU_UINT128 1
|
||||
#else
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_ASM_GNU
|
||||
#if (defined(__x86_64__)||defined(__i386__))
|
||||
#define R123_USE_ASM_GNU 1
|
||||
#else
|
||||
#define R123_USE_ASM_GNU 0 /* reached only when neither __x86_64__ nor __i386__ is defined */
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CPUID_MSVC
|
||||
#define R123_USE_CPUID_MSVC 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_X86INTRIN_H
|
||||
#if (defined(__x86_64__)||defined(__i386__))
|
||||
#define R123_USE_X86INTRIN_H (1/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402)
|
||||
#else
|
||||
#define R123_USE_X86INTRIN_H (0/* (defined(__x86_64__)||defined(__i386__)) */ && R123_GNUC_VERSION >= 40402)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_IA32INTRIN_H
|
||||
#define R123_USE_IA32INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_XMMINTRIN_H
|
||||
#define R123_USE_XMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_EMMINTRIN_H
|
||||
/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have
|
||||
emmintrin.h in the include search path. This is
|
||||
so broken that I refuse to try to work around it. If this
|
||||
affects you, figure out where your emmintrin.h lives and
|
||||
add an appropriate -I to your CPPFLAGS. Or add -DR123_USE_SSE=0. */
|
||||
#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402))
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SMMINTRIN_H
|
||||
#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402))
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_WMMINTRIN_H
|
||||
#define R123_USE_WMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_INTRIN_H
|
||||
#define R123_USE_INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_ASM
|
||||
#define R123_USE_MULHILO32_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
|
||||
#if (defined(__powerpc64__))
|
||||
#define R123_USE_MULHILO64_MULHI_INTRIN 1
|
||||
#else
|
||||
#define R123_USE_MULHILO64_MULHI_INTRIN 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_MULHILO64_MULHI_INTRIN
|
||||
#define R123_MULHILO64_MULHI_INTRIN __mulhdu
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
|
||||
#define R123_USE_MULHILO32_MULHI_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_MULHILO32_MULHI_INTRIN
|
||||
#define R123_MULHILO32_MULHI_INTRIN __mulhwu
|
||||
#endif
|
||||
|
||||
#ifndef __STDC_CONSTANT_MACROS
|
||||
#define __STDC_CONSTANT_MACROS
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#ifndef UINT64_C
|
||||
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||
#endif
|
||||
|
||||
/* If you add something, it must go in all the other XXfeatures.h
|
||||
and in ../ut_features.cpp */
|
||||
#endif
|
212
external/panphasia_ho/features/iccfeatures.h
vendored
Normal file
212
external/panphasia_ho/features/iccfeatures.h
vendored
Normal file
|
@ -0,0 +1,212 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef __icpcfeatures_dot_hpp
|
||||
#define __icpcfeatures_dot_hpp
|
||||
|
||||
// icc relies on gcc libraries and other toolchain components.
|
||||
#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
|
||||
|
||||
#if !defined(__x86_64__) && !defined(__i386__)
|
||||
# error "This code has only been tested on x86 platforms."
|
||||
{ // maybe an unbalanced brace will terminate the compilation
|
||||
// You are invited to try Random123 on other architectures, by changing
|
||||
// the conditions that reach this error, but you should consider it a
|
||||
// porting exercise and expect to encounter bugs and deficiencies.
|
||||
// Please let the authors know of any successes (or failures).
|
||||
#endif
|
||||
|
||||
#ifndef R123_STATIC_INLINE
|
||||
#define R123_STATIC_INLINE static inline
|
||||
#endif
|
||||
|
||||
#ifndef R123_FORCE_INLINE
|
||||
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#include <assert.h>
|
||||
#define R123_ASSERT(x) assert(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
|
||||
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
|
||||
#endif
|
||||
|
||||
// The basic idiom is:
|
||||
// #ifndef R123_SOMETHING
|
||||
// #if some condition
|
||||
// #define R123_SOMETHING 1
|
||||
// #else
|
||||
// #define R123_SOMETHING 0
|
||||
// #endif
|
||||
// #endif
|
||||
// This idiom allows an external user to override any decision
|
||||
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0
|
||||
|
||||
// An alternative idiom is:
|
||||
// #ifndef R123_SOMETHING
|
||||
// #define R123_SOMETHING (some boolean expression)
|
||||
// #endif
|
||||
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
|
||||
// pp-symbols.
|
||||
|
||||
#ifndef R123_USE_SSE4_2
|
||||
#ifdef __SSE4_2__
|
||||
#define R123_USE_SSE4_2 1
|
||||
#else
|
||||
#define R123_USE_SSE4_2 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_1
|
||||
#ifdef __SSE4_1__
|
||||
#define R123_USE_SSE4_1 1
|
||||
#else
|
||||
#define R123_USE_SSE4_1 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE
|
||||
#ifdef __SSE2__
|
||||
#define R123_USE_SSE 1
|
||||
#else
|
||||
#define R123_USE_SSE 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_NI
|
||||
// Unlike gcc, icc (version 12) does not pre-define an __AES__
|
||||
// pp-symbol when -maes or -xHost is on the command line. This feels
|
||||
// like a defect in icc (it defines __SSE4_2__ in analogous
|
||||
// circumstances), but until Intel fixes it, we're better off erring
|
||||
// on the side of caution and not generating instructions that are
|
||||
// going to raise SIGILL when executed. To get the AES-NI
|
||||
// instructions with icc, the caller must put something like
|
||||
// -DR123_USE_AES_NI=1 or -D__AES__ on the command line. FWIW, the
|
||||
// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from
|
||||
// 11.1 onwards.
|
||||
//
|
||||
#if defined(__AES__)
|
||||
#define R123_USE_AES_NI ((__ICC>=1101) && 1/*defined(__AES__)*/)
|
||||
#else
|
||||
#define R123_USE_AES_NI ((__ICC>=1101) && 0/*defined(__AES__)*/)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_OPENSSL
|
||||
/* There isn't really a good way to tell at compile time whether
|
||||
openssl is available. Without a pre-compilation configure-like
|
||||
tool, it's less error-prone to guess that it isn't available. Add
|
||||
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
|
||||
play with openssl */
|
||||
#define R123_USE_AES_OPENSSL 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_ASM_GNU
|
||||
#define R123_USE_ASM_GNU 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CPUID_MSVC
|
||||
#define R123_USE_CPUID_MSVC 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_X86INTRIN_H
|
||||
#define R123_USE_X86INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_IA32INTRIN_H
|
||||
#define R123_USE_IA32INTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_XMMINTRIN_H
|
||||
#define R123_USE_XMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_EMMINTRIN_H
|
||||
#define R123_USE_EMMINTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SMMINTRIN_H
|
||||
#define R123_USE_SMMINTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_WMMINTRIN_H
|
||||
#define R123_USE_WMMINTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_INTRIN_H
|
||||
#define R123_USE_INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO16_ASM
|
||||
#define R123_USE_MULHILO16_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_ASM
|
||||
#define R123_USE_MULHILO32_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef __STDC_CONSTANT_MACROS
|
||||
#define __STDC_CONSTANT_MACROS
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#ifndef UINT64_C
|
||||
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||
#endif
|
||||
|
||||
// If you add something, it must go in all the other XXfeatures.h
|
||||
// and in ../ut_features.cpp
|
||||
#endif
|
111
external/panphasia_ho/features/metalfeatures.h
vendored
Normal file
111
external/panphasia_ho/features/metalfeatures.h
vendored
Normal file
|
@ -0,0 +1,111 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Written by Tom Schoonjans <Tom.Schoonjans@me.com>
|
||||
*/
|
||||
|
||||
#ifndef __metalfeatures_dot_hpp
|
||||
#define __metalfeatures_dot_hpp
|
||||
|
||||
#ifndef R123_STATIC_INLINE
|
||||
#define R123_STATIC_INLINE inline
|
||||
#endif
|
||||
|
||||
#ifndef R123_FORCE_INLINE
|
||||
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE
|
||||
#endif
|
||||
|
||||
#ifndef R123_METAL_THREAD_ADDRESS_SPACE
|
||||
#define R123_METAL_THREAD_ADDRESS_SPACE thread
|
||||
#endif
|
||||
|
||||
#ifndef R123_METAL_CONSTANT_ADDRESS_SPACE
|
||||
#define R123_METAL_CONSTANT_ADDRESS_SPACE constant
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#define R123_ASSERT(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
|
||||
#define R123_BUILTIN_EXPECT(expr,likely) (expr) /* 'likely' is ignored; parenthesized so operators inside expr cannot rebind at the use site */
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
|
||||
#define R123_USE_MULHILO32_MULHI_INTRIN 1
|
||||
#endif
|
||||
|
||||
#if R123_USE_MULHILO32_MULHI_INTRIN
|
||||
#include <metal_integer>
|
||||
#define R123_MULHILO32_MULHI_INTRIN metal::mulhi
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_NI
|
||||
#define R123_USE_AES_NI 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_64BIT
|
||||
#define R123_USE_64BIT 0 /* Metal currently (Feb 2019, Specification-2) does not support 64-bit variable types */
|
||||
#endif
|
||||
|
||||
#ifndef R123_ULONG_LONG
|
||||
/* the longest integer type in Metal (Feb 2019, Specification-2) is a
|
||||
* 32-bit unsigned int. Let's hope for the best... */
|
||||
#define R123_ULONG_LONG unsigned int
|
||||
#endif
|
||||
|
||||
#endif
|
200
external/panphasia_ho/features/msvcfeatures.h
vendored
Normal file
200
external/panphasia_ho/features/msvcfeatures.h
vendored
Normal file
|
@ -0,0 +1,200 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef __msvcfeatures_dot_hpp
|
||||
#define __msvcfeatures_dot_hpp
|
||||
|
||||
//#if _MSVC_FULL_VER <= 15
|
||||
//#error "We've only tested MSVC_FULL_VER==15."
|
||||
//#endif
|
||||
|
||||
#if !defined(_M_IX86) && !defined(_M_X64)
|
||||
# error "This code has only been tested on x86 platforms."
|
||||
{ // maybe an unbalanced brace will terminate the compilation
|
||||
// You are invited to try Random123 on other architectures, by changing
|
||||
// the conditions that reach this error, but you should consider it a
|
||||
// porting exercise and expect to encounter bugs and deficiencies.
|
||||
// Please let the authors know of any successes (or failures).
|
||||
#endif
|
||||
|
||||
#ifndef R123_STATIC_INLINE
|
||||
#define R123_STATIC_INLINE static __inline
|
||||
#endif
|
||||
|
||||
#ifndef R123_FORCE_INLINE
|
||||
#define R123_FORCE_INLINE(decl) __forceinline decl /* double-underscore spelling; the single-underscore form is a compatibility synonym only */
|
||||
#endif
|
||||
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#include <assert.h>
|
||||
#define R123_ASSERT(x) assert(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
|
||||
#define R123_BUILTIN_EXPECT(expr,likely) (expr) /* 'likely' is ignored; parenthesized so operators inside expr cannot rebind at the use site */
|
||||
#endif
|
||||
|
||||
// The basic idiom is:
|
||||
// #ifndef R123_SOMETHING
|
||||
// #if some condition
|
||||
// #define R123_SOMETHING 1
|
||||
// #else
|
||||
// #define R123_SOMETHING 0
|
||||
// #endif
|
||||
// #endif
|
||||
// This idiom allows an external user to override any decision
|
||||
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0
|
||||
|
||||
// An alternative idiom is:
|
||||
// #ifndef R123_SOMETHING
|
||||
// #define R123_SOMETHING (some boolean expression)
|
||||
// #endif
|
||||
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
|
||||
// pp-symbols.
|
||||
|
||||
#ifndef R123_USE_AES_NI
|
||||
#if defined(_M_X64)
|
||||
#define R123_USE_AES_NI 1
|
||||
#else
|
||||
#define R123_USE_AES_NI 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_2
|
||||
#if defined(_M_X64)
|
||||
#define R123_USE_SSE4_2 1
|
||||
#else
|
||||
#define R123_USE_SSE4_2 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_1
|
||||
#if defined(_M_X64)
|
||||
#define R123_USE_SSE4_1 1
|
||||
#else
|
||||
#define R123_USE_SSE4_1 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE
|
||||
#define R123_USE_SSE 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_OPENSSL
|
||||
#define R123_USE_AES_OPENSSL 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_ASM_GNU
|
||||
#define R123_USE_ASM_GNU 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CPUID_MSVC
|
||||
#define R123_USE_CPUID_MSVC 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_X86INTRIN_H
|
||||
#define R123_USE_X86INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_IA32INTRIN_H
|
||||
#define R123_USE_IA32INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_XMMINTRIN_H
|
||||
#define R123_USE_XMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_EMMINTRIN_H
|
||||
#define R123_USE_EMMINTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SMMINTRIN_H
|
||||
#define R123_USE_SMMINTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_WMMINTRIN_H
|
||||
#define R123_USE_WMMINTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_INTRIN_H
|
||||
#define R123_USE_INTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO16_ASM
|
||||
#define R123_USE_MULHILO16_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_ASM
|
||||
#define R123_USE_MULHILO32_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||
#if defined(_M_X64)
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 1
|
||||
#else
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef __STDC_CONSTANT_MACROS
|
||||
#define __STDC_CONSTANT_MACROS
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#ifndef UINT64_C
|
||||
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||
#endif
|
||||
|
||||
#pragma warning(disable:4244)
|
||||
#pragma warning(disable:4996)
|
||||
|
||||
// If you add something, it must go in all the other XXfeatures.hpp
|
||||
// and in ../ut_features.cpp
|
||||
#endif
|
125
external/panphasia_ho/features/nvccfeatures.h
vendored
Normal file
125
external/panphasia_ho/features/nvccfeatures.h
vendored
Normal file
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef __r123_nvcc_features_dot_h__
|
||||
#define __r123_nvcc_features_dot_h__
|
||||
|
||||
#if !defined(CUDART_VERSION)
|
||||
#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined"
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION < 4010
|
||||
#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h"
|
||||
// This test was added in Random123-1.08 (August, 2013) because we
|
||||
// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
|
||||
// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
|
||||
// earlier. We can't be sure this bug doesn't also affect invocations
|
||||
// of other templated functions, e.g., essentially all of Random123.
|
||||
// Thus, we no longer trust CUDA versions earlier than 4.1 even though
|
||||
// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
|
||||
// If you feel lucky or desperate, you can change #error to #warning, but
|
||||
// please take extra care to be sure that you are getting correct
|
||||
// results.
|
||||
#endif
|
||||
|
||||
// nvcc falls through to gcc or msvc. So first define
|
||||
// a couple of things and then include either gccfeatures.h
|
||||
// or msvcfeatures.h
|
||||
|
||||
//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled
|
||||
//for both device and host functions in CUDA by setting compiler flags
|
||||
//for the device function
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE __device__
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_THROW
|
||||
// No exceptions in CUDA, at least up to 4.0
|
||||
#define R123_THROW(x) R123_ASSERT(0)
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#define R123_ASSERT(x) if((x)) ; else asm("trap;")
|
||||
#endif
|
||||
|
||||
#else // ! __CUDA_ARCH__
|
||||
// If we're using nvcc not compiling for the CUDA architecture,
|
||||
// then we must be compiling for the host. In that case,
|
||||
// tell the philox code to use the mulhilo64 asm because
|
||||
// nvcc doesn't grok uint128_t.
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 1
|
||||
#endif
|
||||
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
/* Default used when nothing earlier defined the macro: it yields its
   first argument and ignores the second.  The expansion is parenthesized
   so that an operator applied to the macro call (e.g. a leading '!')
   applies to the whole expression rather than to its first operand. */
#define R123_BUILTIN_EXPECT(expr,likely) (expr)
#endif
|
||||
|
||||
#ifndef R123_USE_AES_NI
|
||||
#define R123_USE_AES_NI 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_2
|
||||
#define R123_USE_SSE4_2 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_1
|
||||
#define R123_USE_SSE4_1 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE
|
||||
#define R123_USE_SSE 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_ULONG_LONG
|
||||
// uint64_t, which is what we'd get without this, is
|
||||
// not the same as unsigned long long
|
||||
#define R123_ULONG_LONG unsigned long long
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#include "gccfeatures.h"
|
||||
#elif defined(_MSC_FULL_VER)
|
||||
#include "msvcfeatures.h"
|
||||
#endif
|
||||
|
||||
#endif
|
50
external/panphasia_ho/features/open64features.h
vendored
Normal file
50
external/panphasia_ho/features/open64features.h
vendored
Normal file
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef __open64features_dot_hpp
|
||||
#define __open64features_dot_hpp
|
||||
|
||||
/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */
|
||||
|
||||
/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint
|
||||
but produce incorrect code for 64-bit philox. The MULHILO64_ASM
|
||||
seems to work fine */
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 1
|
||||
#endif
|
||||
|
||||
#include "gccfeatures.h"
|
||||
|
||||
#endif
|
89
external/panphasia_ho/features/openclfeatures.h
vendored
Normal file
89
external/panphasia_ho/features/openclfeatures.h
vendored
Normal file
|
@ -0,0 +1,89 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef __openclfeatures_dot_hpp
|
||||
#define __openclfeatures_dot_hpp
|
||||
|
||||
#ifndef R123_STATIC_INLINE
|
||||
#define R123_STATIC_INLINE inline
|
||||
#endif
|
||||
|
||||
#ifndef R123_FORCE_INLINE
|
||||
#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#define R123_ASSERT(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
/* No branch-prediction hint is used here: the macro simply yields its
   first argument and ignores the second.  The expansion is parenthesized
   so that an operator applied to the macro call (e.g. a leading '!')
   applies to the whole expression rather than to its first operand. */
#define R123_BUILTIN_EXPECT(expr,likely) (expr)
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||
#define R123_USE_MULHILO64_OPENCL_INTRIN 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_NI
|
||||
#define R123_USE_AES_NI 0
|
||||
#endif
|
||||
|
||||
// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of
|
||||
// ulong to mul_hi. And gets lots of complaints from stdint.h
|
||||
// on some machines.
|
||||
// But these typedefs mean we cannot include stdint.h with
|
||||
// these headers? Do we need R123_64T, R123_32T, R123_8T?
|
||||
typedef ulong uint64_t;
|
||||
typedef uint uint32_t;
|
||||
typedef uchar uint8_t;
|
||||
#define UINT64_C(x) ((ulong)(x##UL))
|
||||
|
||||
#endif
|
194
external/panphasia_ho/features/pgccfeatures.h
vendored
Normal file
194
external/panphasia_ho/features/pgccfeatures.h
vendored
Normal file
|
@ -0,0 +1,194 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
Copyright (c) 2013, Los Alamos National Security, LLC
|
||||
All rights reserved.
|
||||
|
||||
Copyright 2013. Los Alamos National Security, LLC. This software was produced
|
||||
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
|
||||
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
|
||||
the U.S. Department of Energy. The U.S. Government has rights to use,
|
||||
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
|
||||
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
|
||||
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
|
||||
to produce derivative works, such modified software should be clearly marked,
|
||||
so as not to confuse it with the version available from LANL.
|
||||
*/
|
||||
#ifndef __pgccfeatures_dot_hpp
|
||||
#define __pgccfeatures_dot_hpp
|
||||
|
||||
#if !defined(__x86_64__) && !defined(__i386__)
|
||||
# error "This code has only been tested on x86 platforms."
|
||||
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
|
||||
{ /* maybe an unbalanced brace will terminate the compilation */
|
||||
/* Feel free to try the Random123 library on other architectures by changing
|
||||
the conditions that reach this error, but you should consider it a
|
||||
porting exercise and expect to encounter bugs and deficiencies.
|
||||
Please let the authors know of any successes (or failures). */
|
||||
#endif
|
||||
|
||||
#ifndef R123_STATIC_INLINE
|
||||
#define R123_STATIC_INLINE static inline
|
||||
#endif
|
||||
|
||||
/* Found this example in PGI's emmintrin.h. */
|
||||
#ifndef R123_FORCE_INLINE
|
||||
#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
|
||||
#endif
|
||||
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#include <assert.h>
|
||||
#define R123_ASSERT(x) assert(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
|
||||
#define R123_BUILTIN_EXPECT(expr,likely) (expr)
|
||||
#endif
|
||||
|
||||
/* PGI through 13.2 doesn't appear to support AES-NI. */
|
||||
#ifndef R123_USE_AES_NI
|
||||
#define R123_USE_AES_NI 0
|
||||
#endif
|
||||
|
||||
/* PGI through 13.2 appears to support MMX, SSE, SSE2, SSE3, SSSE3, SSE4a, and
|
||||
ABM, but not SSE4.1 or SSE4.2. */
|
||||
#ifndef R123_USE_SSE4_2
|
||||
#define R123_USE_SSE4_2 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_1
|
||||
#define R123_USE_SSE4_1 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE
|
||||
/* There's no point in trying to compile SSE code in Random123
|
||||
unless SSE2 is available. */
|
||||
#ifdef __SSE2__
|
||||
#define R123_USE_SSE 1
|
||||
#else
|
||||
#define R123_USE_SSE 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_OPENSSL
|
||||
/* There isn't really a good way to tell at compile time whether
|
||||
openssl is available. Without a pre-compilation configure-like
|
||||
tool, it's less error-prone to guess that it isn't available. Add
|
||||
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
|
||||
play with openssl */
|
||||
#define R123_USE_AES_OPENSSL 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_ASM_GNU
|
||||
#define R123_USE_ASM_GNU 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CPUID_MSVC
|
||||
#define R123_USE_CPUID_MSVC 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_X86INTRIN_H
|
||||
#define R123_USE_X86INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_IA32INTRIN_H
|
||||
#define R123_USE_IA32INTRIN_H 0
|
||||
#endif
|
||||
|
||||
/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time
|
||||
about undefined references to _mm_castsi128_ps(__m128i). Why? */
|
||||
#ifndef R123_USE_XMMINTRIN_H
|
||||
#define R123_USE_XMMINTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_EMMINTRIN_H
|
||||
#define R123_USE_EMMINTRIN_H 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SMMINTRIN_H
|
||||
#define R123_USE_SMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_WMMINTRIN_H
|
||||
#define R123_USE_WMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_INTRIN_H
|
||||
#ifdef __ABM__
|
||||
#define R123_USE_INTRIN_H 1
|
||||
#else
|
||||
#define R123_USE_INTRIN_H 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_ASM
|
||||
#define R123_USE_MULHILO32_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
|
||||
#define R123_USE_MULHILO64_MULHI_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef __STDC_CONSTANT_MACROS
|
||||
#define __STDC_CONSTANT_MACROS
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#ifndef UINT64_C
|
||||
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||
#endif
|
||||
|
||||
/* If you add something, it must go in all the other XXfeatures.hpp
|
||||
and in ../ut_features.cpp */
|
||||
#endif
|
280
external/panphasia_ho/features/sse.h
vendored
Normal file
280
external/panphasia_ho/features/sse.h
vendored
Normal file
|
@ -0,0 +1,280 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef _Random123_sse_dot_h__
|
||||
#define _Random123_sse_dot_h__
|
||||
|
||||
#if R123_USE_SSE
|
||||
|
||||
#if R123_USE_X86INTRIN_H
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#if R123_USE_IA32INTRIN_H
|
||||
#include <ia32intrin.h>
|
||||
#endif
|
||||
#if R123_USE_XMMINTRIN_H
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
#if R123_USE_EMMINTRIN_H
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
#if R123_USE_SMMINTRIN_H
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
#if R123_USE_WMMINTRIN_H
|
||||
#include <wmmintrin.h>
|
||||
#endif
|
||||
#if R123_USE_INTRIN_H
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
#ifdef __cplusplus
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
#endif
|
||||
|
||||
#if R123_USE_ASM_GNU
|
||||
|
||||
/* bit25 of CX tells us whether AES is enabled. */
|
||||
R123_STATIC_INLINE int haveAESNI(){
|
||||
unsigned int eax, ebx, ecx, edx;
|
||||
__asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
|
||||
"a" (1));
|
||||
return (ecx>>25) & 1;
|
||||
}
|
||||
#elif R123_USE_CPUID_MSVC
|
||||
R123_STATIC_INLINE int haveAESNI(){
|
||||
int CPUInfo[4];
|
||||
__cpuid(CPUInfo, 1);
|
||||
return (CPUInfo[2]>>25)&1;
|
||||
}
|
||||
#else /* R123_USE_CPUID_??? */
|
||||
#warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
|
||||
R123_STATIC_INLINE int haveAESNI(){
|
||||
return 0;
|
||||
}
|
||||
#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
|
||||
|
||||
// There is a lot of annoying and inexplicable variation in the
|
||||
// SSE intrinsics available in different compilation environments.
|
||||
// The details seem to depend on the compiler, the version and
|
||||
// the target architecture. Rather than insisting on
|
||||
// R123_USE_feature tests for each of these in each of the
|
||||
// compilerfeatures.h files we just keep the complexity localized
|
||||
// to here...
|
||||
#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
/* Is there an intrinsic to assemble an __m128i from two 64-bit words?
   If not, use the 4x32-bit intrinsic instead.  N.B.  It looks like Intel
   added _mm_set_epi64x to icc version 12.1 in Jan 2012.

   The two halves of v1 are passed as the first two arguments of
   _mm_set_epi32 (upper half first) and the two halves of v0 as the last
   two.  Each word is split with shifts rather than by writing one union
   member and reading another: this header is also compiled as C++,
   where that form of type punning is not defined behavior, and the
   shifts do not depend on byte order.
*/
R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
    const uint32_t v1_hi = (uint32_t)(v1 >> 32);
    const uint32_t v1_lo = (uint32_t)v1;
    const uint32_t v0_hi = (uint32_t)(v0 >> 32);
    const uint32_t v0_lo = (uint32_t)v0;
    return _mm_set_epi32(v1_hi, v1_lo, v0_hi, v0_lo);
}
#endif
|
||||
/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
|
||||
word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job
|
||||
on 64-bit platforms. Unfortunately, both MSVC and Open64 fail
|
||||
assertions in ut_M128.cpp and ut_carray.cpp when we use the
|
||||
_mm_cvtsi128_si64 intrinsic. (See
|
||||
https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
|
||||
On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
|
||||
Finally, even if the intrinsic exists, it may be spelled with or
|
||||
without the 'x'.
|
||||
*/
|
||||
#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
|
||||
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
|
||||
union{
|
||||
uint64_t u64[2];
|
||||
__m128i m;
|
||||
}u;
|
||||
_mm_store_si128(&u.m, si);
|
||||
return u.u64[0];
|
||||
}
|
||||
#elif defined(__llvm__) || defined(__ICC)
|
||||
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
|
||||
return (uint64_t)_mm_cvtsi128_si64(si);
|
||||
}
|
||||
#else /* GNUC, others */
|
||||
/* FWIW, gcc's emmintrin.h has had the 'x' spelling
|
||||
since at least gcc-3.4.4. The no-'x' spelling showed up
|
||||
around 4.2. */
|
||||
R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
|
||||
return (uint64_t)_mm_cvtsi128_si64x(si);
|
||||
}
|
||||
#endif
|
||||
#if defined(__GNUC__) && __GNUC__ < 4
/* The cast builtins only showed up in gcc4, so for older gcc this one
   is supplied here as a plain vector cast. */
R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
    __m128 as_ps = (__m128)si;
    return as_ps;
}
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
struct r123m128i{
|
||||
__m128i m;
|
||||
#if R123_USE_CXX11_UNRESTRICTED_UNIONS
|
||||
// C++98 forbids a union member from having *any* constructors.
|
||||
// C++11 relaxes this, and allows union members to have constructors
|
||||
// as long as there is a "trivial" default construtor. So in C++11
|
||||
// we can provide a r123m128i constructor with an __m128i argument, and still
|
||||
// have the default (and hence trivial) default constructor.
|
||||
r123m128i() = default;
|
||||
r123m128i(__m128i _m): m(_m){}
|
||||
#endif
|
||||
r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
|
||||
r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
|
||||
#if R123_USE_CXX11_EXPLICIT_CONVERSIONS
|
||||
// With C++11 we can attach explicit to the bool conversion operator
|
||||
// to disambiguate undesired promotions. For g++, this works
|
||||
// only in 4.5 and above.
|
||||
explicit operator bool() const {return _bool();}
|
||||
#else
|
||||
// Pre-C++11, we have to do something else. Google for the "safe bool"
|
||||
// idiom for other ideas...
|
||||
operator const void*() const{return _bool()?this:0;}
|
||||
#endif
|
||||
operator __m128i() const {return m;}
|
||||
|
||||
private:
|
||||
#if R123_USE_SSE4_1
|
||||
bool _bool() const{ return !_mm_testz_si128(m,m); }
|
||||
#else
|
||||
bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
|
||||
#endif
|
||||
};
|
||||
|
||||
// Pre-increment.  One is added to the low 64-bit lane of v.m; if the low
// 64 bits are zero afterwards, one is also added to the high lane.
// Returns v.
R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
    __m128i& ctr = v.m;
    const __m128i low_one = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
    ctr = _mm_add_epi64(ctr, low_one);
#if R123_USE_SSE4_1
    // Test ctr against a mask whose low 64 bits are all ones.
    const __m128i low_all_ones = _mm_set_epi64x(0, ~(R123_64BIT(0)));
    if( R123_BUILTIN_EXPECT(_mm_testz_si128(ctr,low_all_ones), 0) ){
        const __m128i high_one = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
        ctr = _mm_add_epi64(ctr, high_one);
    }
#else
    const __m128i lanes_eq_zero = _mm_cmpeq_epi32(ctr, _mm_setzero_si128());
    unsigned zero_mask = _mm_movemask_ps( _mm_castsi128_ps(lanes_eq_zero));
    // The low two bits of zero_mask are 11 iff the low 64 bits of
    // ctr are zero.
    if( R123_BUILTIN_EXPECT((zero_mask&0x3) == 0x3, 0) ){
        const __m128i high_one = _mm_set_epi64x(1,0);
        ctr = _mm_add_epi64(ctr, high_one);
    }
#endif
    return v;
}
|
||||
|
||||
// Adds n to lhs.  n goes into the low 64-bit lane, and one is added to
// the high lane when the low lane ends up smaller than n.  Returns lhs.
R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
    // Lane-wise 64-bit add; by itself this gives NO CARRY between lanes.
    __m128i sum = _mm_add_epi64(lhs.m, _mm_set_epi64x(0, n));

    // A low lane that is now smaller than n means it wrapped around.
    const int64_t low_lane = _mm_extract_lo64(sum);
    if((uint64_t)low_lane < n){
        sum = _mm_add_epi64(sum, _mm_set_epi64x(1,0));
    }
    lhs.m = sum;
    return lhs;
}
|
||||
|
||||
// We need this one because it's present, but never used in r123array1xm128i::incr
|
||||
R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
    // Deliberately unimplemented: any call throws.
    throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");
}
|
||||
|
||||
// The comparisons aren't implemented, but if we leave them out, and
|
||||
// somebody writes, e.g., M1 < M2, the compiler will do an implicit
|
||||
// conversion through void*. Sigh...
|
||||
// Each of the four ordering comparisons below throws std::runtime_error
// when called; none of them computes a result.
R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
    throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");
}

R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
    throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");
}

R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
    throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");
}

R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
    throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");
}
|
||||
|
||||
// Equal when all four 32-bit lanes of the two values compare equal.
R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
    const __m128i lanes_eq = _mm_cmpeq_epi32(lhs, rhs);
    return 0xf==_mm_movemask_ps(_mm_castsi128_ps(lanes_eq));
}

// Negation of the comparison above.
R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
    const bool same = (lhs==rhs);
    return !same;
}

// Compares the integer with rhs after wrapping it as _mm_set_epi64x(0, lhs).
R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
    r123m128i widened;
    widened.m = _mm_set_epi64x(0, lhs);
    return widened == rhs;
}

// Negation of the integer comparison above.
R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
    const bool same = (lhs==rhs);
    return !same;
}
|
||||
// Writes the value as two unsigned 64-bit integers separated by a single
// space: word 0 of the stored register first, then word 1.
R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
    union{
        uint64_t words[2];
        __m128i vec;
    }spill;
    _mm_storeu_si128(&spill.vec, m.m);
    os << spill.words[0] << " " << spill.words[1];
    return os;
}
|
||||
|
||||
// Reads two unsigned 64-bit integers from the stream and stores them in
// m as _mm_set_epi64x(second, first).
//
// The words are zero-initialized and m is only assigned when both
// extractions succeed.  Without that, a failed read would copy
// indeterminate values into m.  On failure m is left unchanged and the
// stream carries the error state for the caller to inspect.
R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
    uint64_t u64[2] = {0, 0};
    if(is >> u64[0] >> u64[1]){
        m.m = _mm_set_epi64x(u64[1], u64[0]);
    }
    return is;
}
|
||||
|
||||
template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
|
||||
|
||||
template <>
|
||||
inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
|
||||
r123m128i ret;
|
||||
ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
typedef struct {
|
||||
__m128i m;
|
||||
} r123m128i;
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#else /* !R123_USE_SSE */
|
||||
/* Variant used when R123_USE_SSE is off: AES-NI is always reported as
   unavailable (returns 0). */
R123_STATIC_INLINE int haveAESNI(){
    const int aesni_available = 0;
    return aesni_available;
}
|
||||
#endif /* R123_USE_SSE */
|
||||
|
||||
#endif /* _Random123_sse_dot_h__ */
|
172
external/panphasia_ho/features/sunprofeatures.h
vendored
Normal file
172
external/panphasia_ho/features/sunprofeatures.h
vendored
Normal file
|
@ -0,0 +1,172 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef __sunprofeatures_dot_hpp
|
||||
#define __sunprofeatures_dot_hpp
|
||||
|
||||
#ifndef R123_STATIC_INLINE
|
||||
#define R123_STATIC_INLINE static inline
|
||||
#endif
|
||||
|
||||
#ifndef R123_FORCE_INLINE
|
||||
#define R123_FORCE_INLINE(decl) decl
|
||||
#endif
|
||||
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#include <assert.h>
|
||||
#define R123_ASSERT(x) assert(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
|
||||
#define R123_BUILTIN_EXPECT(expr,likely) expr
|
||||
#endif
|
||||
|
||||
// The basic idiom is:
|
||||
// #ifndef R123_SOMETHING
|
||||
// #if some condition
|
||||
// #define R123_SOMETHING 1
|
||||
// #else
|
||||
// #define R123_SOMETHING 0
|
||||
// #endif
|
||||
// #endif
|
||||
// This idiom allows an external user to override any decision
|
||||
// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0
|
||||
|
||||
// An alternative idiom is:
|
||||
// #ifndef R123_SOMETHING
|
||||
// #define R123_SOMETHING (some boolean expression)
|
||||
// #endif
|
||||
// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
|
||||
// pp-symbols.
|
||||
|
||||
#ifndef R123_USE_AES_NI
|
||||
#define R123_USE_AES_NI 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_2
|
||||
#define R123_USE_SSE4_2 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_1
|
||||
#define R123_USE_SSE4_1 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE
|
||||
#define R123_USE_SSE 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_OPENSSL
|
||||
#define R123_USE_AES_OPENSSL 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_ASM_GNU
|
||||
#define R123_USE_ASM_GNU 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CPUID_MSVC
|
||||
#define R123_USE_CPUID_MSVC 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_X86INTRIN_H
|
||||
#define R123_USE_X86INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_IA32INTRIN_H
|
||||
#define R123_USE_IA32INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_XMMINTRIN_H
|
||||
#define R123_USE_XMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_EMMINTRIN_H
|
||||
#define R123_USE_EMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SMMINTRIN_H
|
||||
#define R123_USE_SMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_WMMINTRIN_H
|
||||
#define R123_USE_WMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_INTRIN_H
|
||||
#define R123_USE_INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO16_ASM
|
||||
#define R123_USE_MULHILO16_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_ASM
|
||||
#define R123_USE_MULHILO32_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#define R123_USE_MULHILO64_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_PHILOX_64BIT
|
||||
#define R123_USE_PHILOX_64BIT 0
|
||||
#endif
|
||||
|
||||
#ifndef __STDC_CONSTANT_MACROS
|
||||
#define __STDC_CONSTANT_MACROS
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#ifndef UINT64_C
|
||||
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||
#endif
|
||||
|
||||
// If you add something, it must go in all the other XXfeatures.hpp
|
||||
// and in ../ut_features.cpp
|
||||
#endif
|
210
external/panphasia_ho/features/xlcfeatures.h
vendored
Normal file
210
external/panphasia_ho/features/xlcfeatures.h
vendored
Normal file
|
@ -0,0 +1,210 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
Copyright (c) 2013, Los Alamos National Security, LLC
|
||||
All rights reserved.
|
||||
|
||||
Copyright 2013. Los Alamos National Security, LLC. This software was produced
|
||||
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
|
||||
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
|
||||
the U.S. Department of Energy. The U.S. Government has rights to use,
|
||||
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
|
||||
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
|
||||
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
|
||||
to produce derivative works, such modified software should be clearly marked,
|
||||
so as not to confuse it with the version available from LANL.
|
||||
*/
|
||||
#ifndef __xlcfeatures_dot_hpp
|
||||
#define __xlcfeatures_dot_hpp
|
||||
|
||||
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
|
||||
# error "This code has only been tested on x86 and PowerPC platforms."
|
||||
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
|
||||
{ /* maybe an unbalanced brace will terminate the compilation */
|
||||
/* Feel free to try the Random123 library on other architectures by changing
|
||||
the conditions that reach this error, but you should consider it a
|
||||
porting exercise and expect to encounter bugs and deficiencies.
|
||||
Please let the authors know of any successes (or failures). */
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
/* builtins are automatically available to xlc. To use them with xlc++,
|
||||
one must include builtins.h. c.f
|
||||
http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html
|
||||
*/
|
||||
#include <builtins.h>
|
||||
#endif
|
||||
|
||||
#ifndef R123_STATIC_INLINE
|
||||
#define R123_STATIC_INLINE static inline
|
||||
#endif
|
||||
|
||||
#ifndef R123_FORCE_INLINE
|
||||
#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
|
||||
#endif
|
||||
|
||||
#ifndef R123_CUDA_DEVICE
|
||||
#define R123_CUDA_DEVICE
|
||||
#endif
|
||||
|
||||
#ifndef R123_ASSERT
|
||||
#include <assert.h>
|
||||
#define R123_ASSERT(x) assert(x)
|
||||
#endif
|
||||
|
||||
#ifndef R123_BUILTIN_EXPECT
|
||||
#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_NI
|
||||
#define R123_USE_AES_NI 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_2
|
||||
#define R123_USE_SSE4_2 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE4_1
|
||||
#define R123_USE_SSE4_1 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SSE
|
||||
#define R123_USE_SSE 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_AES_OPENSSL
|
||||
/* There isn't really a good way to tell at compile time whether
|
||||
openssl is available. Without a pre-compilation configure-like
|
||||
tool, it's less error-prone to guess that it isn't available. Add
|
||||
-DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
|
||||
play with openssl */
|
||||
#define R123_USE_AES_OPENSSL 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_GNU_UINT128
|
||||
#define R123_USE_GNU_UINT128 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_ASM_GNU
|
||||
#define R123_USE_ASM_GNU 1
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_CPUID_MSVC
|
||||
#define R123_USE_CPUID_MSVC 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_X86INTRIN_H
|
||||
#define R123_USE_X86INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_IA32INTRIN_H
|
||||
#define R123_USE_IA32INTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_XMMINTRIN_H
|
||||
#define R123_USE_XMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_EMMINTRIN_H
|
||||
#define R123_USE_EMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_SMMINTRIN_H
|
||||
#define R123_USE_SMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_WMMINTRIN_H
|
||||
#define R123_USE_WMMINTRIN_H 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_INTRIN_H
|
||||
#ifdef __ABM__
|
||||
#define R123_USE_INTRIN_H 1
|
||||
#else
|
||||
#define R123_USE_INTRIN_H 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_ASM
|
||||
#define R123_USE_MULHILO32_ASM 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MULHI_INTRIN
|
||||
#if (defined(__powerpc64__))
|
||||
#define R123_USE_MULHILO64_MULHI_INTRIN 1
|
||||
#else
|
||||
#define R123_USE_MULHILO64_MULHI_INTRIN 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_MULHILO64_MULHI_INTRIN
|
||||
#define R123_MULHILO64_MULHI_INTRIN __mulhdu
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO32_MULHI_INTRIN
|
||||
#define R123_USE_MULHILO32_MULHI_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_MULHILO32_MULHI_INTRIN
|
||||
#define R123_MULHILO32_MULHI_INTRIN __mulhwu
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_ASM
|
||||
#if defined(__powerpc64__)
|
||||
#define R123_USE_MULHILO64_ASM (1 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN))
|
||||
#else
|
||||
#define R123_USE_MULHILO64_ASM (0 /*defined(__powerpc64__)*/ && !(R123_USE_MULHILO64_MULHI_INTRIN))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_MSVC_INTRIN
|
||||
#define R123_USE_MULHILO64_MSVC_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_CUDA_INTRIN
|
||||
#define R123_USE_MULHILO64_CUDA_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
|
||||
#define R123_USE_MULHILO64_OPENCL_INTRIN 0
|
||||
#endif
|
||||
|
||||
#ifndef __STDC_CONSTANT_MACROS
|
||||
#define __STDC_CONSTANT_MACROS
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#ifndef UINT64_C
|
||||
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
|
||||
#endif
|
||||
|
||||
/* If you add something, it must go in all the other XXfeatures.hpp
|
||||
and in ../ut_features.cpp */
|
||||
#endif
|
1863
external/panphasia_ho/high_order_panphasia_routines.c
vendored
Normal file
1863
external/panphasia_ho/high_order_panphasia_routines.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
97
external/panphasia_ho/main.c
vendored
Normal file
97
external/panphasia_ho/main.c
vendored
Normal file
|
@ -0,0 +1,97 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <complex.h>
|
||||
#include <fftw3-mpi.h>
|
||||
|
||||
|
||||
|
||||
#include "PAN_FFTW3.h"
|
||||
#include "panphasia_functions.h"
|
||||
|
||||
extern size_t descriptor_base_size;
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
#include <omp.h>
|
||||
int threads_ok;
|
||||
int number_omp_threads = 1;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
|
||||
int verbose=0;
|
||||
int error;
|
||||
size_t x0=0, y0=0, z0=0;
|
||||
size_t rel_level;
|
||||
char descriptor[300] = "[Panph6,L20,(424060,82570,148256),S1,KK0,CH-999,Auriga_100_vol2]";
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
omp_set_num_threads(number_omp_threads);
|
||||
int provided;
|
||||
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
|
||||
threads_ok = provided >= MPI_THREAD_FUNNELED;
|
||||
if (threads_ok) threads_ok = fftw_init_threads();
|
||||
fftw_mpi_init();
|
||||
int num_threads = number_omp_threads ;
|
||||
if (threads_ok){
|
||||
fftw_plan_with_nthreads(num_threads);
|
||||
}else{
|
||||
printf("Failure to initialise threads ...\n");
|
||||
MPI_Finalize();
|
||||
};
|
||||
|
||||
printf("OpenMP threads enabled with FFTW. Number of threads %d\n",fftw_planner_nthreads());
|
||||
#else
|
||||
MPI_Init(&argc, &argv);
|
||||
#endif
|
||||
|
||||
PANPHASIA_init_descriptor_(descriptor,&verbose);
|
||||
|
||||
|
||||
rel_level = 6; //Set size of test dataset
|
||||
|
||||
|
||||
if (error=PANPHASIA_init_level_(&rel_level,&x0,&y0,&z0,&verbose)){
|
||||
printf("Abort: PANPHASIA_init_level_ :error code %d\n",error);
|
||||
abort();
|
||||
};
|
||||
|
||||
//======================= FFTW ==============================
|
||||
|
||||
fftw_mpi_init();
|
||||
|
||||
ptrdiff_t alloc_local, local_n0, local_0_start;
|
||||
|
||||
ptrdiff_t N0 = descriptor_base_size<<rel_level;
|
||||
|
||||
alloc_local = FFTW_MPI_LOCAL_SIZE_3D(N0,N0,N0,MPI_COMM_WORLD,&local_n0,&local_0_start);
|
||||
|
||||
|
||||
FFTW_COMPLEX *Panphasia_White_Noise_Field;
|
||||
|
||||
Panphasia_White_Noise_Field = FFTW_ALLOC_COMPLEX(alloc_local);
|
||||
|
||||
|
||||
if (error = PANPHASIA_compute_kspace_field_(rel_level,N0, local_n0,local_0_start,Panphasia_White_Noise_Field)){
|
||||
printf("Error code from PANPHASIA_compute ... %d\n",error);
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
fftw_free(Panphasia_White_Noise_Field);
|
||||
|
||||
fftw_mpi_cleanup();
|
||||
//==================== End FFTW ===========================
|
||||
|
||||
|
||||
MPI_Finalize();
|
||||
}
|
||||
|
||||
|
||||
|
22
external/panphasia_ho/makefile
vendored
Normal file
22
external/panphasia_ho/makefile
vendored
Normal file
|
@ -0,0 +1,22 @@
|
|||
CC = mpicc -qopenmp
G99 = gcc -fopenmp

GSL_LIBS = -lgsl -lgslcblas

# Compile flags only.  Libraries are kept out of CCFLAGS because they have to
# appear AFTER the object files on the link line; a linker that scans
# libraries in command-line order does not resolve symbols from a library
# that is listed before the objects needing it.
#CCFLAGS = $(CFLAGS) -O3 -qopt-zmm-usage=high -vec-threshold0
CCFLAGS = $(CFLAGS) -O3 -qopt-zmm-usage=high -vec-threshold0 -DUSE_OPENMP

# The MPI and OpenMP FFTW layers depend on the serial libraries, so they are
# listed first.
LDLIBS = -lfftw3_mpi -lfftw3f_mpi -lfftw3_omp -lfftw3 -lfftw3f $(GSL_LIBS) -lm

TARGET = pan_fftw3_test_code.x
OBJS = main.o high_order_panphasia_routines.o pan_mpi_routines.o uniform_rand_threefry4x64.o
# Headers included by main.c and pan_mpi_routines.c; editing either one
# rebuilds every object.
HEADERS = PAN_FFTW3.h panphasia_functions.h

# Link only the listed objects (not every *.o that happens to be in the directory).
$(TARGET): $(OBJS)
	$(CC) $(CCFLAGS) $(OBJS) $(LDFLAGS) $(LDLIBS) -o $(TARGET)

main.o: main.c $(HEADERS)
	$(CC) $(CCFLAGS) -c main.c

high_order_panphasia_routines.o: high_order_panphasia_routines.c $(HEADERS)
	$(CC) $(CCFLAGS) -c high_order_panphasia_routines.c

pan_mpi_routines.o: pan_mpi_routines.c $(HEADERS)
	$(CC) $(CCFLAGS) -c pan_mpi_routines.c

uniform_rand_threefry4x64.o: uniform_rand_threefry4x64.c $(HEADERS)
	$(CC) $(CCFLAGS) -c uniform_rand_threefry4x64.c

# -f: do not fail when there is nothing to remove.
.PHONY: clean
clean:
	rm -f *.o
|
1737
external/panphasia_ho/pan_matrices_order6.h
vendored
Normal file
1737
external/panphasia_ho/pan_matrices_order6.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
344
external/panphasia_ho/pan_mpi_routines.c
vendored
Normal file
344
external/panphasia_ho/pan_mpi_routines.c
vendored
Normal file
|
@ -0,0 +1,344 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <complex.h>
|
||||
#include <fftw3-mpi.h>
|
||||
|
||||
#include "PAN_FFTW3.h"
|
||||
#include "panphasia_functions.h"
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
|
||||
extern const int Nbasis;
|
||||
extern const int irank_p[3][84];
|
||||
|
||||
extern size_t descriptor_order;
|
||||
extern size_t descriptor_kk_limit;
|
||||
|
||||
|
||||
int PANPHASIA_compute_kspace_field_(size_t relative_level, ptrdiff_t N0_grid,
|
||||
ptrdiff_t local_n0_return, ptrdiff_t local_0_start_return,
|
||||
FFTW_COMPLEX *return_field)
|
||||
{
|
||||
|
||||
size_t copy_list[Nbasis];
|
||||
int fdim=1;
|
||||
int pmax = 6;
|
||||
size_t ncopy = (pmax+1)*(pmax+2)*(pmax+3)/6;
|
||||
size_t xorigin=local_0_start_return, yorigin=0, zorigin=0;
|
||||
size_t xextent =local_n0_return, yextent = N0_grid, zextent = N0_grid;
|
||||
int verbose = 1;
|
||||
int flag_output_mode=2;
|
||||
int error;
|
||||
ptrdiff_t size_to_alloc;
|
||||
FFTW_PLAN output_coeff_forward_plan;
|
||||
|
||||
|
||||
if (pmax>descriptor_order) return(100000);
|
||||
|
||||
for (size_t i=0; i<Nbasis; i++) copy_list[i]=i;
|
||||
|
||||
printf("Dimensions of FT (%td,%td,%td)\n",N0_grid,N0_grid,N0_grid);
|
||||
printf("local_no %td local_0_start %td\n",local_n0_return, local_0_start_return);
|
||||
|
||||
|
||||
|
||||
// Distribution for ncopy 3-D arrays //
|
||||
{
|
||||
int rank =3;
|
||||
const ptrdiff_t ndimens_alloc[] = {N0_grid, N0_grid, N0_grid+2}; // Allocated for r2c
|
||||
ptrdiff_t howmany = ncopy;
|
||||
ptrdiff_t local_n0;
|
||||
ptrdiff_t local_0_start;
|
||||
|
||||
size_to_alloc = FFTW_MPI_LOCAL_SIZE_MANY(rank, ndimens_alloc, howmany,
|
||||
FFTW_MPI_DEFAULT_BLOCK,MPI_COMM_WORLD,
|
||||
&local_n0,&local_0_start);
|
||||
printf("size_to_alloc = %td\n",size_to_alloc);
|
||||
printf("cf value %ld\n",ncopy*xextent*yextent*zextent);
|
||||
printf("local_n0 %td local_0_start %td\n",local_n0,local_0_start);
|
||||
|
||||
};
|
||||
|
||||
void *output_coefficients= FFTW_MALLOC(sizeof(FFTW_REAL)*size_to_alloc);
|
||||
|
||||
if (output_coefficients==NULL) return(100001);
|
||||
|
||||
FFTW_REAL *ptr_real_output_coefficients = output_coefficients;
|
||||
FFTW_COMPLEX *ptr_cmplx_output_coefficients = output_coefficients;
|
||||
|
||||
|
||||
printf("Making the plan ... \n");
|
||||
|
||||
//////////////////// Make plan for ncopy interleaved FTs ///////////////////////////
|
||||
|
||||
{
|
||||
int rank = 3;
|
||||
const ptrdiff_t ndimens[3] = {N0_grid, N0_grid, N0_grid};
|
||||
ptrdiff_t howmany = ncopy;
|
||||
ptrdiff_t block = FFTW_MPI_DEFAULT_BLOCK;
|
||||
ptrdiff_t tblock = FFTW_MPI_DEFAULT_BLOCK;
|
||||
unsigned flags = FFTW_ESTIMATE;
|
||||
|
||||
output_coeff_forward_plan = FFTW_MPI_PLAN_MANY_DTF_R2C(rank, ndimens,
|
||||
howmany, block, tblock,
|
||||
ptr_real_output_coefficients, ptr_cmplx_output_coefficients,
|
||||
MPI_COMM_WORLD, flags);
|
||||
if (output_coeff_forward_plan==NULL) {
|
||||
printf("Null plan\n");
|
||||
return(100051);
|
||||
};
|
||||
|
||||
};
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
|
||||
|
||||
printf("Plan completed ... \n");
|
||||
|
||||
printf("xorigin,yorigin,zorigin (%ld,%ld,%ld)\n ",xorigin,yorigin,zorigin);
|
||||
printf("xextent,yextent,zextent (%ld,%ld,%ld)\n ",xextent,yextent,zextent);
|
||||
|
||||
|
||||
|
||||
if (error = PANPHASIA_compute_coefficients_(&xorigin,&yorigin,&zorigin,&xextent,&yextent,
|
||||
&zextent, copy_list, &ncopy,
|
||||
ptr_real_output_coefficients,&flag_output_mode,&verbose)){
|
||||
return(100100+error);
|
||||
};
|
||||
|
||||
for (int j=0; j<4; j++){
|
||||
for (int i=0; i<4; i++) printf("(%lf ) ",ptr_real_output_coefficients[j+ i*ncopy]);
|
||||
printf("\n");
|
||||
};
|
||||
|
||||
{
|
||||
|
||||
size_t nfft_dim;
|
||||
nfft_dim = N0_grid;
|
||||
int rank;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
|
||||
char filename[100];
|
||||
sprintf(filename,"output_real_space_field.%d",rank);
|
||||
|
||||
FILE *fp;
|
||||
|
||||
fp = fopen(filename,"w");
|
||||
|
||||
for (int ix=0; ix<local_n0_return; ix++)
|
||||
for (int iy=0; iy < nfft_dim; iy++)
|
||||
for (int iz=0; iz < nfft_dim; iz++){
|
||||
int index = ix*N0_grid*(N0_grid+2) + iy*(N0_grid+2) + iz;
|
||||
fprintf(fp,"%6d%6d%6d %14.8lf %d\n",ix+local_0_start_return,iy,iz,
|
||||
ptr_real_output_coefficients[index],index);
|
||||
};
|
||||
|
||||
fclose(fp);
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
FFTW_MPI_EXECUTE_DFT_R2C(output_coeff_forward_plan,ptr_real_output_coefficients,ptr_cmplx_output_coefficients);
|
||||
|
||||
|
||||
for (int j=0; j<4; j++){
|
||||
for (int i=0; i<4; i++) printf("(%lf %lf) ",creal(ptr_cmplx_output_coefficients[j+ i*ncopy]),
|
||||
cimag(ptr_cmplx_output_coefficients[j + i*ncopy])); printf("\n");
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
// Compute 1-D Spherical Bessel coefficients for each order.
|
||||
size_t nfft_dim;
|
||||
nfft_dim = N0_grid;
|
||||
if (nfft_dim<N0_grid) nfft_dim=N0_grid; if (nfft_dim<N0_grid) nfft_dim=N0_grid;
|
||||
size_t n4dimen;
|
||||
n4dimen=(nfft_dim%4==0) ? 4*(nfft_dim/4)+4 : 4*(nfft_dim/4)+5;
|
||||
double complex *sph_bessel_coeff = FFTW_MALLOC(sizeof(double complex)*n4dimen*(pmax+1));
|
||||
|
||||
compute_sph_bessel_coeffs(nfft_dim, pmax, n4dimen, fdim, sph_bessel_coeff);
|
||||
|
||||
printf("Reached here! ndimen4 %ld\n",n4dimen);
|
||||
|
||||
|
||||
|
||||
|
||||
{
|
||||
size_t index1,index2;
|
||||
complex weight;
|
||||
size_t p_total = (pmax+1)*(pmax+2)*(pmax+3)/6;
|
||||
int m;
|
||||
memset(return_field,0, local_n0_return*N0_grid*N0_grid * sizeof(FFTW_COMPLEX));
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
#pragma omp parallel for collapse(3) \
|
||||
private (index1,index2,weight,m)
|
||||
#endif
|
||||
for(int ix=0;ix<local_n0_return;ix++)
|
||||
for(int iy=0;iy<nfft_dim;iy++)
|
||||
for(int iz=0;iz<=nfft_dim/2;iz++){
|
||||
index1 = ix*N0_grid*(N0_grid/2+1) + iy*(N0_grid/2+1) + iz;
|
||||
for (int m=0; m<p_total; m++){
|
||||
index2 = p_total*index1 + m;
|
||||
weight = sph_bessel_coeff[n4dimen*irank_p[0][m]+ix+local_0_start_return]*
|
||||
sph_bessel_coeff[n4dimen*irank_p[1][m]+iy]*
|
||||
sph_bessel_coeff[n4dimen*irank_p[2][m]+iz];
|
||||
return_field[index1] += weight * ptr_cmplx_output_coefficients[index2];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
printf("Reached here 10!\n");
|
||||
|
||||
//Add phase shift and normalise field
|
||||
{
|
||||
|
||||
double complex phase_shift_and_scale;
|
||||
int kx,ky,kz;
|
||||
const double pi = 4.0 * atan(1.0);
|
||||
size_t index1;
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
#pragma omp parallel for collapse(3) \
|
||||
private (index1,kx,ky,kz,phase_shift_and_scale)
|
||||
#endif
|
||||
for(int ix=0;ix<local_n0_return;ix++)
|
||||
for(int iy=0;iy<nfft_dim;iy++)
|
||||
for(int iz=0;iz<=nfft_dim/2;iz++){
|
||||
index1 = ix*N0_grid*(N0_grid/2+1) + iy*(N0_grid/2+1) + iz;
|
||||
kx = (ix+local_0_start_return>nfft_dim/2) ?
|
||||
ix + local_0_start_return - nfft_dim : ix + local_0_start_return;
|
||||
ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy;
|
||||
kz = (iz > nfft_dim/2) ? iz-nfft_dim : iz;
|
||||
|
||||
if ( (kx==nfft_dim/2)||(ky==nfft_dim/2)||(kz==nfft_dim/2)){
|
||||
// Set Nyquist modes to zero - not used by IC_Gen anyway.
|
||||
phase_shift_and_scale = 0.0; //1.0/pow((double)nfft_dim,1.5); // No phase shift
|
||||
}else{
|
||||
phase_shift_and_scale =
|
||||
cexp( (-I)*pi*(double)(kx + ky + kz)/(double)nfft_dim)/pow((double)nfft_dim,1.5);
|
||||
};
|
||||
|
||||
return_field[index1] *= phase_shift_and_scale;
|
||||
|
||||
};
|
||||
|
||||
|
||||
};
|
||||
|
||||
printf("Reached here 11!\n");
|
||||
|
||||
|
||||
|
||||
// Rescale selected Fourier modes to unit amplitude.
|
||||
// By default this part is not executed.
|
||||
|
||||
if (descriptor_kk_limit>0){
|
||||
size_t index1;
|
||||
complex weight;
|
||||
size_t ksquared;
|
||||
int kx,ky,kz;
|
||||
#ifdef USE_OPENMP
|
||||
#pragma omp parallel for collapse(3) \
|
||||
private (index1,kx,ky,kz,ksquared,weight)
|
||||
#endif
|
||||
for(int ix=0;ix<local_n0_return;ix++)
|
||||
for(int iy=0;iy<nfft_dim;iy++)
|
||||
for(int iz=0;iz<=nfft_dim/2;iz++){
|
||||
kx = (ix+local_0_start_return>nfft_dim/2) ?
|
||||
ix + local_0_start_return - nfft_dim : ix + local_0_start_return;
|
||||
ky = (iy > nfft_dim/2) ? iy-nfft_dim : iy;
|
||||
kz = (iz > nfft_dim/2) ? iz-nfft_dim : iz;
|
||||
ksquared = kx*kx + ky*ky + kz*kz;
|
||||
if (ksquared<=descriptor_kk_limit){
|
||||
index1 = ix*N0_grid*(N0_grid/2+1) + iy*(N0_grid/2+1) + iz;
|
||||
weight = cabs(return_field[index1]);
|
||||
return_field[index1] /= weight;
|
||||
};
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
printf("Reached here 12!\n");
|
||||
|
||||
|
||||
if (nfft_dim <128){
|
||||
|
||||
int rank;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
|
||||
char filename[100];
|
||||
sprintf(filename,"output_k_space_field.%d",rank);
|
||||
|
||||
int xuse,yuse,zuse;
|
||||
FFTW_REAL sign;
|
||||
|
||||
FILE *fp;
|
||||
|
||||
fp = fopen(filename,"w");
|
||||
|
||||
for (int ix=0; ix<local_n0_return; ix++)
|
||||
for (int iy=0; iy < nfft_dim; iy++)
|
||||
for (int iz=0; iz < nfft_dim; iz++){
|
||||
|
||||
|
||||
if (iz>nfft_dim/2){
|
||||
xuse = (nfft_dim-ix)%nfft_dim;
|
||||
yuse = (nfft_dim-iy)%nfft_dim;
|
||||
zuse = (nfft_dim-iz)%nfft_dim;
|
||||
sign = -1.0;
|
||||
}else{
|
||||
xuse = ix;
|
||||
yuse = iy;
|
||||
zuse = iz;
|
||||
sign = 1.0;
|
||||
};
|
||||
|
||||
int index = xuse*N0_grid*(N0_grid/2+1) + yuse*(N0_grid/2+1) + zuse;
|
||||
fprintf(fp,"%6d%6d%6d %14.8lf %14.8lf\n",ix+local_0_start_return,iy,iz,
|
||||
creal(return_field[index]),cimag(sign*return_field[index]));
|
||||
};
|
||||
fclose(fp);
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
printf("Reached here 14!\n");
|
||||
|
||||
for (int j=0; j<4; j++){
|
||||
for (int i=0; i<4; i++) printf("(%lf %lf) ",creal(return_field[j+ i*ncopy]),
|
||||
cimag(return_field[j + i*ncopy])); printf("\n");
|
||||
};
|
||||
|
||||
|
||||
|
||||
FFTW_FREE(output_coefficients);
|
||||
FFTW_FREE(sph_bessel_coeff);
|
||||
|
||||
|
||||
FFTW_DESTROY_PLAN(output_coeff_forward_plan);
|
||||
|
||||
printf("Reached here! 3 \n");
|
||||
return(0);
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
//==========================================================================================
|
||||
//==========================================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
|
93
external/panphasia_ho/panphasia_functions.h
vendored
Normal file
93
external/panphasia_ho/panphasia_functions.h
vendored
Normal file
|
@ -0,0 +1,93 @@
|
|||
|
||||
/////////////////////////////////////////////////
|
||||
// By default Panphasia is computed at single
|
||||
// precision. To override this define PAN_DOUBLE
|
||||
|
||||
#define PAN_DOUBLE_PRECISION 8
|
||||
|
||||
|
||||
#ifndef PAN_DOUBLE_PRECISION
|
||||
|
||||
#define PAN_REAL float
|
||||
#define PAN_COMPLEX float complex
|
||||
|
||||
#else
|
||||
|
||||
#define PAN_REAL double
|
||||
#define PAN_COMPLEX double complex
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
|
||||
void return_uniform_pseudo_rands_threefry4x64_(size_t l,size_t j1,size_t j2,size_t j3,
|
||||
PAN_REAL *panphasia_randoms, size_t seed_value,
|
||||
size_t allow_non_zero_seed_safety_catch);
|
||||
|
||||
void box_muller_(PAN_REAL *unif_real,PAN_REAL *gvar);
|
||||
|
||||
void solve_panphasia_cell_(PAN_REAL *input_vec_parent, PAN_REAL *input_vec_children, PAN_REAL *output_cell_vec, int control_flag);
|
||||
|
||||
void threefry4x64_test_(int verbose);
|
||||
void inverse_threefry4x64_test_(int verbose);
|
||||
void set_panphasia_key_(int verbose);
|
||||
void check_panphasia_key_(int verbose);
|
||||
|
||||
void PANPHASIA_init_descriptor_checks();
|
||||
|
||||
void speed_test_();
|
||||
void speed_test2_();
|
||||
void check_randoms_();
|
||||
void test_random_dist_(size_t shift);
|
||||
void compute_all_properties_of_a_panphasia_cell_(size_t *level, size_t *j1, size_t *j2, size_t *j3,
|
||||
PAN_REAL *gauss_rand_parent, PAN_REAL *legendre_rand);
|
||||
void return_root_legendre_coefficients_(PAN_REAL *root);
|
||||
|
||||
|
||||
int parse_and_validate_descriptor_(char *);
|
||||
int demo_descriptor_();
|
||||
long long int compute_check_digit_();
|
||||
int PANPHASIA_init_descriptor_(char *descriptor, int *verbose);
|
||||
int PANPHASIA_init_level_(size_t *oct_level, size_t *rel_orig_x, size_t *rel_orig_y,size_t *rel_orig_z,int *verbose);
|
||||
|
||||
|
||||
int PANPHASIA_compute_coefficients_(size_t *xstart, size_t *ystart, size_t*zstart,
|
||||
size_t *xextent, size_t *yextent, size_t *zextend,
|
||||
size_t *copy_list,
|
||||
size_t *ncopy, void *output_values, int *flag_output_mode, int *verbose);
|
||||
|
||||
void test_moments_();
|
||||
void test_propogation_of_moments_(int iterations);
|
||||
void test_cell_moments(char *,size_t, size_t, size_t, size_t, size_t, double *);
|
||||
|
||||
void spherical_bessel_(int *, double *, double *);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void calc_absolute_coordinates(size_t xrel, size_t yrel, size_t zrel,size_t *xabs, size_t *yabs,size_t *zabs);
|
||||
|
||||
int cell_information(size_t cell_id, size_t *cumulative_cell_index, size_t *cuboid_x_dimen,
|
||||
size_t *cuboid_y_dimen,size_t *cuboid_z_dimen, size_t *cell_lev,
|
||||
size_t *cell_x, size_t *cell_y, size_t *cell_z, size_t number_children,
|
||||
size_t *child_cell_indices);
|
||||
|
||||
int return_binary_tree_cell_lists(size_t level_max, size_t *list_cell_coordinates,
|
||||
size_t extent, size_t *return_tree_list_coordinates,
|
||||
size_t nreturn,
|
||||
long long int *child_pointer, size_t *level_count, size_t *level_begin, size_t *index_perm);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void compute_sph_bessel_coeffs(int, int, int, int, double complex *);
|
||||
|
||||
int PANPHASIA_compute_kspace_field_(size_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, FFTW_COMPLEX *);
|
874
external/panphasia_ho/threefry.h
vendored
Normal file
874
external/panphasia_ho/threefry.h
vendored
Normal file
|
@ -0,0 +1,874 @@
|
|||
/*
|
||||
Copyright 2010-2011, D. E. Shaw Research.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of D. E. Shaw Research nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#ifndef _threefry_dot_h_
|
||||
#define _threefry_dot_h_
|
||||
#include "features/compilerfeatures.h"
|
||||
#include "array.h"
|
||||
|
||||
/** \cond HIDDEN_FROM_DOXYGEN */
|
||||
/* Significant parts of this file were copied
|
||||
from:
|
||||
Skein_FinalRnd/ReferenceImplementation/skein.h
|
||||
Skein_FinalRnd/ReferenceImplementation/skein_block.c
|
||||
|
||||
in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
|
||||
|
||||
This file has been modified so that it may no longer perform its originally
|
||||
intended function. If you're looking for a Skein or Threefish source code,
|
||||
please consult the original file.
|
||||
|
||||
The original file had the following header:
|
||||
**************************************************************************
|
||||
**
|
||||
** Interface declarations and internal definitions for Skein hashing.
|
||||
**
|
||||
** Source code author: Doug Whiting, 2008.
|
||||
**
|
||||
** This algorithm and source code is released to the public domain.
|
||||
**
|
||||
***************************************************************************
|
||||
|
||||
*/
|
||||
|
||||
/* See comment at the top of philox.h for the macro pre-process
|
||||
strategy. */
|
||||
|
||||
/* Rotation constants: */
|
||||
/* Rotation amounts for the four-word, 64-bit variant: R_64x4_<i>_0 and
   R_64x4_<i>_1 are the two left-rotation counts that the threefry4x template
   below passes to RotL_64 in round i of each 8-round cycle. */
enum r123_enum_threefry64x4 {
|
||||
/* These are the R_256 constants from the Threefish reference sources
|
||||
with names changed to R_64x4... */
|
||||
R_64x4_0_0=14, R_64x4_0_1=16,
|
||||
R_64x4_1_0=52, R_64x4_1_1=57,
|
||||
R_64x4_2_0=23, R_64x4_2_1=40,
|
||||
R_64x4_3_0= 5, R_64x4_3_1=37,
|
||||
R_64x4_4_0=25, R_64x4_4_1=33,
|
||||
R_64x4_5_0=46, R_64x4_5_1=12,
|
||||
R_64x4_6_0=58, R_64x4_6_1=22,
|
||||
R_64x4_7_0=32, R_64x4_7_1=32
|
||||
};
|
||||
|
||||
/* Rotation amounts for the two-word, 64-bit variant: R_64x2_<i>_0 is the
   left-rotation count that the threefry2x template below passes to RotL_64 in
   round i of each 8-round cycle.  The comments inside the enum are recorded
   output of the rotation-constant search tool. */
enum r123_enum_threefry64x2 {
|
||||
/*
|
||||
// Output from skein_rot_search: (srs64_B64-X1000)
|
||||
// Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
|
||||
// Start: Tue Mar 1 10:07:48 2011
|
||||
// rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
|
||||
*/
|
||||
R_64x2_0_0=16,
|
||||
R_64x2_1_0=42,
|
||||
R_64x2_2_0=12,
|
||||
R_64x2_3_0=31,
|
||||
R_64x2_4_0=16,
|
||||
R_64x2_5_0=32,
|
||||
R_64x2_6_0=24,
|
||||
R_64x2_7_0=21
|
||||
/* 4 rounds: minHW = 4 [ 4 4 4 4 ]
|
||||
// 5 rounds: minHW = 8 [ 8 8 8 8 ]
|
||||
// 6 rounds: minHW = 16 [ 16 16 16 16 ]
|
||||
// 7 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||
// 8 rounds: minHW = 64 [ 64 64 64 64 ]
|
||||
// 9 rounds: minHW = 64 [ 64 64 64 64 ]
|
||||
//10 rounds: minHW = 64 [ 64 64 64 64 ]
|
||||
//11 rounds: minHW = 64 [ 64 64 64 64 ] */
|
||||
};
|
||||
|
||||
/* Rotation amounts for the four-word, 32-bit variant: R_32x4_<i>_0 and
   R_32x4_<i>_1 are the two left-rotation counts that the threefry4x template
   below passes to RotL_32 in round i of each 8-round cycle.  The comments
   inside the enum are recorded output of the rotation-constant search tool. */
enum r123_enum_threefry32x4 {
|
||||
/* Output from skein_rot_search: (srs-B128-X5000.out)
|
||||
// Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
|
||||
// Start: Mon Aug 24 22:41:36 2009
|
||||
// ...
|
||||
// rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
|
||||
R_32x4_0_0=10, R_32x4_0_1=26,
|
||||
R_32x4_1_0=11, R_32x4_1_1=21,
|
||||
R_32x4_2_0=13, R_32x4_2_1=27,
|
||||
R_32x4_3_0=23, R_32x4_3_1= 5,
|
||||
R_32x4_4_0= 6, R_32x4_4_1=20,
|
||||
R_32x4_5_0=17, R_32x4_5_1=11,
|
||||
R_32x4_6_0=25, R_32x4_6_1=10,
|
||||
R_32x4_7_0=18, R_32x4_7_1=20
|
||||
|
||||
/* 4 rounds: minHW = 3 [ 3 3 3 3 ]
|
||||
// 5 rounds: minHW = 7 [ 7 7 7 7 ]
|
||||
// 6 rounds: minHW = 12 [ 13 12 13 12 ]
|
||||
// 7 rounds: minHW = 22 [ 22 23 22 23 ]
|
||||
// 8 rounds: minHW = 31 [ 31 31 31 31 ]
|
||||
// 9 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||
//10 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||
//11 rounds: minHW = 32 [ 32 32 32 32 ] */
|
||||
|
||||
};
|
||||
|
||||
/* Rotation amounts for the two-word, 32-bit variant: R_32x2_<i>_0 is the
   left-rotation count that the threefry2x template below passes to RotL_32 in
   round i of each 8-round cycle.  The comments inside the enum are recorded
   output of the rotation-constant search tool. */
enum r123_enum_threefry32x2 {
|
||||
/* Output from skein_rot_search (srs32x2-X5000.out)
|
||||
// Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
|
||||
// Start: Tue Jul 12 11:11:33 2011
|
||||
// rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
|
||||
R_32x2_0_0=13,
|
||||
R_32x2_1_0=15,
|
||||
R_32x2_2_0=26,
|
||||
R_32x2_3_0= 6,
|
||||
R_32x2_4_0=17,
|
||||
R_32x2_5_0=29,
|
||||
R_32x2_6_0=16,
|
||||
R_32x2_7_0=24
|
||||
|
||||
/* 4 rounds: minHW = 4 [ 4 4 4 4 ]
|
||||
// 5 rounds: minHW = 6 [ 6 8 6 8 ]
|
||||
// 6 rounds: minHW = 9 [ 9 12 9 12 ]
|
||||
// 7 rounds: minHW = 16 [ 16 24 16 24 ]
|
||||
// 8 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||
// 9 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||
//10 rounds: minHW = 32 [ 32 32 32 32 ]
|
||||
//11 rounds: minHW = 32 [ 32 32 32 32 ] */
|
||||
};
|
||||
|
||||
/* NOTE(review): presumably the word counts of the 2- and 4-word variants.  No
   code in this part of the file references these constants (only the
   "XWCNT4-1" comments in the four-word template mention the name) -- confirm
   against the rest of the project before changing or removing them. */
enum r123_enum_threefry_wcnt {
|
||||
WCNT2=2,
|
||||
WCNT4=4
|
||||
};
|
||||
|
||||
#if R123_USE_64BIT
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
|
||||
{
|
||||
return (x << (N & 63)) | (x >> ((64-N) & 63));
|
||||
}
|
||||
#endif
|
||||
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
|
||||
{
|
||||
return (x << (N & 31)) | (x >> ((32-N) & 31));
|
||||
}
|
||||
|
||||
/* Assemble a 64-bit constant from two 32-bit halves.  Each half is narrowed to
   uint32_t before being widened, so a negative or over-wide argument cannot
   leak bits into the other half. */
#define SKEIN_MK_64(hi32,lo32) ((uint64_t)(uint32_t)(lo32) | ((uint64_t)(uint32_t)(hi32) << 32))
/* Parity words: the template macros XOR every key word into this value to
   obtain the extra key-schedule word (ks2 in the 2-word, ks4 in the 4-word
   variant). */
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_KS_PARITY32 0x1BD11BDA
|
||||
|
||||
/** \endcond */
|
||||
|
||||
/* Default number of rounds for each Threefry variant.  Each value is only
   defined here if the macro is not already defined, so it can be overridden
   before this point.  The template macros below use these values to
   initialise the threefryNxW_rounds enum constants. */
#ifndef THREEFRY2x32_DEFAULT_ROUNDS
|
||||
#define THREEFRY2x32_DEFAULT_ROUNDS 20
|
||||
#endif
|
||||
|
||||
#ifndef THREEFRY2x64_DEFAULT_ROUNDS
|
||||
#define THREEFRY2x64_DEFAULT_ROUNDS 20
|
||||
#endif
|
||||
|
||||
#ifndef THREEFRY4x32_DEFAULT_ROUNDS
|
||||
#define THREEFRY4x32_DEFAULT_ROUNDS 20
|
||||
#endif
|
||||
|
||||
#ifndef THREEFRY4x64_DEFAULT_ROUNDS
|
||||
#define THREEFRY4x64_DEFAULT_ROUNDS 20
|
||||
#endif
|
||||
|
||||
/* Template macro: instantiates the two-word Threefry generator for word width W.
   It declares the ctr/key/ukey typedefs (all struct r123array2x<W>), an
   identity keyinit function, threefry2x<W>_R(Nrounds, in, k), the
   threefry2x<W>_rounds enum constant and the default-round wrapper
   threefry2x<W>(in, k).
   threefry2x<W>_R first adds the key words to the counter words, then applies
   up to 32 rounds (checked with R123_ASSERT).  After every fourth round it
   adds two words of the three-word extended key (rotating through ks0, ks1 and
   ks2 = parity ^ ks0 ^ ks1) and adds the injection index r to X1.  The
   rotation constants repeat with period 8. */
#define _threefry2x_tpl(W) \
|
||||
typedef struct r123array2x##W threefry2x##W##_ctr_t; \
|
||||
typedef struct r123array2x##W threefry2x##W##_key_t; \
|
||||
typedef struct r123array2x##W threefry2x##W##_ukey_t; \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE \
|
||||
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
|
||||
uint##W##_t X0,X1; \
|
||||
uint##W##_t ks0, ks1, ks2; \
|
||||
R123_ASSERT(Nrounds<=32); \
|
||||
ks2 = SKEIN_KS_PARITY##W; \
|
||||
ks0 = k.v[0]; \
|
||||
X0 = in.v[0] + ks0; \
|
||||
ks2 ^= ks0; \
|
||||
\
|
||||
ks1 = k.v[1]; \
|
||||
X1 = in.v[1] + ks1; \
|
||||
ks2 ^= ks1; \
|
||||
\
|
||||
if(Nrounds>0){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
|
||||
if(Nrounds>1){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
|
||||
if(Nrounds>2){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
|
||||
if(Nrounds>3){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
|
||||
if(Nrounds>3){ \
|
||||
/* InjectKey(r=1) */ \
|
||||
X0 += ks1; X1 += ks2; \
|
||||
X1 += 1; /* X.v[2-1] += r */ \
|
||||
} \
|
||||
if(Nrounds>4){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
|
||||
if(Nrounds>5){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
|
||||
if(Nrounds>6){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
|
||||
if(Nrounds>7){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
|
||||
if(Nrounds>7){ \
|
||||
/* InjectKey(r=2) */ \
|
||||
X0 += ks2; X1 += ks0; \
|
||||
X1 += 2; \
|
||||
} \
|
||||
if(Nrounds>8){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
|
||||
if(Nrounds>9){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
|
||||
if(Nrounds>10){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
|
||||
if(Nrounds>11){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
|
||||
if(Nrounds>11){ \
|
||||
/* InjectKey(r=3) */ \
|
||||
X0 += ks0; X1 += ks1; \
|
||||
X1 += 3; \
|
||||
} \
|
||||
if(Nrounds>12){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
|
||||
if(Nrounds>13){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
|
||||
if(Nrounds>14){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
|
||||
if(Nrounds>15){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
|
||||
if(Nrounds>15){ \
|
||||
/* InjectKey(r=4) */ \
|
||||
X0 += ks1; X1 += ks2; \
|
||||
X1 += 4; \
|
||||
} \
|
||||
if(Nrounds>16){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
|
||||
if(Nrounds>17){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
|
||||
if(Nrounds>18){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
|
||||
if(Nrounds>19){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
|
||||
if(Nrounds>19){ \
|
||||
/* InjectKey(r=5) */ \
|
||||
X0 += ks2; X1 += ks0; \
|
||||
X1 += 5; \
|
||||
} \
|
||||
if(Nrounds>20){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
|
||||
if(Nrounds>21){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
|
||||
if(Nrounds>22){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
|
||||
if(Nrounds>23){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
|
||||
if(Nrounds>23){ \
|
||||
/* InjectKey(r=6) */ \
|
||||
X0 += ks0; X1 += ks1; \
|
||||
X1 += 6; \
|
||||
} \
|
||||
if(Nrounds>24){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
|
||||
if(Nrounds>25){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
|
||||
if(Nrounds>26){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
|
||||
if(Nrounds>27){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
|
||||
if(Nrounds>27){ \
|
||||
/* InjectKey(r=7) */ \
|
||||
X0 += ks1; X1 += ks2; \
|
||||
X1 += 7; \
|
||||
} \
|
||||
if(Nrounds>28){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
|
||||
if(Nrounds>29){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
|
||||
if(Nrounds>30){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
|
||||
if(Nrounds>31){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
|
||||
if(Nrounds>31){ \
|
||||
/* InjectKey(r=8) */ \
|
||||
X0 += ks2; X1 += ks0; \
|
||||
X1 += 8; \
|
||||
} \
|
||||
threefry2x##W##_ctr_t ret={{X0, X1}}; \
|
||||
return ret; \
|
||||
} \
|
||||
/** @ingroup ThreefryNxW */ \
|
||||
enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE \
|
||||
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
|
||||
return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
|
||||
}
|
||||
|
||||
|
||||
#define _threefry4x_tpl(W) \
|
||||
typedef struct r123array4x##W threefry4x##W##_ctr_t; \
|
||||
typedef struct r123array4x##W threefry4x##W##_key_t; \
|
||||
typedef struct r123array4x##W threefry4x##W##_ukey_t; \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE \
|
||||
threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
|
||||
uint##W##_t X0, X1, X2, X3; \
|
||||
uint##W##_t ks0, ks1, ks2, ks3, ks4; \
|
||||
R123_ASSERT(Nrounds<=72); \
|
||||
ks4 = SKEIN_KS_PARITY##W; \
|
||||
ks0 = k.v[0]; \
|
||||
X0 = in.v[0] + ks0; \
|
||||
ks4 ^= ks0; \
|
||||
\
|
||||
ks1 = k.v[1]; \
|
||||
X1 = in.v[1] + ks1; \
|
||||
ks4 ^= ks1; \
|
||||
\
|
||||
ks2 = k.v[2]; \
|
||||
X2 = in.v[2] + ks2; \
|
||||
ks4 ^= ks2; \
|
||||
\
|
||||
ks3 = k.v[3]; \
|
||||
X3 = in.v[3] + ks3; \
|
||||
ks4 ^= ks3; \
|
||||
\
|
||||
if(Nrounds>0){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>1){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>2){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>3){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>3){ \
|
||||
/* InjectKey(r=1) */ \
|
||||
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
|
||||
X3 += 1; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>4){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>5){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>6){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>7){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>7){ \
|
||||
/* InjectKey(r=2) */ \
|
||||
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
|
||||
X3 += 2; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>8){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>9){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>10){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>11){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>11){ \
|
||||
/* InjectKey(r=3) */ \
|
||||
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
|
||||
X3 += 3; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>12){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>13){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>14){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>15){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>15){ \
|
||||
/* InjectKey(r=4) */ \
|
||||
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
|
||||
X3 += 4; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>16){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>17){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>18){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>19){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>19){ \
|
||||
/* InjectKey(r=5) */ \
|
||||
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
|
||||
X3 += 5; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>20){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>21){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>22){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>23){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>23){ \
|
||||
/* InjectKey(r=6) */ \
|
||||
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
|
||||
X3 += 6; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>24){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>25){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>26){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>27){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>27){ \
|
||||
/* InjectKey(r=7) */ \
|
||||
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
|
||||
X3 += 7; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>28){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>29){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>30){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>31){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>31){ \
|
||||
/* InjectKey(r=8) */ \
|
||||
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
|
||||
X3 += 8; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>32){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>33){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>34){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>35){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>35){ \
|
||||
/* InjectKey(r=9) */ \
|
||||
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
|
||||
X3 += 9; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>36){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>37){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>38){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>39){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>39){ \
|
||||
/* InjectKey(r=10) */ \
|
||||
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
|
||||
X3 += 10; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>40){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>41){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>42){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>43){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>43){ \
|
||||
/* InjectKey(r=11) */ \
|
||||
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
|
||||
X3 += 11; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>44){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>45){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>46){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>47){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>47){ \
|
||||
/* InjectKey(r=12) */ \
|
||||
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
|
||||
X3 += 12; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>48){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>49){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>50){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>51){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>51){ \
|
||||
/* InjectKey(r=13) */ \
|
||||
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
|
||||
X3 += 13; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>52){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>53){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>54){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>55){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>55){ \
|
||||
/* InjectKey(r=14) */ \
|
||||
X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
|
||||
X3 += 14; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>56){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>57){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>58){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>59){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>59){ \
|
||||
/* InjectKey(r=15) */ \
|
||||
X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
|
||||
X3 += 15; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>60){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>61){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>62){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>63){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>63){ \
|
||||
/* InjectKey(r=16) */ \
|
||||
X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
|
||||
X3 += 16; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>64){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>65){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>66){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>67){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>67){ \
|
||||
/* InjectKey(r=17) */ \
|
||||
X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
|
||||
X3 += 17; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
if(Nrounds>68){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>69){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>70){ \
|
||||
X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
|
||||
X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>71){ \
|
||||
X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
|
||||
X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
|
||||
} \
|
||||
if(Nrounds>71){ \
|
||||
/* InjectKey(r=18) */ \
|
||||
X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
|
||||
X3 += 18; /* XWCNT4-1 += r */ \
|
||||
} \
|
||||
\
|
||||
threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
|
||||
return ret; \
|
||||
} \
|
||||
\
|
||||
/** @ingroup ThreefryNxW */ \
|
||||
enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
|
||||
R123_CUDA_DEVICE R123_STATIC_INLINE \
|
||||
threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
|
||||
return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
|
||||
}
|
||||
|
||||
#if R123_USE_64BIT
|
||||
_threefry2x_tpl(64)
|
||||
_threefry4x_tpl(64)
|
||||
#endif
|
||||
_threefry2x_tpl(32)
|
||||
_threefry4x_tpl(32)
|
||||
|
||||
/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
|
||||
than a static inline function. Why? */
|
||||
/* Function-like macros with the same names as the default-round inline
   wrappers generated by the template macros above; from here on a call such
   as threefry4x64(c, k) expands directly to the _R function with the default
   round count.  Note that the 64-bit macros are defined here regardless of
   R123_USE_64BIT, although the 64-bit _R functions are only instantiated
   when it is non-zero. */
#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
|
||||
#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
|
||||
#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
|
||||
#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
|
||||
|
||||
#if defined(__cplusplus)
|
||||
/* C++ only.  Template macro that wraps threefry<NxW>_R in the functor template
   r123::Threefry<NxW>_R<ROUNDS> (operator() forwards ROUNDS, ctr and key to
   the C function) and typedefs r123::Threefry<NxW> to the instantiation with
   the default round count threefry<NxW>_rounds.
   NOTE(review): the static assert accepts ROUNDS up to 72 for every NxW, but
   the two-word template above asserts Nrounds<=32 at run time -- confirm
   whether the 2x instantiations should get a tighter compile-time limit. */
#define _threefryNxWclass_tpl(NxW) \
|
||||
namespace r123{ \
|
||||
template<unsigned int ROUNDS> \
|
||||
struct Threefry##NxW##_R{ \
|
||||
typedef threefry##NxW##_ctr_t ctr_type; \
|
||||
typedef threefry##NxW##_key_t key_type; \
|
||||
typedef threefry##NxW##_key_t ukey_type; \
|
||||
static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
|
||||
inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
|
||||
R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \
|
||||
return threefry##NxW##_R(ROUNDS, ctr, key); \
|
||||
} \
|
||||
}; \
|
||||
typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
|
||||
} // namespace r123
|
||||
|
||||
_threefryNxWclass_tpl(2x32)
|
||||
_threefryNxWclass_tpl(4x32)
|
||||
#if R123_USE_64BIT
|
||||
_threefryNxWclass_tpl(2x64)
|
||||
_threefryNxWclass_tpl(4x64)
|
||||
#endif
|
||||
|
||||
/* The _tpl macros don't quite work to do string-pasting inside comments,
|
||||
so we just write out the boilerplate documentation four times... */
|
||||
|
||||
/**
|
||||
@defgroup ThreefryNxW Threefry Classes and Typedefs
|
||||
|
||||
The ThreefryNxW classes export the member functions, typedefs and
|
||||
operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||
|
||||
As described in
|
||||
<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>,
|
||||
the Threefry family is closely related to the Threefish block cipher from
|
||||
<a href="http://www.skein-hash.info/"> Skein Hash Function</a>.
|
||||
Threefry is \b not suitable for cryptographic use.
|
||||
|
||||
Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
|
||||
|
||||
@class r123::Threefry2x32_R
|
||||
@ingroup ThreefryNxW
|
||||
|
||||
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||
|
||||
The template argument, ROUNDS, is the number of times the Threefry round
|
||||
function will be applied.
|
||||
|
||||
As of September 2011, the authors know of no statistical flaws with
|
||||
ROUNDS=13 or more for Threefry2x32.
|
||||
|
||||
@typedef r123::Threefry2x32
|
||||
@ingroup ThreefryNxW
|
||||
Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds,
|
||||
Threefry2x32 has a considerable safety margin over the minimum number
|
||||
of rounds with no known statistical flaws, but still has excellent
|
||||
performance.
|
||||
|
||||
@class r123::Threefry2x64_R
|
||||
@ingroup ThreefryNxW
|
||||
|
||||
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||
|
||||
The template argument, ROUNDS, is the number of times the Threefry round
|
||||
function will be applied.
|
||||
|
||||
In November 2011, the authors discovered that 13 rounds of
|
||||
Threefry2x64 sequenced by strided, interleaved key and counter
|
||||
increments failed a very long (longer than the default BigCrush
|
||||
length) WeightDistrib test. At the same time, it was confirmed that
|
||||
14 rounds passes much longer tests (up to 5x10^12 samples) of a
|
||||
similar nature. The authors know of no statistical flaws with
|
||||
ROUNDS=14 or more for Threefry2x64.
|
||||
|
||||
@typedef r123::Threefry2x64
|
||||
@ingroup ThreefryNxW
|
||||
Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds,
|
||||
Threefry2x64 has a considerable safety margin over the minimum number
|
||||
of rounds with no known statistical flaws, but still has excellent
|
||||
performance.
|
||||
|
||||
|
||||
|
||||
@class r123::Threefry4x32_R
|
||||
@ingroup ThreefryNxW
|
||||
|
||||
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||
|
||||
The template argument, ROUNDS, is the number of times the Threefry round
|
||||
function will be applied.
|
||||
|
||||
As of September 2011, the authors know of no statistical flaws with
|
||||
ROUNDS=12 or more for Threefry4x32.
|
||||
|
||||
@typedef r123::Threefry4x32
|
||||
@ingroup ThreefryNxW
|
||||
Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds,
|
||||
Threefry4x32 has a considerable safety margin over the minimum number
|
||||
of rounds with no known statistical flaws, but still has excellent
|
||||
performance.
|
||||
|
||||
|
||||
|
||||
@class r123::Threefry4x64_R
|
||||
@ingroup ThreefryNxW
|
||||
|
||||
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
|
||||
|
||||
The template argument, ROUNDS, is the number of times the Threefry round
|
||||
function will be applied.
|
||||
|
||||
As of September 2011, the authors know of no statistical flaws with
|
||||
ROUNDS=12 or more for Threefry4x64.
|
||||
|
||||
@typedef r123::Threefry4x64
|
||||
@ingroup ThreefryNxW
|
||||
Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds,
|
||||
Threefry4x64 has a considerable safety margin over the minimum number
|
||||
of rounds with no known statistical flaws, but still has excellent
|
||||
performance.
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
1033
external/panphasia_ho/uniform_rand_threefry4x64.c
vendored
Normal file
1033
external/panphasia_ho/uniform_rand_threefry4x64.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue