/*  This file is part of the Vc library.

    Copyright (C) 2011-2012 Matthias Kretz

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/

#include "limits.h"
#include "const.h"
#include "macros.h"

namespace ROOT {
namespace Vc
{
ALIGN(64) extern unsigned int RandomState[16];

namespace AVX
{

///////////////////////////////////////////////////////////////////////////////////////////
// constants {{{1
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {}
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {}
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
    : d(HV::load(IndexesFromZeroData<T>::address(), Aligned)) {}

template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::Zero() { return HT::zero(); }
template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::One() { return HT::one(); }
template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::IndexesFromZero() { return HV::load(IndexesFromZeroData<T>::address(), Aligned); }

template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(VC_ALIGNED_PARAMETER(Vector<OtherT>) x)
    : d(StaticCastHelper<OtherT, T>::cast(x.data())) {}

template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(EntryType x) : d(HT::set(x)) {}
template<> Vc_ALWAYS_INLINE Vector<double>::Vector(EntryType x) : d(_mm256_set1_pd(x)) {}

///////////////////////////////////////////////////////////////////////////////////////////
// load ctors {{{1
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }

///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem)
{
    load(mem, Aligned);
}
template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align)
{
    d.v() = HV::load(mem, align);
}
template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem)
{
    load(mem, Aligned);
}

// LoadHelper {{{2
template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;

// float {{{2
template<typename Flags> struct LoadHelper<float, double, Flags> {
    static m256 load(const double *mem, Flags f)
    {
        return concat(_mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[0], f)),
                      _mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[4], f)));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
    static m256 load(const unsigned int *mem, Flags f)
    {
        return StaticCastHelper<unsigned int, float>::cast(VectorHelper<m256i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, int, Flags> {
    static m256 load(const int *mem, Flags f)
    {
        return StaticCastHelper<int, float>::cast(VectorHelper<m256i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
    static m256 load(const unsigned short *mem, Flags f)
    {
        return StaticCastHelper<unsigned short, float>::cast(VectorHelper<m128i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, short, Flags> {
    static m256 load(const short *mem, Flags f)
    {
        return StaticCastHelper<short, float>::cast(VectorHelper<m128i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
    static m256 load(const unsigned
char *mem, Flags f) { return StaticCastHelper::cast(LoadHelper::load(mem, f)); } }; template struct LoadHelper { static m256 load(const signed char *mem, Flags f) { return StaticCastHelper::cast(LoadHelper::load(mem, f)); } }; template struct LoadHelper : public LoadHelper {}; // int {{{2 template struct LoadHelper { static m256i load(const unsigned int *mem, Flags f) { return VectorHelper::load(mem, f); } }; template struct LoadHelper { static m256i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epu16 = _mm_cvtepu8_epi16(epu8); return StaticCastHelper::cast(epu16); } }; template struct LoadHelper { static m256i load(const signed char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epi16 = _mm_cvtepi8_epi16(epi8); return StaticCastHelper::cast(epi16); } }; // unsigned int {{{2 template struct LoadHelper { static m256i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epu16 = _mm_cvtepu8_epi16(epu8); return StaticCastHelper::cast(epu16); } }; // short {{{2 template struct LoadHelper { static m128i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m128i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepu8_epi16(epu8); } }; template struct LoadHelper { static m128i load(const signed char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepi8_epi16(epi8); } }; // unsigned short {{{2 template struct LoadHelper { static m128i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepu8_epi16(epu8); } }; // general load, implemented via LoadHelper {{{2 template template Vc_INTRINSIC void Vector::load(const SrcT *x, Flags f) { d.v() = LoadHelper::load(x, f); } /////////////////////////////////////////////////////////////////////////////////////////// // zeroing {{{1 template Vc_INTRINSIC void 
Vector::setZero() { data() = HV::zero(); } template Vc_INTRINSIC void Vector::setZero(const Mask &k) { data() = HV::andnot_(avx_cast(k.data()), data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_pd(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) { data() = _mm256_or_pd(data(), k.dataD()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) { data() = _mm256_or_ps(data(), k.data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) { data() = _mm256_or_ps(data(), k.data()); } /////////////////////////////////////////////////////////////////////////////////////////// // stores {{{1 template Vc_INTRINSIC void Vector::store(EntryType *mem) const { HV::store(mem, data(), Aligned); } template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask) const { HV::store(mem, data(), avx_cast(mask.data()), Aligned); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, A align) const { HV::store(mem, data(), align); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask, A align) const { HV::store(mem, data(), avx_cast(mask.data()), align); } /////////////////////////////////////////////////////////////////////////////////////////// // expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX {{{1 template Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(a[0]) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data()))) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data()))) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data()))) { } template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0] = *this; } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = _mm256_cvtps_pd(lo128(d.v())); x[1].data() = _mm256_cvtps_pd(hi128(d.v())); } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = concat(_mm_cvtepi16_epi32(d.v()), _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = concat(_mm_cvtepu16_epi32(d.v()), _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); } /////////////////////////////////////////////////////////////////////////////////////////// // swizzles {{{1 template Vc_INTRINSIC const Vector Vc_PURE &Vector::abcd() const { return *this; } template Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE 
Vector::bcda() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cdab() const { return Mem::shuffle128(data(), data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::badc() const { return Mem::permute(data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcad() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcda() const { return Mem::shuffle(data(), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dabc() const { return Mem::shuffle(Mem::shuffle128(data(), data()), data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::acbd() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dbca() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dcba() const { return cdab().badc(); } #define VC_SWIZZLES_16BIT_IMPL(T) \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcda() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } VC_SWIZZLES_16BIT_IMPL(short) VC_SWIZZLES_16BIT_IMPL(unsigned short) #undef VC_SWIZZLES_16BIT_IMPL /////////////////////////////////////////////////////////////////////////////////////////// // division {{{1 template inline Vector &Vector::operator/=(EntryType x) { if 
(HasVectorDivision) { return operator/=(Vector(x)); } for_all_vector_entries(i, d.m(i) /= x; ); return *this; } template template inline Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) Vector::operator/(TT x) const { if (HasVectorDivision) { return operator/(Vector(x)); } Vector r; for_all_vector_entries(i, r.d.m(i) = d.m(i) / x; ); return r; } // per default fall back to scalar division template inline Vector &Vector::operator/=(const Vector &x) { for_all_vector_entries(i, d.m(i) /= x.d.m(i); ); return *this; } template inline Vector Vc_PURE Vector::operator/(const Vector &x) const { Vector r; for_all_vector_entries(i, r.d.m(i) = d.m(i) / x.d.m(i); ); return r; } // specialize division on type static Vc_INTRINSIC m256i Vc_CONST divInt(param256i a, param256i b) { const m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); const m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); const m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); const m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); return concat( _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)) ); } template<> inline Vector &Vector::operator/=(const Vector &x) { d.v() = divInt(d.v(), x.d.v()); return *this; } template<> inline Vector Vc_PURE Vector::operator/(const Vector &x) const { return divInt(d.v(), x.d.v()); } static inline m256i Vc_CONST divUInt(param256i a, param256i b) { m256d loa = _mm256_cvtepi32_pd(lo128(a)); m256d hia = _mm256_cvtepi32_pd(hi128(a)); m256d lob = _mm256_cvtepi32_pd(lo128(b)); m256d hib = _mm256_cvtepi32_pd(hi128(b)); // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. a-2^32) // to get the right number back we have to add 2^32 where a >= 2^31 loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); // we don't do the same for b because division by b >= 2^31 should be a seldom corner case and // we rather want the standard stuff fast // // there is one remaining problem: a >= 2^31 and b == 1 // in that case the return value would be 2^31 return avx_cast(_mm256_blendv_ps(avx_cast(concat( _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)) )), avx_cast(a), avx_cast(concat( _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()), _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32()))))); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divUInt(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divUInt(d.v(), x.d.v()); } template static inline m128i Vc_CONST divShort(param128i a, param128i b) { const m256 r = _mm256_div_ps(StaticCastHelper::cast(a), StaticCastHelper::cast(b)); return StaticCastHelper::cast(r); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divShort(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divShort(d.v(), x.d.v()); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divShort(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divShort(d.v(), x.d.v()); } template<> Vc_INTRINSIC float_v &float_v::operator/=(const float_v &x) { d.v() = _mm256_div_ps(d.v(), x.d.v()); return *this; } template<> 
Vc_INTRINSIC float_v Vc_PURE float_v::operator/(const float_v &x) const { return _mm256_div_ps(d.v(), x.d.v()); } template<> Vc_INTRINSIC sfloat_v &sfloat_v::operator/=(const sfloat_v &x) { d.v() = _mm256_div_ps(d.v(), x.d.v()); return *this; } template<> Vc_INTRINSIC sfloat_v Vc_PURE sfloat_v::operator/(const sfloat_v &x) const { return _mm256_div_ps(d.v(), x.d.v()); } template<> Vc_INTRINSIC double_v &double_v::operator/=(const double_v &x) { d.v() = _mm256_div_pd(d.v(), x.d.v()); return *this; } template<> Vc_INTRINSIC double_v Vc_PURE double_v::operator/(const double_v &x) const { return _mm256_div_pd(d.v(), x.d.v()); } /////////////////////////////////////////////////////////////////////////////////////////// // integer ops {{{1 #define OP_IMPL(T, symbol) \ template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) \ { \ for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \ return *this; \ } \ template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const \ { \ Vector r; \ for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \ return r; \ } OP_IMPL(int, <<) OP_IMPL(int, >>) OP_IMPL(unsigned int, <<) OP_IMPL(unsigned int, >>) OP_IMPL(short, <<) OP_IMPL(short, >>) OP_IMPL(unsigned short, <<) OP_IMPL(unsigned short, >>) #undef OP_IMPL template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { d.v() = VectorHelper::shiftRight(d.v(), shift); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { return VectorHelper::shiftRight(d.v(), shift); } template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { d.v() = VectorHelper::shiftLeft(d.v(), shift); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { return VectorHelper::shiftLeft(d.v(), shift); } #define OP_IMPL(T, symbol, fun) \ template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \ template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const { return Vector(HV::fun(d.v(), x.d.v())); } OP_IMPL(int, &, and_) OP_IMPL(int, |, or_) OP_IMPL(int, ^, xor_) OP_IMPL(unsigned int, &, and_) OP_IMPL(unsigned int, |, or_) OP_IMPL(unsigned int, ^, xor_) OP_IMPL(short, &, and_) OP_IMPL(short, |, or_) OP_IMPL(short, ^, xor_) OP_IMPL(unsigned short, &, and_) OP_IMPL(unsigned short, |, or_) OP_IMPL(unsigned short, ^, xor_) OP_IMPL(float, &, and_) OP_IMPL(float, |, or_) OP_IMPL(float, ^, xor_) OP_IMPL(sfloat, &, and_) OP_IMPL(sfloat, |, or_) OP_IMPL(sfloat, ^, xor_) OP_IMPL(double, &, and_) OP_IMPL(double, |, or_) OP_IMPL(double, ^, xor_) #undef OP_IMPL // operators {{{1 #include "../common/operators.h" // isNegative {{{1 template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const { return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); } template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const { return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); } template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const { return Mem::permute(avx_cast( _mm256_srai_epi32(avx_cast(_mm256_and_pd(_mm256_setsignmask_pd(), d.v())), 31) )); } // gathers {{{1 // Better implementation (hopefully) with _mm256_set_ //X template template Vector::Vector(const EntryType *mem, const Index *indexes) //X { //X for_all_vector_entries(int i, //X d.m(i) = mem[indexes[i]]; //X ); //X } template template 
Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, member2, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, member2, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { gather(array, ptrMember1, outerIndexes, innerIndexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) : d(HT::zero()) { gather(array, ptrMember1, outerIndexes, innerIndexes, mask); } template struct IndexSizeChecker { static void check() {} }; template struct IndexSizeChecker, Size> { static void check() { VC_STATIC_ASSERT(Vector::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries); } }; template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(mem[indexes[0]], 
mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } #ifdef VC_USE_SET_GATHERS template template Vc_ALWAYS_INLINE void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) { IndexSizeChecker, Size>::check(); Vector indexesTmp = indexes; indexesTmp.setZero(!mask); (*this)(mask) = Vector(mem, indexesTmp); } #endif #ifdef VC_USE_BSF_GATHERS #define VC_MASKED_GATHER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits &= ~(1 << i); /* btr? */ \ d.m(i) = ith_value(i); \ } #elif defined(VC_USE_POPCNT_BSF_GATHERS) #define VC_MASKED_GATHER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (_mm_popcnt_u32(bits)) { \ case 8: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 6: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 4: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 2: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ case 1: \ low = _bit_scan_forward(bits); \ d.m(low) = ith_value(low); \ case 0: \ break; \ } #else #define VC_MASKED_GATHER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) d.m(i) = ith_value(i); \ ); #endif template template Vc_INTRINSIC void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (mem[indexes[_i_]]) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = 
_mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), 
array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) 
innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template 
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) { IndexSizeChecker::check(); IndexSizeChecker::check(); #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_GATHER #undef ith_value } #undef VC_MASKED_GATHER #ifdef VC_USE_BSF_SCATTERS #define VC_MASKED_SCATTER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits ^= (1 << i); /* btr? */ \ ith_value(i) = d.m(i); \ } #elif defined(VC_USE_POPCNT_BSF_SCATTERS) #define VC_MASKED_SCATTER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (_mm_popcnt_u32(bits)) { \ case 8: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 6: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 4: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 2: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ case 1: \ low = _bit_scan_forward(bits); \ ith_value(low) = d.m(low); \ case 0: \ break; \ } #else #define VC_MASKED_SCATTER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) ith_value(i) = d.m(i); \ ); #endif template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { for_all_vector_entries(i, mem[indexes[i]] = d.m(i); ); } #if defined(VC_MSVC) && VC_MSVC >= 170000000 // MSVC miscompiles the store mem[indexes[1]] = d.m(1) for T = (u)short template<> template Vc_ALWAYS_INLINE void short_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { const unsigned int tmp = d.v()._d.m128i_u32[0]; mem[indexes[0]] = tmp & 0xffff; mem[indexes[1]] = tmp >> 16; mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); mem[indexes[3]] = _mm_extract_epi16(d.v(), 3); mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); mem[indexes[6]] = _mm_extract_epi16(d.v(), 6); mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); } template<> template Vc_ALWAYS_INLINE void ushort_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { const unsigned int tmp = d.v()._d.m128i_u32[0]; mem[indexes[0]] = tmp & 0xffff; mem[indexes[1]] = tmp >> 16; mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); mem[indexes[3]] = _mm_extract_epi16(d.v(), 
3); mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); mem[indexes[6]] = _mm_extract_epi16(d.v(), 6); mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); } #endif template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const { #define ith_value(_i_) mem[indexes[_i_]] VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1).*(member2) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const { for_all_vector_entries(i, (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const { #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_SCATTER #undef ith_value } /////////////////////////////////////////////////////////////////////////////////////////// // operator- {{{1 template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_sign_epi16(d.v(), _mm_setallone_si128()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_sign_epi16(d.v(), _mm_setallone_si128()); } /////////////////////////////////////////////////////////////////////////////////////////// // horizontal ops {{{1 template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::min(MaskArg m) const { Vector tmp = std::numeric_limits >::max(); tmp(m) = *this; return tmp.min(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::max(MaskArg m) const { Vector tmp = std::numeric_limits >::min(); tmp(m) = *this; return tmp.max(); } 
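// Illustration only (not part of the Vc API; maskedMinExample is a made-up name): the masked
// reductions above fill a temporary with the identity element of the reduction (numeric max
// for min(), numeric min for max(), 1 for product(), 0 for sum()), overwrite the active lanes
// through the mask, and then run the unmasked reduction. A minimal sketch of the same idea
// written directly with AVX intrinsics:
static inline float maskedMinExample(m256 v, m256 mask)
{
    // lanes where the mask is zero contribute the identity element instead of v
    const m256 identity = _mm256_set1_ps(std::numeric_limits<float>::max());
    const m256 tmp = _mm256_blendv_ps(identity, v, mask);
    float a[8];
    _mm256_storeu_ps(a, tmp);
    float result = a[0];
    for (int i = 1; i < 8; ++i) {
        result = a[i] < result ? a[i] : result;
    }
    return result;
}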
template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::product(MaskArg m) const { Vector tmp(VectorSpecialInitializerOne::One); tmp(m) = *this; return tmp.product(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::sum(MaskArg m) const { Vector tmp(VectorSpecialInitializerZero::Zero); tmp(m) = *this; return tmp.sum(); }//}}} // copySign {{{1 template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { return _mm256_or_ps( _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) ); } template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { return _mm256_or_ps( _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) ); } template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { return _mm256_or_pd( _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()), _mm256_and_pd(d.v(), _mm256_setabsmask_pd()) ); }//}}}1 // exponent {{{1 template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.).isFull()); return Internal::exponent(d.v()); } // }}}1 // Random {{{1 static Vc_ALWAYS_INLINE void _doRandomStep(Vector &state0, Vector &state1) { state0.load(&Vc::RandomState[0]); state1.load(&Vc::RandomState[uint_v::Size]); (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]); } template Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return state0.reinterpretCast >(); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { const m256i state = VectorHelper::load(&Vc::RandomState[0], Vc::Aligned); for (size_t k = 0; k < 8; k += 2) { typedef unsigned long long uint64 Vc_MAY_ALIAS; const uint64 stateX = *reinterpret_cast(&Vc::RandomState[k]); *reinterpret_cast(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11); } return (Vector(_cast(_mm256_srli_epi64(state, 12))) | One()) - One(); } // }}}1 // shifted / rotated {{{1 template struct VectorShift; template<> struct VectorShift<32, 4, m256d, double> { static Vc_INTRINSIC m256d shifted(param256d v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(double))); case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(double))); case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(double))); case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(double))); case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(double))); case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(double))); } return _mm256_setzero_pd(); } }; template struct VectorShift<32, 8, VectorType, EntryType> { typedef typename SseVectorType::Type 
SmallV; static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(EntryType))); case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(EntryType))); case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(EntryType))); case 4: return avx_cast(_mm256_srli_si256(avx_cast(v), 4 * sizeof(EntryType))); case 5: return avx_cast(_mm256_srli_si256(avx_cast(v), 5 * sizeof(EntryType))); case 6: return avx_cast(_mm256_srli_si256(avx_cast(v), 6 * sizeof(EntryType))); case 7: return avx_cast(_mm256_srli_si256(avx_cast(v), 7 * sizeof(EntryType))); case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(EntryType))); case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(EntryType))); case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(EntryType))); case -4: return avx_cast(_mm256_slli_si256(avx_cast(v), 4 * sizeof(EntryType))); case -5: return avx_cast(_mm256_slli_si256(avx_cast(v), 5 * sizeof(EntryType))); case -6: return avx_cast(_mm256_slli_si256(avx_cast(v), 6 * sizeof(EntryType))); case -7: return avx_cast(_mm256_slli_si256(avx_cast(v), 7 * sizeof(EntryType))); } return avx_cast(_mm256_setzero_ps()); } }; template struct VectorShift<16, 8, VectorType, EntryType> { enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm_srli_si128(avx_cast(v), 1 * EntryTypeSizeof)); case 2: return avx_cast(_mm_srli_si128(avx_cast(v), 2 * EntryTypeSizeof)); case 3: return avx_cast(_mm_srli_si128(avx_cast(v), 3 * EntryTypeSizeof)); case 4: return avx_cast(_mm_srli_si128(avx_cast(v), 4 * EntryTypeSizeof)); case 5: return avx_cast(_mm_srli_si128(avx_cast(v), 5 * EntryTypeSizeof)); case 6: return avx_cast(_mm_srli_si128(avx_cast(v), 6 * EntryTypeSizeof)); case 7: return avx_cast(_mm_srli_si128(avx_cast(v), 7 * EntryTypeSizeof)); case -1: return avx_cast(_mm_slli_si128(avx_cast(v), 1 * EntryTypeSizeof)); case -2: return avx_cast(_mm_slli_si128(avx_cast(v), 2 * EntryTypeSizeof)); case -3: return avx_cast(_mm_slli_si128(avx_cast(v), 3 * EntryTypeSizeof)); case -4: return avx_cast(_mm_slli_si128(avx_cast(v), 4 * EntryTypeSizeof)); case -5: return avx_cast(_mm_slli_si128(avx_cast(v), 5 * EntryTypeSizeof)); case -6: return avx_cast(_mm_slli_si128(avx_cast(v), 6 * EntryTypeSizeof)); case -7: return avx_cast(_mm_slli_si128(avx_cast(v), 7 * EntryTypeSizeof)); } return _mm_setzero_si128(); } }; template Vc_INTRINSIC Vector Vector::shifted(int amount) const { return VectorShift::shifted(d.v(), amount); } template struct VectorRotate; template struct VectorRotate<32, 4, VectorType, EntryType> { typedef typename SseVectorType::Type SmallV; enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { const m128i vLo = avx_cast(lo128(v)); const m128i vHi = avx_cast(hi128(v)); switch (static_cast(amount) % 4) { case 0: return v; case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); case 2: return Mem::permute128(v); case 3: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); } return _mm256_setzero_pd(); } }; template struct VectorRotate<32, 8, VectorType, EntryType> { 
typedef typename SseVectorType::Type SmallV; enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { const m128i vLo = avx_cast(lo128(v)); const m128i vHi = avx_cast(hi128(v)); switch (static_cast(amount) % 8) { case 0: return v; case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); case 2: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof))); case 3: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof))); case 4: return Mem::permute128(v); case 5: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); case 6: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof))); case 7: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof))); } return avx_cast(_mm256_setzero_ps()); } }; template struct VectorRotate<16, 8, VectorType, EntryType> { enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (static_cast(amount) % 8) { case 0: return v; case 1: return avx_cast(_mm_alignr_epi8(v, v, 1 * EntryTypeSizeof)); case 2: return avx_cast(_mm_alignr_epi8(v, v, 2 * EntryTypeSizeof)); case 3: return avx_cast(_mm_alignr_epi8(v, v, 3 * EntryTypeSizeof)); case 4: return avx_cast(_mm_alignr_epi8(v, v, 4 * EntryTypeSizeof)); case 5: return avx_cast(_mm_alignr_epi8(v, v, 5 * EntryTypeSizeof)); case 6: return avx_cast(_mm_alignr_epi8(v, v, 6 * EntryTypeSizeof)); case 7: return avx_cast(_mm_alignr_epi8(v, v, 7 * EntryTypeSizeof)); } return _mm_setzero_si128(); } }; template Vc_INTRINSIC Vector Vector::rotated(int amount) const { return VectorRotate::rotated(d.v(), amount); /* const m128i v0 = avx_cast(d.v()[0]); const m128i v1 = avx_cast(d.v()[1]); switch (static_cast(amount) % Size) { case 0: return *this; case 1: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType)))); case 2: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType)))); case 3: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType)))); case 4: return concat(d.v()[1], d.v()[0]); case 5: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType)))); case 6: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType)))); case 7: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType)))); } */ } // }}}1 } // namespace AVX } // namespace Vc } // namespace ROOT #include "undomacros.h" // vim: foldmethod=marker
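// Illustration only (not part of Vc; divUIntSketch is a made-up name, and the intrinsic types
// come from the headers already pulled in above): the unsigned-integer division in divUInt
// converts the 32-bit lanes to double with the *signed* conversion, so any input >= 2^31 comes
// out negative (value - 2^32). Adding 2^32 to exactly those lanes restores the unsigned value
// before the double division; the quotient is then truncated back to 32-bit lanes. The b == 1
// and b >= 2^31 corner cases handled in divUInt are left out of this sketch.
static inline __m128i divUIntSketch(__m128i a, __m128i b)
{
    __m256d da = _mm256_cvtepi32_pd(a);       // signed conversion: a >= 2^31 becomes a - 2^32
    const __m256d db = _mm256_cvtepi32_pd(b); // b is assumed to be < 2^31 here
    // add 2^32 to the lanes that came out negative to recover the unsigned value
    const __m256d wrapped = _mm256_cmp_pd(da, _mm256_setzero_pd(), _CMP_LT_OS);
    da = _mm256_add_pd(da, _mm256_and_pd(wrapped, _mm256_set1_pd(4294967296.)));
    // divide in double precision and truncate back to 32-bit integer lanes
    return _mm256_cvttpd_epi32(_mm256_div_pd(da, db));
}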