/*  This file is part of the Vc library.

    Copyright (C) 2011-2012 Matthias Kretz

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/

#include "limits.h"
#include "const.h"
#include "macros.h"

namespace ROOT {
namespace Vc
{
ALIGN(64) extern unsigned int RandomState[16];

namespace AVX
{

///////////////////////////////////////////////////////////////////////////////////////////
// constants {{{1
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {}
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {}
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
    : d(HV::load(IndexesFromZeroData<T>::address(), Aligned)) {}

template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::Zero() { return HT::zero(); }
template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::One() { return HT::one(); }
template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::IndexesFromZero() { return HV::load(IndexesFromZeroData<T>::address(), Aligned); }

template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(VC_ALIGNED_PARAMETER(Vector<OtherT>) x)
    : d(StaticCastHelper<OtherT, T>::cast(x.data())) {}

template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(EntryType x) : d(HT::set(x)) {}
template<> Vc_ALWAYS_INLINE Vector<double>::Vector(EntryType x) : d(_mm256_set1_pd(x)) {}

///////////////////////////////////////////////////////////////////////////////////////////
// load ctors {{{1
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }

///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem)
{
    load(mem, Aligned);
}
template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align)
{
    d.v() = HV::load(mem, align);
}
template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem)
{
    load(mem, Aligned);
}

// LoadHelper {{{2
template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;

// float {{{2
template<typename Flags> struct LoadHelper<float, double, Flags> {
    static m256 load(const double *mem, Flags f)
    {
        return concat(_mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[0], f)),
                      _mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[4], f)));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
    static m256 load(const unsigned int *mem, Flags f)
    {
        return StaticCastHelper<unsigned int, float>::cast(VectorHelper<m256i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, int, Flags> {
    static m256 load(const int *mem, Flags f)
    {
        return StaticCastHelper<int, float>::cast(VectorHelper<m256i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
    static m256 load(const unsigned short *mem, Flags f)
    {
        return StaticCastHelper<unsigned short, float>::cast(VectorHelper<m128i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, short, Flags> {
    static m256 load(const short *mem, Flags f)
    {
        return StaticCastHelper<short, float>::cast(VectorHelper<m128i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
    static m256 load(const unsigned
char *mem, Flags f) { return StaticCastHelper::cast(LoadHelper::load(mem, f)); } }; template struct LoadHelper { static m256 load(const signed char *mem, Flags f) { return StaticCastHelper::cast(LoadHelper::load(mem, f)); } }; template struct LoadHelper : public LoadHelper {}; // int {{{2 template struct LoadHelper { static m256i load(const unsigned int *mem, Flags f) { return VectorHelper::load(mem, f); } }; template struct LoadHelper { static m256i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epu16 = _mm_cvtepu8_epi16(epu8); return StaticCastHelper::cast(epu16); } }; template struct LoadHelper { static m256i load(const signed char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epi16 = _mm_cvtepi8_epi16(epi8); return StaticCastHelper::cast(epi16); } }; // unsigned int {{{2 template struct LoadHelper { static m256i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m256i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); const m128i epu16 = _mm_cvtepu8_epi16(epu8); return StaticCastHelper::cast(epu16); } }; // short {{{2 template struct LoadHelper { static m128i load(const unsigned short *mem, Flags f) { return StaticCastHelper::cast(VectorHelper::load(mem, f)); } }; template struct LoadHelper { static m128i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepu8_epi16(epu8); } }; template struct LoadHelper { static m128i load(const signed char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepi8_epi16(epi8); } }; // unsigned short {{{2 template struct LoadHelper { static m128i load(const unsigned char *mem, Flags) { // the only available streaming load loads 16 bytes - twice as much as we need => can't use // it, or we risk an out-of-bounds read and an unaligned load exception const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); return _mm_cvtepu8_epi16(epu8); } }; // general load, implemented via LoadHelper {{{2 template template Vc_INTRINSIC void Vector::load(const SrcT *x, Flags f) { d.v() = LoadHelper::load(x, f); } /////////////////////////////////////////////////////////////////////////////////////////// // zeroing {{{1 template Vc_INTRINSIC void 
Vector::setZero() { data() = HV::zero(); } template Vc_INTRINSIC void Vector::setZero(const Mask &k) { data() = HV::andnot_(avx_cast(k.data()), data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_pd(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) { data() = _mm256_or_pd(data(), k.dataD()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) { data() = _mm256_or_ps(data(), k.data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm256_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) { data() = _mm256_or_ps(data(), k.data()); } /////////////////////////////////////////////////////////////////////////////////////////// // stores {{{1 template Vc_INTRINSIC void Vector::store(EntryType *mem) const { HV::store(mem, data(), Aligned); } template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask) const { HV::store(mem, data(), avx_cast(mask.data()), Aligned); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, A align) const { HV::store(mem, data(), align); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask, A align) const { HV::store(mem, data(), avx_cast(mask.data()), align); } /////////////////////////////////////////////////////////////////////////////////////////// // expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX {{{1 template Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(a[0]) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data()))) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data()))) { } template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data()))) { } template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0] = *this; } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = _mm256_cvtps_pd(lo128(d.v())); x[1].data() = _mm256_cvtps_pd(hi128(d.v())); } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = concat(_mm_cvtepi16_epi32(d.v()), _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); } template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const { x[0].data() = concat(_mm_cvtepu16_epi32(d.v()), _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); } /////////////////////////////////////////////////////////////////////////////////////////// // swizzles {{{1 template Vc_INTRINSIC const Vector Vc_PURE &Vector::abcd() const { return *this; } template Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE 
Vector::bcda() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } template Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cdab() const { return Mem::shuffle128(data(), data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::badc() const { return Mem::permute(data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcad() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcda() const { return Mem::shuffle(data(), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dabc() const { return Mem::shuffle(Mem::shuffle128(data(), data()), data()); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::acbd() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dbca() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dcba() const { return cdab().badc(); } #define VC_SWIZZLES_16BIT_IMPL(T) \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcda() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } VC_SWIZZLES_16BIT_IMPL(short) VC_SWIZZLES_16BIT_IMPL(unsigned short) #undef VC_SWIZZLES_16BIT_IMPL /////////////////////////////////////////////////////////////////////////////////////////// // division {{{1 template inline Vector &Vector::operator/=(EntryType x) { if 
(HasVectorDivision) { return operator/=(Vector(x)); } for_all_vector_entries(i, d.m(i) /= x; ); return *this; } template template inline Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) Vector::operator/(TT x) const { if (HasVectorDivision) { return operator/(Vector(x)); } Vector r; for_all_vector_entries(i, r.d.m(i) = d.m(i) / x; ); return r; } // per default fall back to scalar division template inline Vector &Vector::operator/=(const Vector &x) { for_all_vector_entries(i, d.m(i) /= x.d.m(i); ); return *this; } template inline Vector Vc_PURE Vector::operator/(const Vector &x) const { Vector r; for_all_vector_entries(i, r.d.m(i) = d.m(i) / x.d.m(i); ); return r; } // specialize division on type static Vc_INTRINSIC m256i Vc_CONST divInt(param256i a, param256i b) { const m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); const m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); const m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); const m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); return concat( _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)) ); } template<> inline Vector &Vector::operator/=(const Vector &x) { d.v() = divInt(d.v(), x.d.v()); return *this; } template<> inline Vector Vc_PURE Vector::operator/(const Vector &x) const { return divInt(d.v(), x.d.v()); } static inline m256i Vc_CONST divUInt(param256i a, param256i b) { m256d loa = _mm256_cvtepi32_pd(lo128(a)); m256d hia = _mm256_cvtepi32_pd(hi128(a)); m256d lob = _mm256_cvtepi32_pd(lo128(b)); m256d hib = _mm256_cvtepi32_pd(hi128(b)); // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. a-2^32) // to get the right number back we have to add 2^32 where a >= 2^31 loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); // we don't do the same for b because division by b >= 2^31 should be a seldom corner case and // we rather want the standard stuff fast // // there is one remaining problem: a >= 2^31 and b == 1 // in that case the return value would be 2^31 return avx_cast(_mm256_blendv_ps(avx_cast(concat( _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)) )), avx_cast(a), avx_cast(concat( _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()), _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32()))))); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divUInt(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divUInt(d.v(), x.d.v()); } template static inline m128i Vc_CONST divShort(param128i a, param128i b) { const m256 r = _mm256_div_ps(StaticCastHelper::cast(a), StaticCastHelper::cast(b)); return StaticCastHelper::cast(r); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divShort(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divShort(d.v(), x.d.v()); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = divShort(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const { return divShort(d.v(), x.d.v()); } template<> Vc_INTRINSIC float_v &float_v::operator/=(const float_v &x) { d.v() = _mm256_div_ps(d.v(), x.d.v()); return *this; } template<> 
Vc_INTRINSIC float_v Vc_PURE float_v::operator/(const float_v &x) const { return _mm256_div_ps(d.v(), x.d.v()); } template<> Vc_INTRINSIC sfloat_v &sfloat_v::operator/=(const sfloat_v &x) { d.v() = _mm256_div_ps(d.v(), x.d.v()); return *this; } template<> Vc_INTRINSIC sfloat_v Vc_PURE sfloat_v::operator/(const sfloat_v &x) const { return _mm256_div_ps(d.v(), x.d.v()); } template<> Vc_INTRINSIC double_v &double_v::operator/=(const double_v &x) { d.v() = _mm256_div_pd(d.v(), x.d.v()); return *this; } template<> Vc_INTRINSIC double_v Vc_PURE double_v::operator/(const double_v &x) const { return _mm256_div_pd(d.v(), x.d.v()); } /////////////////////////////////////////////////////////////////////////////////////////// // integer ops {{{1 #define OP_IMPL(T, symbol) \ template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) \ { \ for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \ return *this; \ } \ template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const \ { \ Vector r; \ for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \ return r; \ } OP_IMPL(int, <<) OP_IMPL(int, >>) OP_IMPL(unsigned int, <<) OP_IMPL(unsigned int, >>) OP_IMPL(short, <<) OP_IMPL(short, >>) OP_IMPL(unsigned short, <<) OP_IMPL(unsigned short, >>) #undef OP_IMPL template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { d.v() = VectorHelper::shiftRight(d.v(), shift); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { return VectorHelper::shiftRight(d.v(), shift); } template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { d.v() = VectorHelper::shiftLeft(d.v(), shift); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { return VectorHelper::shiftLeft(d.v(), shift); } #define OP_IMPL(T, symbol, fun) \ template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \ template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const { return Vector(HV::fun(d.v(), x.d.v())); } OP_IMPL(int, &, and_) OP_IMPL(int, |, or_) OP_IMPL(int, ^, xor_) OP_IMPL(unsigned int, &, and_) OP_IMPL(unsigned int, |, or_) OP_IMPL(unsigned int, ^, xor_) OP_IMPL(short, &, and_) OP_IMPL(short, |, or_) OP_IMPL(short, ^, xor_) OP_IMPL(unsigned short, &, and_) OP_IMPL(unsigned short, |, or_) OP_IMPL(unsigned short, ^, xor_) OP_IMPL(float, &, and_) OP_IMPL(float, |, or_) OP_IMPL(float, ^, xor_) OP_IMPL(sfloat, &, and_) OP_IMPL(sfloat, |, or_) OP_IMPL(sfloat, ^, xor_) OP_IMPL(double, &, and_) OP_IMPL(double, |, or_) OP_IMPL(double, ^, xor_) #undef OP_IMPL // operators {{{1 #include "../common/operators.h" // isNegative {{{1 template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const { return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); } template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const { return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); } template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const { return Mem::permute(avx_cast( _mm256_srai_epi32(avx_cast(_mm256_and_pd(_mm256_setsignmask_pd(), d.v())), 31) )); } // gathers {{{1 // Better implementation (hopefully) with _mm256_set_ //X template template Vector::Vector(const EntryType *mem, const Index *indexes) //X { //X for_all_vector_entries(int i, //X d.m(i) = mem[indexes[i]]; //X ); //X } template template 
Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, member2, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, member2, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { gather(array, ptrMember1, outerIndexes, innerIndexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) : d(HT::zero()) { gather(array, ptrMember1, outerIndexes, innerIndexes, mask); } template struct IndexSizeChecker { static void check() {} }; template struct IndexSizeChecker, Size> { static void check() { VC_STATIC_ASSERT(Vector::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries); } }; template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(mem[indexes[0]], 
mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } #ifdef VC_USE_SET_GATHERS template template Vc_ALWAYS_INLINE void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) { IndexSizeChecker, Size>::check(); Vector indexesTmp = indexes; indexesTmp.setZero(!mask); (*this)(mask) = Vector(mem, indexesTmp); } #endif #ifdef VC_USE_BSF_GATHERS #define VC_MASKED_GATHER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits &= ~(1 << i); /* btr? */ \ d.m(i) = ith_value(i); \ } #elif defined(VC_USE_POPCNT_BSF_GATHERS) #define VC_MASKED_GATHER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (_mm_popcnt_u32(bits)) { \ case 8: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 6: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 4: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 2: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ case 1: \ low = _bit_scan_forward(bits); \ d.m(low) = ith_value(low); \ case 0: \ break; \ } #else #define VC_MASKED_GATHER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) d.m(i) = ith_value(i); \ ); #endif template template Vc_INTRINSIC void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (mem[indexes[_i_]]) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = 
_mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), 
array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) 
innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template 
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) { IndexSizeChecker::check(); IndexSizeChecker::check(); #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_GATHER #undef ith_value } #undef VC_MASKED_GATHER #ifdef VC_USE_BSF_SCATTERS #define VC_MASKED_SCATTER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits ^= (1 << i); /* btr? */ \ ith_value(i) = d.m(i); \ } #elif defined(VC_USE_POPCNT_BSF_SCATTERS) #define VC_MASKED_SCATTER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (_mm_popcnt_u32(bits)) { \ case 8: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 6: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 4: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 2: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ case 1: \ low = _bit_scan_forward(bits); \ ith_value(low) = d.m(low); \ case 0: \ break; \ } #else #define VC_MASKED_SCATTER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) ith_value(i) = d.m(i); \ ); #endif template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { for_all_vector_entries(i, mem[indexes[i]] = d.m(i); ); } #if defined(VC_MSVC) && VC_MSVC >= 170000000 // MSVC miscompiles the store mem[indexes[1]] = d.m(1) for T = (u)short template<> template Vc_ALWAYS_INLINE void short_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { const unsigned int tmp = d.v()._d.m128i_u32[0]; mem[indexes[0]] = tmp & 0xffff; mem[indexes[1]] = tmp >> 16; mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); mem[indexes[3]] = _mm_extract_epi16(d.v(), 3); mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); mem[indexes[6]] = _mm_extract_epi16(d.v(), 6); mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); } template<> template Vc_ALWAYS_INLINE void ushort_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { const unsigned int tmp = d.v()._d.m128i_u32[0]; mem[indexes[0]] = tmp & 0xffff; mem[indexes[1]] = tmp >> 16; mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); mem[indexes[3]] = _mm_extract_epi16(d.v(), 
3); mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); mem[indexes[6]] = _mm_extract_epi16(d.v(), 6); mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); } #endif template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const { #define ith_value(_i_) mem[indexes[_i_]] VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1).*(member2) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const { for_all_vector_entries(i, (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const { #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_SCATTER #undef ith_value } /////////////////////////////////////////////////////////////////////////////////////////// // operator- {{{1 template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_sign_epi16(d.v(), _mm_setallone_si128()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_sign_epi16(d.v(), _mm_setallone_si128()); } /////////////////////////////////////////////////////////////////////////////////////////// // horizontal ops {{{1 template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::min(MaskArg m) const { Vector tmp = std::numeric_limits >::max(); tmp(m) = *this; return tmp.min(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::max(MaskArg m) const { Vector tmp = std::numeric_limits >::min(); tmp(m) = *this; return tmp.max(); } 
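// Illustration only (not part of the Vc API; maskedMinExample is a made-up name): the masked
// reductions above fill a temporary with the identity element of the reduction (numeric max
// for min(), numeric min for max(), 1 for product(), 0 for sum()), overwrite the active lanes
// through the mask, and then run the unmasked reduction. A minimal sketch of the same idea
// written directly with AVX intrinsics:
static inline float maskedMinExample(m256 v, m256 mask)
{
    // lanes where the mask is zero contribute the identity element instead of v
    const m256 identity = _mm256_set1_ps(std::numeric_limits<float>::max());
    const m256 tmp = _mm256_blendv_ps(identity, v, mask);
    float a[8];
    _mm256_storeu_ps(a, tmp);
    float result = a[0];
    for (int i = 1; i < 8; ++i) {
        result = a[i] < result ? a[i] : result;
    }
    return result;
}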
template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::product(MaskArg m) const { Vector tmp(VectorSpecialInitializerOne::One); tmp(m) = *this; return tmp.product(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::sum(MaskArg m) const { Vector tmp(VectorSpecialInitializerZero::Zero); tmp(m) = *this; return tmp.sum(); }//}}} // copySign {{{1 template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { return _mm256_or_ps( _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) ); } template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { return _mm256_or_ps( _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) ); } template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const { return _mm256_or_pd( _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()), _mm256_and_pd(d.v(), _mm256_setabsmask_pd()) ); }//}}}1 // exponent {{{1 template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vector Vector::exponent() const { VC_ASSERT((*this >= 0.).isFull()); return Internal::exponent(d.v()); } // }}}1 // Random {{{1 static Vc_ALWAYS_INLINE void _doRandomStep(Vector &state0, Vector &state1) { state0.load(&Vc::RandomState[0]); state1.load(&Vc::RandomState[uint_v::Size]); (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]); } template Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return state0.reinterpretCast >(); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { const m256i state = VectorHelper::load(&Vc::RandomState[0], Vc::Aligned); for (size_t k = 0; k < 8; k += 2) { typedef unsigned long long uint64 Vc_MAY_ALIAS; const uint64 stateX = *reinterpret_cast(&Vc::RandomState[k]); *reinterpret_cast(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11); } return (Vector(_cast(_mm256_srli_epi64(state, 12))) | One()) - One(); } // }}}1 // shifted / rotated {{{1 template struct VectorShift; template<> struct VectorShift<32, 4, m256d, double> { static Vc_INTRINSIC m256d shifted(param256d v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(double))); case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(double))); case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(double))); case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(double))); case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(double))); case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(double))); } return _mm256_setzero_pd(); } }; template struct VectorShift<32, 8, VectorType, EntryType> { typedef typename SseVectorType::Type 
SmallV; static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(EntryType))); case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(EntryType))); case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(EntryType))); case 4: return avx_cast(_mm256_srli_si256(avx_cast(v), 4 * sizeof(EntryType))); case 5: return avx_cast(_mm256_srli_si256(avx_cast(v), 5 * sizeof(EntryType))); case 6: return avx_cast(_mm256_srli_si256(avx_cast(v), 6 * sizeof(EntryType))); case 7: return avx_cast(_mm256_srli_si256(avx_cast(v), 7 * sizeof(EntryType))); case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(EntryType))); case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(EntryType))); case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(EntryType))); case -4: return avx_cast(_mm256_slli_si256(avx_cast(v), 4 * sizeof(EntryType))); case -5: return avx_cast(_mm256_slli_si256(avx_cast(v), 5 * sizeof(EntryType))); case -6: return avx_cast(_mm256_slli_si256(avx_cast(v), 6 * sizeof(EntryType))); case -7: return avx_cast(_mm256_slli_si256(avx_cast(v), 7 * sizeof(EntryType))); } return avx_cast(_mm256_setzero_ps()); } }; template struct VectorShift<16, 8, VectorType, EntryType> { enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (amount) { case 0: return v; case 1: return avx_cast(_mm_srli_si128(avx_cast(v), 1 * EntryTypeSizeof)); case 2: return avx_cast(_mm_srli_si128(avx_cast(v), 2 * EntryTypeSizeof)); case 3: return avx_cast(_mm_srli_si128(avx_cast(v), 3 * EntryTypeSizeof)); case 4: return avx_cast(_mm_srli_si128(avx_cast(v), 4 * EntryTypeSizeof)); case 5: return avx_cast(_mm_srli_si128(avx_cast(v), 5 * EntryTypeSizeof)); case 6: return avx_cast(_mm_srli_si128(avx_cast(v), 6 * EntryTypeSizeof)); case 7: return avx_cast(_mm_srli_si128(avx_cast(v), 7 * EntryTypeSizeof)); case -1: return avx_cast(_mm_slli_si128(avx_cast(v), 1 * EntryTypeSizeof)); case -2: return avx_cast(_mm_slli_si128(avx_cast(v), 2 * EntryTypeSizeof)); case -3: return avx_cast(_mm_slli_si128(avx_cast(v), 3 * EntryTypeSizeof)); case -4: return avx_cast(_mm_slli_si128(avx_cast(v), 4 * EntryTypeSizeof)); case -5: return avx_cast(_mm_slli_si128(avx_cast(v), 5 * EntryTypeSizeof)); case -6: return avx_cast(_mm_slli_si128(avx_cast(v), 6 * EntryTypeSizeof)); case -7: return avx_cast(_mm_slli_si128(avx_cast(v), 7 * EntryTypeSizeof)); } return _mm_setzero_si128(); } }; template Vc_INTRINSIC Vector Vector::shifted(int amount) const { return VectorShift::shifted(d.v(), amount); } template struct VectorRotate; template struct VectorRotate<32, 4, VectorType, EntryType> { typedef typename SseVectorType::Type SmallV; enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { const m128i vLo = avx_cast(lo128(v)); const m128i vHi = avx_cast(hi128(v)); switch (static_cast(amount) % 4) { case 0: return v; case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); case 2: return Mem::permute128(v); case 3: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); } return _mm256_setzero_pd(); } }; template struct VectorRotate<32, 8, VectorType, EntryType> { 
typedef typename SseVectorType::Type SmallV; enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { const m128i vLo = avx_cast(lo128(v)); const m128i vHi = avx_cast(hi128(v)); switch (static_cast(amount) % 8) { case 0: return v; case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); case 2: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof))); case 3: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof))); case 4: return Mem::permute128(v); case 5: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); case 6: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof))); case 7: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof))); } return avx_cast(_mm256_setzero_ps()); } }; template struct VectorRotate<16, 8, VectorType, EntryType> { enum { EntryTypeSizeof = sizeof(EntryType) }; static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) { switch (static_cast(amount) % 8) { case 0: return v; case 1: return avx_cast(_mm_alignr_epi8(v, v, 1 * EntryTypeSizeof)); case 2: return avx_cast(_mm_alignr_epi8(v, v, 2 * EntryTypeSizeof)); case 3: return avx_cast(_mm_alignr_epi8(v, v, 3 * EntryTypeSizeof)); case 4: return avx_cast(_mm_alignr_epi8(v, v, 4 * EntryTypeSizeof)); case 5: return avx_cast(_mm_alignr_epi8(v, v, 5 * EntryTypeSizeof)); case 6: return avx_cast(_mm_alignr_epi8(v, v, 6 * EntryTypeSizeof)); case 7: return avx_cast(_mm_alignr_epi8(v, v, 7 * EntryTypeSizeof)); } return _mm_setzero_si128(); } }; template Vc_INTRINSIC Vector Vector::rotated(int amount) const { return VectorRotate::rotated(d.v(), amount); /* const m128i v0 = avx_cast(d.v()[0]); const m128i v1 = avx_cast(d.v()[1]); switch (static_cast(amount) % Size) { case 0: return *this; case 1: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType)))); case 2: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType)))); case 3: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType)))); case 4: return concat(d.v()[1], d.v()[0]); case 5: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType)))); case 6: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType)))); case 7: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType)))); } */ } // }}}1 } // namespace AVX } // namespace Vc } // namespace ROOT #include "undomacros.h" // vim: foldmethod=marker
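// Illustration only (not part of Vc; divUIntSketch is a made-up name, and the intrinsic types
// come from the headers already pulled in above): the unsigned-integer division in divUInt
// converts the 32-bit lanes to double with the *signed* conversion, so any input >= 2^31 comes
// out negative (value - 2^32). Adding 2^32 to exactly those lanes restores the unsigned value
// before the double division; the quotient is then truncated back to 32-bit lanes. The b == 1
// and b >= 2^31 corner cases handled in divUInt are left out of this sketch.
static inline __m128i divUIntSketch(__m128i a, __m128i b)
{
    __m256d da = _mm256_cvtepi32_pd(a);       // signed conversion: a >= 2^31 becomes a - 2^32
    const __m256d db = _mm256_cvtepi32_pd(b); // b is assumed to be < 2^31 here
    // add 2^32 to the lanes that came out negative to recover the unsigned value
    const __m256d wrapped = _mm256_cmp_pd(da, _mm256_setzero_pd(), _CMP_LT_OS);
    da = _mm256_add_pd(da, _mm256_and_pd(wrapped, _mm256_set1_pd(4294967296.)));
    // divide in double precision and truncate back to 32-bit integer lanes
    return _mm256_cvttpd_epi32(_mm256_div_pd(da, db));
}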