Ignore:
Timestamp:
Mar 6, 2016, 8:30:36 PM (3 years ago)
Author:
cameron
Message:

Alternative transposition strategies with AVX2

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IDISA/idisa_avx_builder.cpp

    r4956 r4957  
    7070        Value * shufa = CreateShuffleVector(aVec, aVec, ConstantVector::get(Idxs));
    7171        Value * shufb = CreateShuffleVector(bVec, bVec, ConstantVector::get(Idxs));
    72         Value * pk = hsimd_packh(128, shufa, shufb);
    73         return pk;
     72        return hsimd_packh(mBitBlockWidth/2, shufa, shufb);
    7473    }
    7574    else {
     
    102101        Value * shufa = CreateShuffleVector(aVec, aVec, ConstantVector::get(Idxs));
    103102        Value * shufb = CreateShuffleVector(bVec, bVec, ConstantVector::get(Idxs));
    104         return hsimd_packl(128, shufa, shufb);
     103        return hsimd_packl(mBitBlockWidth/2, shufa, shufb);
    105104    }
    106105    else {
     
    112111    }
    113112}
    114 }
     113   
     114Value * IDISA_AVX2_Builder::esimd_mergeh(unsigned fw, Value * a, Value * b) {
     115    if ((fw == 128) && (mBitBlockWidth == 256)) {
     116        Value * vperm2i128func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_avx2_vperm2i128);
     117        return CreateCall3(vperm2i128func, fwCast(64, a), fwCast(64, b), getInt8(0x31));
     118    }
     119    unsigned field_count = mBitBlockWidth/fw;
     120    Value * aVec = fwCast(fw, a);
     121    Value * bVec = fwCast(fw, b);
     122    std::vector<Constant*> Idxs;
     123    for (unsigned i = field_count/2; i < field_count; i++) {
     124        Idxs.push_back(getInt32(i));    // selects elements from first reg.
     125        Idxs.push_back(getInt32(i + field_count)); // selects elements from second reg.
     126    }
     127    return CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     128}
     129
     130Value * IDISA_AVX2_Builder::esimd_mergel(unsigned fw, Value * a, Value * b) {
     131    if ((fw == 128) && (mBitBlockWidth == 256)) {
     132        Value * vperm2i128func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_avx2_vperm2i128);
     133        return CreateCall3(vperm2i128func, fwCast(64, a), fwCast(64, b), getInt8(0x20));
     134    }
     135    unsigned field_count = mBitBlockWidth/fw;
     136    Value * aVec = fwCast(fw, a);
     137    Value * bVec = fwCast(fw, b);
     138    std::vector<Constant*> Idxs;
     139    for (unsigned i = 0; i < field_count/2; i++) {
     140        Idxs.push_back(getInt32(i));    // selects elements from first reg.
     141        Idxs.push_back(getInt32(i + field_count)); // selects elements from second reg.
     142    }
     143    return CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     144}
     145
     146Value * IDISA_AVX2_Builder::hsimd_packl_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b) {
     147    if ((fw == 16)  && (lanes == 2)) {
     148        Value * vpackuswbfunc = Intrinsic::getDeclaration(mMod, Intrinsic::x86_avx2_packuswb);
     149        Value * a_low = fwCast(16, simd_and(a, simd_lomask(fw)));
     150        Value * b_low = fwCast(16, simd_and(b, simd_lomask(fw)));
     151        Value * pack = CreateCall2(vpackuswbfunc, a_low, b_low);
     152        return pack;
     153    }
     154    unsigned fw_out = fw/2;
     155    unsigned fields_per_lane = mBitBlockWidth/(fw_out * lanes);
     156    unsigned field_offset_for_b = mBitBlockWidth/fw_out;
     157    Value * aVec = fwCast(fw_out, a);
     158    Value * bVec = fwCast(fw_out, b);
     159    std::vector<Constant*> Idxs;
     160    for (unsigned lane = 0; lane < lanes; lane++) {
     161        unsigned first_field_in_lane = lane * fields_per_lane; // every second field
     162        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     163            Idxs.push_back(getInt32(first_field_in_lane + 2*i));
     164        }
     165        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     166            Idxs.push_back(getInt32(field_offset_for_b + first_field_in_lane + 2*i));
     167        }
     168    }
     169    Value * pack = CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     170    return pack;
     171}
     172
     173Value * IDISA_AVX2_Builder::hsimd_packh_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b) {
     174    if ((fw == 16)  && (lanes == 2)) {
     175        Value * vpackuswbfunc = Intrinsic::getDeclaration(mMod, Intrinsic::x86_avx2_packuswb);
     176        Value * a_low = simd_srli(fw, a, fw/2);
     177        Value * b_low = simd_srli(fw, b, fw/2);
     178        Value * pack = CreateCall2(vpackuswbfunc, a_low, b_low);
     179        return pack;
     180    }
     181    unsigned fw_out = fw/2;
     182    unsigned fields_per_lane = mBitBlockWidth/(fw_out * lanes);
     183    unsigned field_offset_for_b = mBitBlockWidth/fw_out;
     184    Value * aVec = fwCast(fw_out, a);
     185    Value * bVec = fwCast(fw_out, b);
     186    std::vector<Constant*> Idxs;
     187    for (unsigned lane = 0; lane < lanes; lane++) {
     188        unsigned first_field_in_lane = lane * fields_per_lane; // every second field
     189        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     190            Idxs.push_back(getInt32(first_field_in_lane + 2*i));
     191        }
     192        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     193            Idxs.push_back(getInt32(field_offset_for_b + first_field_in_lane + 2*i));
     194        }
     195    }
     196    Value * pack = CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     197    return pack;
     198}
     199   
     200}
Note: See TracChangeset for help on using the changeset viewer.