Changeset 4957 for icGREP


Ignore:
Timestamp:
Mar 6, 2016, 8:30:36 PM (3 years ago)
Author:
cameron
Message:

Alternative transposition strategies with AVX2

Location:
icGREP/icgrep-devel/icgrep
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IDISA/idisa_avx_builder.cpp

    r4956 r4957  
    7070        Value * shufa = CreateShuffleVector(aVec, aVec, ConstantVector::get(Idxs));
    7171        Value * shufb = CreateShuffleVector(bVec, bVec, ConstantVector::get(Idxs));
    72         Value * pk = hsimd_packh(128, shufa, shufb);
    73         return pk;
     72        return hsimd_packh(mBitBlockWidth/2, shufa, shufb);
    7473    }
    7574    else {
     
    102101        Value * shufa = CreateShuffleVector(aVec, aVec, ConstantVector::get(Idxs));
    103102        Value * shufb = CreateShuffleVector(bVec, bVec, ConstantVector::get(Idxs));
    104         return hsimd_packl(128, shufa, shufb);
     103        return hsimd_packl(mBitBlockWidth/2, shufa, shufb);
    105104    }
    106105    else {
     
    112111    }
    113112}
    114 }
     113   
     114Value * IDISA_AVX2_Builder::esimd_mergeh(unsigned fw, Value * a, Value * b) {
     115    if ((fw == 128) && (mBitBlockWidth == 256)) {
     116        Value * vperm2i128func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_avx2_vperm2i128);
     117        return CreateCall3(vperm2i128func, fwCast(64, a), fwCast(64, b), getInt8(0x31));
     118    }
     119    unsigned field_count = mBitBlockWidth/fw;
     120    Value * aVec = fwCast(fw, a);
     121    Value * bVec = fwCast(fw, b);
     122    std::vector<Constant*> Idxs;
     123    for (unsigned i = field_count/2; i < field_count; i++) {
     124        Idxs.push_back(getInt32(i));    // selects elements from first reg.
     125        Idxs.push_back(getInt32(i + field_count)); // selects elements from second reg.
     126    }
     127    return CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     128}
     129
     130Value * IDISA_AVX2_Builder::esimd_mergel(unsigned fw, Value * a, Value * b) {
     131    if ((fw == 128) && (mBitBlockWidth == 256)) {
     132        Value * vperm2i128func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_avx2_vperm2i128);
     133        return CreateCall3(vperm2i128func, fwCast(64, a), fwCast(64, b), getInt8(0x20));
     134    }
     135    unsigned field_count = mBitBlockWidth/fw;
     136    Value * aVec = fwCast(fw, a);
     137    Value * bVec = fwCast(fw, b);
     138    std::vector<Constant*> Idxs;
     139    for (unsigned i = 0; i < field_count/2; i++) {
     140        Idxs.push_back(getInt32(i));    // selects elements from first reg.
     141        Idxs.push_back(getInt32(i + field_count)); // selects elements from second reg.
     142    }
     143    return CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     144}
     145
     146Value * IDISA_AVX2_Builder::hsimd_packl_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b) {
     147    if ((fw == 16)  && (lanes == 2)) {
     148        Value * vpackuswbfunc = Intrinsic::getDeclaration(mMod, Intrinsic::x86_avx2_packuswb);
     149        Value * a_low = fwCast(16, simd_and(a, simd_lomask(fw)));
     150        Value * b_low = fwCast(16, simd_and(b, simd_lomask(fw)));
     151        Value * pack = CreateCall2(vpackuswbfunc, a_low, b_low);
     152        return pack;
     153    }
     154    unsigned fw_out = fw/2;
     155    unsigned fields_per_lane = mBitBlockWidth/(fw_out * lanes);
     156    unsigned field_offset_for_b = mBitBlockWidth/fw_out;
     157    Value * aVec = fwCast(fw_out, a);
     158    Value * bVec = fwCast(fw_out, b);
     159    std::vector<Constant*> Idxs;
     160    for (unsigned lane = 0; lane < lanes; lane++) {
     161        unsigned first_field_in_lane = lane * fields_per_lane; // every second field
     162        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     163            Idxs.push_back(getInt32(first_field_in_lane + 2*i));
     164        }
     165        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     166            Idxs.push_back(getInt32(field_offset_for_b + first_field_in_lane + 2*i));
     167        }
     168    }
     169    Value * pack = CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     170    return pack;
     171}
     172
     173Value * IDISA_AVX2_Builder::hsimd_packh_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b) {
     174    if ((fw == 16)  && (lanes == 2)) {
     175        Value * vpackuswbfunc = Intrinsic::getDeclaration(mMod, Intrinsic::x86_avx2_packuswb);
     176        Value * a_low = simd_srli(fw, a, fw/2);
     177        Value * b_low = simd_srli(fw, b, fw/2);
     178        Value * pack = CreateCall2(vpackuswbfunc, a_low, b_low);
     179        return pack;
     180    }
     181    unsigned fw_out = fw/2;
     182    unsigned fields_per_lane = mBitBlockWidth/(fw_out * lanes);
     183    unsigned field_offset_for_b = mBitBlockWidth/fw_out;
     184    Value * aVec = fwCast(fw_out, a);
     185    Value * bVec = fwCast(fw_out, b);
     186    std::vector<Constant*> Idxs;
     187    for (unsigned lane = 0; lane < lanes; lane++) {
     188        unsigned first_field_in_lane = lane * fields_per_lane; // every second field
     189        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     190            Idxs.push_back(getInt32(first_field_in_lane + 2*i));
     191        }
     192        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     193            Idxs.push_back(getInt32(field_offset_for_b + first_field_in_lane + 2*i));
     194        }
     195    }
     196    Value * pack = CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     197    return pack;
     198}
     199   
     200}
  • icGREP/icgrep-devel/icgrep/IDISA/idisa_avx_builder.h

    r4955 r4957  
    3030    Value * hsimd_packh(unsigned fw, Value * a, Value * b) override;
    3131    Value * hsimd_packl(unsigned fw, Value * a, Value * b) override;
     32    Value * esimd_mergeh(unsigned fw, Value * a, Value * b) override;
     33    Value * esimd_mergel(unsigned fw, Value * a, Value * b) override;
     34    Value * hsimd_packh_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b) override;
     35    Value * hsimd_packl_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b) override;
     36
    3237    ~IDISA_AVX2_Builder() {};
    3338};
  • icGREP/icgrep-devel/icgrep/IDISA/idisa_builder.cpp

    r4943 r4957  
    154154Value * IDISA_Builder::simd_if(unsigned fw, Value * cond, Value * a, Value * b) {
    155155    if (fw == 1) {
     156        Value * a1 = bitCast(a);
     157        Value * b1 = bitCast(b);
    156158        Value * c = bitCast(cond);
    157         return CreateOr(CreateAnd(c, bitCast(a)), CreateAnd(CreateNot(c), bitCast(b)));
     159        return CreateOr(CreateAnd(a1, c), CreateAnd(CreateXor(c, b1), b1));
    158160    }
    159161    else {
     
    234236}
    235237
     238   
     239Value * IDISA_Builder::hsimd_packh_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b) {
     240    unsigned fw_out = fw/2;
     241    unsigned fields_per_lane = mBitBlockWidth/(fw_out * lanes);
     242    unsigned field_offset_for_b = mBitBlockWidth/fw_out;
     243    Value * aVec = fwCast(fw_out, a);
     244    Value * bVec = fwCast(fw_out, b);
     245    std::vector<Constant*> Idxs;
     246    for (unsigned lane = 0; lane < lanes; lane++) {
     247        unsigned first_field_in_lane = lane * fields_per_lane; // every second field
     248        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     249            Idxs.push_back(getInt32(first_field_in_lane + 2*i + 1));
     250        }
     251        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     252            Idxs.push_back(getInt32(field_offset_for_b + first_field_in_lane + 2*i + 1));
     253        }
     254    }
     255    Value * pack = CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     256    return pack;
     257}
     258
     259Value * IDISA_Builder::hsimd_packl_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b) {
     260    unsigned fw_out = fw/2;
     261    unsigned fields_per_lane = mBitBlockWidth/(fw_out * lanes);
     262    unsigned field_offset_for_b = mBitBlockWidth/fw_out;
     263    Value * aVec = fwCast(fw_out, a);
     264    Value * bVec = fwCast(fw_out, b);
     265    std::vector<Constant*> Idxs;
     266    for (unsigned lane = 0; lane < lanes; lane++) {
     267        unsigned first_field_in_lane = lane * fields_per_lane; // every second field
     268        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     269            Idxs.push_back(getInt32(first_field_in_lane + 2*i));
     270        }
     271        for (unsigned i = 0; i < fields_per_lane/2; i++) {
     272            Idxs.push_back(getInt32(field_offset_for_b + first_field_in_lane + 2*i));
     273        }
     274    }
     275    Value * pack = CreateShuffleVector(aVec, bVec, ConstantVector::get(Idxs));
     276    return pack;
     277}
     278
     279   
    236280Value * IDISA_Builder::hsimd_signmask(unsigned fw, Value * a) {
    237281    Value * mask = CreateICmpSLT(fwCast(fw, a), ConstantAggregateZero::get(fwVectorType(fw)));
  • icGREP/icgrep-devel/icgrep/IDISA/idisa_builder.h

    r4922 r4957  
    7070    virtual Value * hsimd_packh(unsigned fw, Value * a, Value * b);
    7171    virtual Value * hsimd_packl(unsigned fw, Value * a, Value * b);
     72    virtual Value * hsimd_packh_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b);
     73    virtual Value * hsimd_packl_in_lanes(unsigned lanes, unsigned fw, Value * a, Value * b);
     74
    7275    virtual Value * hsimd_signmask(unsigned fw, Value * a);
    7376
  • icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp

    r4939 r4957  
    77#include <iostream>
    88
     9const int PACK_LANES = 1;
    910
    1011void s2p_step(IDISA::IDISA_Builder * iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
    11     Value * t0 = iBuilder->hsimd_packh(16, s0, s1);
    12     Value * t1 = iBuilder->hsimd_packl(16, s0, s1);
     12    Value * t0 = nullptr;
     13    Value * t1 = nullptr;
     14    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
     15        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
     16        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
     17        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1);
     18        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
     19    }
     20    else {
     21        t0 = iBuilder->hsimd_packh(16, s0, s1);
     22        t1 = iBuilder->hsimd_packl(16, s0, s1);
     23    }
    1324    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
    1425    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
Note: See TracChangeset for help on using the changeset viewer.