Changeset 6108


Ignore:
Timestamp:
Jun 23, 2018, 8:12:13 AM (3 weeks ago)
Author:
cameron
Message:

IDISA builder progress

Location:
icGREP/icgrep-devel/icgrep/IR_Gen
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r6103 r6108  
    100100
    101101Value * IDISA_AVX2_Builder::esimd_mergeh(unsigned fw, Value * a, Value * b) {
    102     if ((fw == 1) || (fw == 2)) {
    103         // Bit interleave using shuffle.   
    104         Value * shufFn = Intrinsic::getDeclaration(getModule(),  Intrinsic::x86_avx2_pshuf_b);
    105         // Make a shuffle table that translates the lower 4 bits of each byte in
    106         // order to spread out the bits: xxxxdcba => .d.c.b.a
    107         // We use two copies of the table for the AVX2 _mm256_shuffle_epi8
    108         Constant * interleave_table = bit_interleave_byteshuffle_table(fw);
    109         // Merge the bytes.
    110         Value * byte_merge = esimd_mergeh(8, a, b);
    111         Value * low_bits = CreateCall(shufFn, {interleave_table,  fwCast(8, simd_and(byte_merge, simd_lomask(8)))});
    112         Value * high_bits = simd_slli(16, CreateCall(shufFn, {interleave_table, fwCast(8, simd_srli(8, byte_merge, 4))}), fw);
    113         // For each 16-bit field, interleave the low bits of the two bytes.
    114         low_bits = simd_or(simd_and(low_bits, simd_lomask(16)), simd_srli(16, low_bits, 8-fw));
    115         // For each 16-bit field, interleave the high bits of the two bytes.
    116         high_bits = simd_or(simd_and(high_bits, simd_himask(16)), simd_slli(16, high_bits, 8-fw));
    117         return simd_or(low_bits, high_bits);
    118     }
     102    if (getVectorBitWidth(a) == mNativeBitBlockWidth) {
     103        if ((fw == 1) || (fw == 2)) {
     104            // Bit interleave using shuffle.
     105            Value * shufFn = Intrinsic::getDeclaration(getModule(),  Intrinsic::x86_avx2_pshuf_b);
     106            // Make a shuffle table that translates the lower 4 bits of each byte in
     107            // order to spread out the bits: xxxxdcba => .d.c.b.a
     108            // We use two copies of the table for the AVX2 _mm256_shuffle_epi8
     109            Constant * interleave_table = bit_interleave_byteshuffle_table(fw);
     110            // Merge the bytes.
     111            Value * byte_merge = esimd_mergeh(8, a, b);
     112            Value * low_bits = CreateCall(shufFn, {interleave_table,  fwCast(8, simd_select_lo(8, byte_merge))});
     113            Value * high_bits = simd_slli(16, CreateCall(shufFn, {interleave_table, fwCast(8, simd_srli(8, byte_merge, 4))}), fw);
     114            // For each 16-bit field, interleave the low bits of the two bytes.
     115            low_bits = simd_or(simd_select_lo(16, low_bits), simd_srli(16, low_bits, 8-fw));
     116            // For each 16-bit field, interleave the high bits of the two bytes.
     117            high_bits = simd_or(simd_select_hi(16, high_bits), simd_slli(16, high_bits, 8-fw));
     118            return simd_or(low_bits, high_bits);
     119        }
    119120#if LLVM_VERSION_INTEGER < LLVM_VERSION_CODE(6, 0, 0)
    120     if ((fw == 128) && (mBitBlockWidth == 256)) {
    121         Value * vperm2i128func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_vperm2i128);
    122         return CreateCall(vperm2i128func, {fwCast(64, a), fwCast(64, b), getInt8(0x31)});
    123     }
     121        if (fw == 128) {
     122            Value * vperm2i128func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_vperm2i128);
     123            return CreateCall(vperm2i128func, {fwCast(64, a), fwCast(64, b), getInt8(0x31)});
     124        }
    124125#endif
     126    }
    125127    // Otherwise use default SSE logic.
    126128    return IDISA_SSE_Builder::esimd_mergeh(fw, a, b);
     
    128130
    129131Value * IDISA_AVX2_Builder::esimd_mergel(unsigned fw, Value * a, Value * b) {
    130     if ((fw == 1) || (fw == 2)) {
    131         // Bit interleave using shuffle.   
    132         Value * shufFn = Intrinsic::getDeclaration(getModule(),  Intrinsic::x86_avx2_pshuf_b);
    133         // Make a shuffle table that translates the lower 4 bits of each byte in
    134         // order to spread out the bits: xxxxdcba => .d.c.b.a
    135         // We use two copies of the table for the AVX2 _mm256_shuffle_epi8
    136         Constant * interleave_table = bit_interleave_byteshuffle_table(fw);
    137         // Merge the bytes.
    138         Value * byte_merge = esimd_mergel(8, a, b);
    139         Value * low_bits = CreateCall(shufFn, {interleave_table,  fwCast(8, simd_and(byte_merge, simd_lomask(8)))});
    140         Value * high_bits = simd_slli(16, CreateCall(shufFn, {interleave_table, fwCast(8, simd_srli(8, byte_merge, 4))}), fw);
    141         // For each 16-bit field, interleave the low bits of the two bytes.
    142         low_bits = simd_or(simd_and(low_bits, simd_lomask(16)), simd_srli(16, low_bits, 8-fw));
    143         // For each 16-bit field, interleave the high bits of the two bytes.
    144         high_bits = simd_or(simd_and(high_bits, simd_himask(16)), simd_slli(16, high_bits, 8-fw));
    145         return simd_or(low_bits, high_bits);
    146     }
    147 #if LLVM_VERSION_INTEGER < LLVM_VERSION_CODE(6, 0, 0)
    148     if ((fw == 128) && (mBitBlockWidth == 256)) {
    149         Value * vperm2i128func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_vperm2i128);
    150         return CreateCall(vperm2i128func, {fwCast(64, a), fwCast(64, b), getInt8(0x20)});
    151     }
    152 #endif
     132    if (getVectorBitWidth(a) == mNativeBitBlockWidth) {
     133        if ((fw == 1) || (fw == 2)) {
     134            // Bit interleave using shuffle.
     135            Value * shufFn = Intrinsic::getDeclaration(getModule(),  Intrinsic::x86_avx2_pshuf_b);
     136            // Make a shuffle table that translates the lower 4 bits of each byte in
     137            // order to spread out the bits: xxxxdcba => .d.c.b.a
     138            // We use two copies of the table for the AVX2 _mm256_shuffle_epi8
     139            Constant * interleave_table = bit_interleave_byteshuffle_table(fw);
     140            // Merge the bytes.
     141            Value * byte_merge = esimd_mergel(8, a, b);
     142            Value * low_bits = CreateCall(shufFn, {interleave_table,  fwCast(8, simd_select_lo(8, byte_merge))});
     143            Value * high_bits = simd_slli(16, CreateCall(shufFn, {interleave_table, fwCast(8, simd_srli(8, byte_merge, 4))}), fw);
     144            // For each 16-bit field, interleave the low bits of the two bytes.
     145            low_bits = simd_or(simd_select_lo(16, low_bits), simd_srli(16, low_bits, 8-fw));
     146            // For each 16-bit field, interleave the high bits of the two bytes.
     147            high_bits = simd_or(simd_select_hi(16, high_bits), simd_slli(16, high_bits, 8-fw));
     148            return simd_or(low_bits, high_bits);
     149        }
     150    #if LLVM_VERSION_INTEGER < LLVM_VERSION_CODE(6, 0, 0)
     151        if ((fw == 128) && (mBitBlockWidth == 256)) {
     152            Value * vperm2i128func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_vperm2i128);
     153            return CreateCall(vperm2i128func, {fwCast(64, a), fwCast(64, b), getInt8(0x20)});
     154        }
     155    #endif
     156    }
    153157    // Otherwise use default SSE logic.
    154158    return IDISA_SSE_Builder::esimd_mergel(fw, a, b);
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.cpp

    r6106 r6108  
    2020namespace IDISA {
    2121
     22unsigned getVectorBitWidth(Value * vec) {
     23    return cast<VectorType>(vec->getType())->getBitWidth();
     24}
     25   
    2226VectorType * IDISA_Builder::fwVectorType(const unsigned fw) {
    2327    return VectorType::get(getIntNTy(fw), mBitBlockWidth / fw);
     
    9094}
    9195
    92 unsigned getVectorBitWidth(Value * vec) {
    93     return cast<VectorType>(vec->getType())->getBitWidth();
     96Value * IDISA_Builder::simd_select_hi(unsigned fw, Value * a) {
     97    const unsigned vectorWidth = getVectorBitWidth(a);
     98    Constant * maskField = Constant::getIntegerValue(getIntNTy(fw), APInt::getHighBitsSet(fw, fw/2));
     99    return simd_and(a, ConstantVector::getSplat(vectorWidth/fw, maskField));
     100}
     101
     102Value * IDISA_Builder::simd_select_lo(unsigned fw, Value * a) {
     103    const unsigned vectorWidth = getVectorBitWidth(a);
     104    Constant * maskField = Constant::getIntegerValue(getIntNTy(fw), APInt::getLowBitsSet(fw, fw/2));
     105    return simd_and(a, ConstantVector::getSplat(vectorWidth/fw, maskField));
    94106}
    95107
     
    106118
    107119Value * IDISA_Builder::CreateHalfVectorHigh(Value * vec) {
    108     VectorType * const vecTy = cast<VectorType>(vec->getType());
    109     const unsigned fieldCount = vecTy->getNumElements();
    110     return CreateShuffleVector(vec, UndefValue::get(vecTy), getConstantVectorSequence(32, fieldCount/2, fieldCount-1));
     120    Value * v = fwCast(mLaneWidth, vec);
     121    const unsigned N = getVectorBitWidth(v)/mLaneWidth;
     122    return CreateShuffleVector(v, UndefValue::get(v->getType()), getConstantVectorSequence(32, N/2, N-1));
    111123}
    112124
    113125Value * IDISA_Builder::CreateHalfVectorLow(Value * vec) {
    114     VectorType * const vecTy = cast<VectorType>(vec->getType());
    115     const unsigned fieldCount = vecTy->getNumElements();
    116     return CreateShuffleVector(vec, UndefValue::get(vecTy), getConstantVectorSequence(32, 0, fieldCount/2-1));
     126    Value * v = fwCast(mLaneWidth, vec);
     127    const unsigned N = getVectorBitWidth(v)/mLaneWidth;
     128    return CreateShuffleVector(v, UndefValue::get(v->getType()), getConstantVectorSequence(32, 0, N/2-1));
    117129}
    118130
    119131Value * IDISA_Builder::CreateDoubleVector(Value * lo, Value * hi) {
    120     VectorType * const vecTy = cast<VectorType>(lo->getType());
    121     const unsigned fieldCount = vecTy->getNumElements();
    122     return CreateShuffleVector(lo, hi, getConstantVectorSequence(32, 0, fieldCount*2-1));
     132    const unsigned N = getVectorBitWidth(lo)/mLaneWidth;
     133    return CreateShuffleVector(fwCast(mLaneWidth, lo), fwCast(mLaneWidth, hi), getConstantVectorSequence(32, 0, 2*N-1));
    123134}
    124135
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.h

    r6106 r6108  
    33
    44/*
    5  *  Copyright (c) 2015 International Characters.
     5 *  Copyright (c) 2018 International Characters.
    66 *  This software is licensed to the public under the Open Software License 3.0.
    77 *  icgrep is a trademark of International Characters.
     
    3737}
    3838
     39unsigned getVectorBitWidth(llvm::Value * vec);
    3940   
    4041class IDISA_Builder : public CBuilder {
     
    105106    llvm::Constant * simd_lomask(unsigned fw);
    106107   
     108    llvm::Value * simd_select_hi(unsigned fw, llvm::Value * a);
     109    llvm::Value * simd_select_lo(unsigned fw, llvm::Value * a);
     110
    107111    virtual llvm::Value * simd_fill(unsigned fw, llvm::Value * a);
    108112
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_sse_builder.cpp

    r6094 r6108  
    116116#ifndef LEAVE_CARRY_UNNORMALIZED
    117117    if (LLVM_UNLIKELY((shift % 8) == 0)) { // Use a single whole-byte shift, if possible.
    118         shifted = simd_or(mvmd_slli(8, a, shift / 8), si);
    119         shiftout = mvmd_srli(8, a, (mBitBlockWidth - shift) / 8);
     118        shifted = bitCast(simd_or(mvmd_slli(8, a, shift / 8), si));
     119        shiftout = bitCast(mvmd_srli(8, a, (mBitBlockWidth - shift) / 8));
    120120        return std::pair<Value *, Value *>(shiftout, shifted);
    121121    }
Note: See TracChangeset for help on using the changeset viewer.