Changeset 6011


Ignore:
Timestamp:
May 2, 2018, 12:18:55 PM (4 months ago)
Author:
cameron
Message:

simd_pext and simd_pdep for AVX2

Location:
icGREP/icgrep-devel/icgrep/IR_Gen
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r6009 r6011  
    166166}
    167167
     168Value * IDISA_AVX2_Builder::simd_pext(unsigned fieldwidth, Value * v, Value * extract_mask) {
     169    if ((fieldwidth == 64) || (fieldwidth == 32)) {
     170        Value * PEXT_f = (fieldwidth == 64) ? Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pext_64)
     171                                            : Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pext_32);
     172        const auto n = getBitBlockWidth() / fieldwidth;
     173        Value * result = UndefValue::get(fwVectorType(fieldwidth));
     174        for (unsigned i = 0; i < n; i++) {
     175            Value * v_i = mvmd_extract(fieldwidth, v, i);
     176            Value * mask_i = mvmd_extract(fieldwidth, extract_mask, i);
     177            Value * bits = CreateCall(PEXT_f, {v_i, mask_i});
     178            result = mvmd_insert(fieldwidth, result, bits, i);
     179        }
     180        return bitCast(result);
     181    }
     182    return IDISA_Builder::simd_pext(fieldwidth, v, extract_mask);
     183}
     184
     185Value * IDISA_AVX2_Builder::simd_pdep(unsigned fieldwidth, Value * v, Value * deposit_mask) {
     186    if ((fieldwidth == 64) || (fieldwidth == 32)) {
     187        Value * PDEP_f = (fieldwidth == 64) ? Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_64)
     188                                            : Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_32);
     189        const auto n = getBitBlockWidth() / fieldwidth;
     190        Value * result = UndefValue::get(fwVectorType(fieldwidth));
     191        for (unsigned i = 0; i < n; i++) {
     192            Value * v_i = mvmd_extract(fieldwidth, v, i);
     193            Value * mask_i = mvmd_extract(fieldwidth, deposit_mask, i);
     194            Value * bits = CreateCall(PDEP_f, {v_i, mask_i});
     195            result = mvmd_insert(fieldwidth, result, bits, i);
     196        }
     197        return bitCast(result);
     198    }
     199    return IDISA_Builder::simd_pdep(fieldwidth, v, deposit_mask);
     200}
     201
    // IDISA_AVX2_Builder::bitblock_indexed_advance, shown as a diff of r6009 -> r6011.
    // Gutter: a line with two numbers is unchanged, a line with only the left-hand
    // number was removed by r6011, a line with only the right-hand number was added.
    //
    // For each getSizeTy()-wide field the emitted IR gathers the bits of strm found at
    // the positions marked in index_strm (PEXT), advances them by shiftAmount with
    // shiftIn supplying the incoming bits, and scatters them back onto the marked
    // positions (PDEP).  The returned pair is {carry-out, result}.
    //
    // r6011 change: rather than calling llvm::report_fatal_error for an unsupported
    // size-type width, the BMI code runs only when bitWidth is 64 or 32, and every
    // other width defers to IDISA_Builder::bitblock_indexed_advance.
    168202std::pair<Value *, Value *> IDISA_AVX2_Builder::bitblock_indexed_advance(Value * strm, Value * index_strm, Value * shiftIn, unsigned shiftAmount) {
    169     Value * const popcount = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, getSizeTy());
    170     Value * PEXT_f = nullptr;
    171     Value * PDEP_f = nullptr;
    172203    const unsigned bitWidth = getSizeTy()->getBitWidth();
    173     if (bitWidth == 64) {
    174         PEXT_f = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pext_64);
    175         PDEP_f = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_64);
    176     } else if ((bitWidth == 32)  && (shiftAmount < 32)) {
    177         PEXT_f = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pext_32);
    178         PDEP_f = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_32);
    179     } else {
    180         llvm::report_fatal_error("indexed_advance unsupported bit width");
    181     }
    182     Type * iBitBlock = getIntNTy(getBitBlockWidth());
    183     Value * shiftVal = getSize(shiftAmount);
    184     const auto n = getBitBlockWidth() / bitWidth;
    185     VectorType * const vecTy = VectorType::get(getSizeTy(), n);
    186     if (LLVM_LIKELY(shiftAmount < bitWidth)) {
    187         Value * carry = mvmd_extract(bitWidth, shiftIn, 0);
    188         Value * result = UndefValue::get(vecTy);
    189         for (unsigned i = 0; i < n; i++) {
    190             Value * s = mvmd_extract(bitWidth, strm, i);
    191             Value * ix = mvmd_extract(bitWidth, index_strm, i);
    192             Value * ix_popcnt = CreateCall(popcount, {ix});
    193             Value * bits = CreateCall(PEXT_f, {s, ix});
    194             Value * adv = CreateOr(CreateShl(bits, shiftAmount), carry);
    195             // We have two cases depending on whether the popcount of the index pack is < shiftAmount or not.
    196             Value * popcount_small = CreateICmpULT(ix_popcnt, shiftVal);
    197             Value * carry_if_popcount_small =
    198                 CreateOr(CreateShl(bits, CreateSub(shiftVal, ix_popcnt)),
    199                             CreateLShr(carry, ix_popcnt));
    200             Value * carry_if_popcount_large = CreateLShr(bits, CreateSub(ix_popcnt, shiftVal));
    201             carry = CreateSelect(popcount_small, carry_if_popcount_small, carry_if_popcount_large);
    202             result = mvmd_insert(bitWidth, result, CreateCall(PDEP_f, {adv, ix}), i);
    203         }
    204         Value * carryOut = mvmd_insert(bitWidth, allZeroes(), carry, 0);
    205         return std::pair<Value *, Value *>{bitCast(carryOut), bitCast(result)};
    206     }
    207     else if (shiftAmount <= mBitBlockWidth) {
    208         // The shift amount is always greater than the popcount of the individual
    209         // elements that we deal with.   This simplifies some of the logic.
    210         Value * carry = CreateBitCast(shiftIn, iBitBlock);
    211         Value * result = UndefValue::get(vecTy);
    212         for (unsigned i = 0; i < n; i++) {
    213             Value * s = mvmd_extract(bitWidth, strm, i);
    214             Value * ix = mvmd_extract(bitWidth, index_strm, i);
    215             Value * ix_popcnt = CreateCall(popcount, {ix});
    216             Value * bits = CreateCall(PEXT_f, {s, ix});  // All these bits are shifted out (appended to carry).
    217             result = mvmd_insert(bitWidth, result, CreateCall(PDEP_f, {mvmd_extract(bitWidth, carry, 0), ix}), i);
    218             carry = CreateLShr(carry, CreateZExt(ix_popcnt, iBitBlock)); // Remove the carry bits consumed, make room for new bits.
    219             carry = CreateOr(carry, CreateShl(CreateZExt(bits, iBitBlock), CreateZExt(CreateSub(shiftVal, ix_popcnt), iBitBlock)));
    220         }
    221         return std::pair<Value *, Value *>{bitCast(carry), bitCast(result)};
    222     }
    223     else {
    224         // The shift amount is greater than the total popcount.   We will consume popcount
    225         // bits from the shiftIn value only, and produce a carry out value of the selected bits.
    226         // elements that we deal with.   This simplifies some of the logic.
    227         Value * carry = CreateBitCast(shiftIn, iBitBlock);
    228         Value * result = UndefValue::get(vecTy);
    229         Value * carryOut = ConstantInt::getNullValue(iBitBlock);
    230         Value * generated = getSize(0);
    231         for (unsigned i = 0; i < n; i++) {
    232             Value * s = mvmd_extract(bitWidth, strm, i);
    233             Value * ix = mvmd_extract(bitWidth, index_strm, i);
    234             Value * ix_popcnt = CreateCall(popcount, {ix});
    235             Value * bits = CreateCall(PEXT_f, {s, ix});  // All these bits are shifted out (appended to carry).
    236             result = mvmd_insert(bitWidth, result, CreateCall(PDEP_f, {mvmd_extract(bitWidth, carry, 0), ix}), i);
    237             carry = CreateLShr(carry, CreateZExt(ix_popcnt, iBitBlock)); // Remove the carry bits consumed.
    238             carryOut = CreateOr(carryOut, CreateShl(CreateZExt(bits, iBitBlock), CreateZExt(generated, iBitBlock)));
    239             generated = CreateAdd(generated, ix_popcnt);
    240         }
    241         return std::pair<Value *, Value *>{bitCast(carryOut), bitCast(result)};
    242     }
            // --- r6011 body starts here ---
            // NOTE(review): r6009 accepted a 32-bit size type only when shiftAmount < 32;
            // this guard accepts 32 for any shiftAmount -- confirm the two wider-shift
            // branches below are intended to run with 32-bit fields.
     204    if ((bitWidth == 64) || (bitWidth == 32)) {
     205        Value * PEXT_f = (bitWidth == 64) ? Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pext_64)
     206                                          : Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pext_32);
     207        Value * PDEP_f = (bitWidth == 64) ? Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_64)
     208                                          : Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_32);
     209        Value * const popcount = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, getSizeTy());
     210        Type * iBitBlock = getIntNTy(getBitBlockWidth());
     211        Value * shiftVal = getSize(shiftAmount);
     212        const auto n = getBitBlockWidth() / bitWidth;
     213        VectorType * const vecTy = VectorType::get(getSizeTy(), n);
                // Case 1: the shift fits inside one field, so the carry is a single field value.
     214        if (LLVM_LIKELY(shiftAmount < bitWidth)) {
     215            Value * carry = mvmd_extract(bitWidth, shiftIn, 0);
     216            Value * result = UndefValue::get(vecTy);
     217            for (unsigned i = 0; i < n; i++) {
     218                Value * s = mvmd_extract(bitWidth, strm, i);
     219                Value * ix = mvmd_extract(bitWidth, index_strm, i);
     220                Value * ix_popcnt = CreateCall(popcount, {ix});
     221                Value * bits = CreateCall(PEXT_f, {s, ix});
     222                Value * adv = CreateOr(CreateShl(bits, shiftAmount), carry);
     223                // We have two cases depending on whether the popcount of the index pack is < shiftAmount or not.
     224                Value * popcount_small = CreateICmpULT(ix_popcnt, shiftVal);
     225                Value * carry_if_popcount_small =
     226                    CreateOr(CreateShl(bits, CreateSub(shiftVal, ix_popcnt)),
     227                                CreateLShr(carry, ix_popcnt));
     228                Value * carry_if_popcount_large = CreateLShr(bits, CreateSub(ix_popcnt, shiftVal));
                        // Both candidate carries are emitted unconditionally; the select keeps
                        // the one that matches the popcount comparison.
     229                carry = CreateSelect(popcount_small, carry_if_popcount_small, carry_if_popcount_large);
     230                result = mvmd_insert(bitWidth, result, CreateCall(PDEP_f, {adv, ix}), i);
     231            }
     232            Value * carryOut = mvmd_insert(bitWidth, allZeroes(), carry, 0);
     233            return std::pair<Value *, Value *>{bitCast(carryOut), bitCast(result)};
     234        }
                // Case 2: the carry is held as one iBitBlock-wide integer.
     235        else if (shiftAmount <= mBitBlockWidth) {
     236            // The shift amount is always greater than the popcount of the individual
     237            // elements that we deal with.   This simplifies some of the logic.
     238            Value * carry = CreateBitCast(shiftIn, iBitBlock);
     239            Value * result = UndefValue::get(vecTy);
     240            for (unsigned i = 0; i < n; i++) {
     241                Value * s = mvmd_extract(bitWidth, strm, i);
     242                Value * ix = mvmd_extract(bitWidth, index_strm, i);
     243                Value * ix_popcnt = CreateCall(popcount, {ix});
     244                Value * bits = CreateCall(PEXT_f, {s, ix});  // All these bits are shifted out (appended to carry).
     245                result = mvmd_insert(bitWidth, result, CreateCall(PDEP_f, {mvmd_extract(bitWidth, carry, 0), ix}), i);
     246                carry = CreateLShr(carry, CreateZExt(ix_popcnt, iBitBlock)); // Remove the carry bits consumed, make room for new bits.
                        // NOTE(review): if shiftAmount == mBitBlockWidth and ix_popcnt is 0, the shl
                        // below shifts an iBitBlock value by its full width (assuming mBitBlockWidth
                        // equals getBitBlockWidth()), which LLVM treats as poison -- confirm.
     247                carry = CreateOr(carry, CreateShl(CreateZExt(bits, iBitBlock), CreateZExt(CreateSub(shiftVal, ix_popcnt), iBitBlock)));
     248            }
     249            return std::pair<Value *, Value *>{bitCast(carry), bitCast(result)};
     250        }
                // Case 3: shiftAmount exceeds mBitBlockWidth; the extracted bits are packed
                // into carryOut at a running offset (generated) instead of being merged into carry.
     251        else {
     252            // The shift amount is greater than the total popcount.   We will consume popcount
     253            // bits from the shiftIn value only, and produce a carry out value of the selected bits.
     254            // elements that we deal with.   This simplifies some of the logic.
     255            Value * carry = CreateBitCast(shiftIn, iBitBlock);
     256            Value * result = UndefValue::get(vecTy);
     257            Value * carryOut = ConstantInt::getNullValue(iBitBlock);
     258            Value * generated = getSize(0);
     259            for (unsigned i = 0; i < n; i++) {
     260                Value * s = mvmd_extract(bitWidth, strm, i);
     261                Value * ix = mvmd_extract(bitWidth, index_strm, i);
     262                Value * ix_popcnt = CreateCall(popcount, {ix});
     263                Value * bits = CreateCall(PEXT_f, {s, ix});  // All these bits are shifted out (appended to carry).
     264                result = mvmd_insert(bitWidth, result, CreateCall(PDEP_f, {mvmd_extract(bitWidth, carry, 0), ix}), i);
     265                carry = CreateLShr(carry, CreateZExt(ix_popcnt, iBitBlock)); // Remove the carry bits consumed.
     266                carryOut = CreateOr(carryOut, CreateShl(CreateZExt(bits, iBitBlock), CreateZExt(generated, iBitBlock)));
     267                generated = CreateAdd(generated, ix_popcnt);
     268            }
     269            return std::pair<Value *, Value *>{bitCast(carryOut), bitCast(result)};
     270        }
     271    }
            // Size-type width is neither 64 nor 32: use the base-class implementation.
     272    return IDISA_Builder::bitblock_indexed_advance(strm, index_strm, shiftIn, shiftAmount);
    243273}
    244274
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.h

    r6009 r6011  
    4949    llvm::Value * hsimd_signmask(unsigned fw, llvm::Value * a) override;
    5050    llvm::Value * mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) override;
     // AVX2 override: 32- and 64-bit fields use the x86 BMI pext intrinsic per field;
     // any other field width defers to IDISA_Builder::simd_pext.
     51    llvm::Value * simd_pext(unsigned fw, llvm::Value * v, llvm::Value * extract_mask) override;
     // AVX2 override: 32- and 64-bit fields use the x86 BMI pdep intrinsic per field;
     // any other field width defers to IDISA_Builder::simd_pdep.
     52    llvm::Value * simd_pdep(unsigned fw, llvm::Value * v, llvm::Value * deposit_mask) override;
     53   
    5154
    5255    ~IDISA_AVX2_Builder() {}
Note: See TracChangeset for help on using the changeset viewer.