Changeset 6017


Ignore:
Timestamp:
May 6, 2018, 7:28:51 AM (3 months ago)
Author:
cameron
Message:

mvmd_srl, mvmd_sll for AVX2 to improve StreamCompressKernel performance

Location:
icGREP/icgrep-devel/icgrep/IR_Gen
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r6016 r6017  
    287287}
    288288
     289llvm::Value * IDISA_AVX2_Builder::mvmd_srl(unsigned fw, llvm::Value * a, llvm::Value * shift) {
     290    // Intrinsic::x86_avx2_permd) allows an efficient implementation for field width 32.
     291    // Translate larger field widths to 32 bits.
     292    if (fw > 32) {
     293        return fwCast(fw, mvmd_srl(32, a, CreateMul(shift, ConstantInt::get(shift->getType(), fw/32))));
     294    }
     295    if ((mBitBlockWidth == 256) && (fw == 32)) {
     296        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_permd);
     297        const unsigned fieldCount = mBitBlockWidth/fw;
     298        Type * fieldTy = getIntNTy(fw);
     299        Constant * indexes[fieldCount];
     300        for (unsigned int i = 0; i < fieldCount; i++) {
     301            indexes[i] = ConstantInt::get(fieldTy, i);
     302        }
     303        Constant * indexVec = ConstantVector::get({indexes, fieldCount});
     304        Constant * fieldCountSplat = ConstantVector::getSplat(fieldCount, ConstantInt::get(fieldTy, fieldCount));
     305        Value * shiftSplat = simd_fill(fw, CreateZExtOrTrunc(shift, fieldTy));
     306        Value * permuteVec = CreateAdd(indexVec, shiftSplat);
     307        // Zero out fields that are above the max.
     308        permuteVec = simd_and(permuteVec, simd_ult(fw, permuteVec, fieldCountSplat));
     309        // Insert a zero value at position 0 (OK for shifts > 0)
     310        Value * a0 = mvmd_insert(fw, a, Constant::getNullValue(fieldTy), 0);
     311        Value * shifted = CreateCall(permuteFunc, {a0, permuteVec});
     312        return simd_if(1, simd_eq(fw, shiftSplat, allZeroes()), a, shifted);
     313    }
     314    return IDISA_Builder::mvmd_srl(fw, a, shift);
     315}
     316
     317llvm::Value * IDISA_AVX2_Builder::mvmd_sll(unsigned fw, llvm::Value * a, llvm::Value * shift) {
     318    // Intrinsic::x86_avx2_permd) allows an efficient implementation for field width 32.
     319    // Translate larger field widths to 32 bits.
     320    if (fw > 32) {
     321        return fwCast(fw, mvmd_srl(32, a, CreateMul(shift, ConstantInt::get(shift->getType(), fw/32))));
     322    }
     323    if ((mBitBlockWidth == 256) && (fw == 32)) {
     324        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_permd);
     325        const unsigned fieldCount = mBitBlockWidth/fw;
     326        Type * fieldTy = getIntNTy(fw);
     327        Constant * indexes[fieldCount];
     328        for (unsigned int i = 0; i < fieldCount; i++) {
     329            indexes[i] = ConstantInt::get(fieldTy, i);
     330        }
     331        Constant * indexVec = ConstantVector::get({indexes, fieldCount});
     332        Value * shiftSplat = simd_fill(fw, CreateZExtOrTrunc(shift, fieldTy));
     333        Value * permuteVec = CreateSub(indexVec, shiftSplat);
     334        // Negative indexes are for fields that must be zeroed.  Convert the
     335        // permute constant to an all ones value, that will select item 7.
     336        permuteVec = simd_or(permuteVec, simd_lt(fw, permuteVec, fwCast(fw, allZeroes())));
     337        // Insert a zero value at position 7 (OK for shifts > 0)
     338        Value * a0 = mvmd_insert(fw, a, Constant::getNullValue(fieldTy), 7);
     339        Value * shifted = CreateCall(permuteFunc, {a0, permuteVec});
     340        return simd_if(1, simd_eq(fw, shiftSplat, allZeroes()), a, shifted);
     341    }
     342    return IDISA_Builder::mvmd_sll(fw, a, shift);
     343}
     344
    289345llvm::Value * IDISA_AVX2_Builder::mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) {
    290346    if (mBitBlockWidth == 256 && fw == 32) {
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.h

    r6016 r6017  
    4848    std::pair<llvm::Value *, llvm::Value *> bitblock_indexed_advance(llvm::Value * a, llvm::Value * index_strm, llvm::Value * shiftin, unsigned shift) override;
    4949    llvm::Value * hsimd_signmask(unsigned fw, llvm::Value * a) override;
     // AVX2-specific override of mvmd_srl (fw = field width in bits, a = block to
     // shift, shift = shift amount).  The definition in idisa_avx_builder.cpp uses
     // Intrinsic::x86_avx2_permd for 256-bit blocks with fw == 32, rescales
     // fw > 32 to 32-bit fields, and otherwise calls IDISA_Builder::mvmd_srl.
     50    llvm::Value * mvmd_srl(unsigned fw, llvm::Value * a, llvm::Value * shift) override;
     // AVX2-specific override of mvmd_sll (fw = field width in bits, a = block to
     // shift, shift = shift amount).  The definition in idisa_avx_builder.cpp uses
     // Intrinsic::x86_avx2_permd for 256-bit blocks with fw == 32, rescales
     // fw > 32 to 32-bit fields, and otherwise calls IDISA_Builder::mvmd_sll.
     51    llvm::Value * mvmd_sll(unsigned fw, llvm::Value * a, llvm::Value * shift) override;
    5052    llvm::Value * mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) override;
    5153    llvm::Value * simd_pext(unsigned fw, llvm::Value * v, llvm::Value * extract_mask) override;
Note: See TracChangeset for help on using the changeset viewer.