Changeset 6006


Ignore:
Timestamp:
May 1, 2018, 10:41:13 AM (3 weeks ago)
Author:
cameron
Message:

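Add FieldCompressKernel, a multiblock field compress kernel, and use it in u8u16 in place of DeletionKernel; u8u16 now emits the in-file complement of delmask as the kernel's extraction mask. Also rename the KernelBuilder parameter from iBuilder to kb in deletion.cpp.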

Location:
icGREP/icgrep-devel/icgrep
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r6004 r6006  
    11/*
    2  *  Copyright (c) 2017 International Characters.
     2 *  Copyright (c) 2018 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 */
     
    1212namespace kernel {
    1313
    14 inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
    15     Value * m = iBuilder->simd_not(del_mask);
    16     Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
     14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
     15    Value * m = kb->simd_not(del_mask);
     16    Value * mk = kb->simd_slli(fw, del_mask, 1);
    1717    std::vector<Value *> move_masks;
    1818    for (unsigned shift = 1; shift < fw; shift *= 2) {
    1919        Value * mp = mk;
    2020        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
    21             mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
    22         }
    23         Value * mv = iBuilder->simd_and(mp, m);
    24         m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
    25         mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
     21            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
     22        }
     23        Value * mv = kb->simd_and(mp, m);
     24        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
     25        mk = kb->simd_and(mk, kb->simd_not(mp));
    2626        move_masks.push_back(mv);
    2727    }
     
    2929}
    3030
    31 inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
    32     Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
     31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
     32    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
    3333    for (unsigned i = 0; i < mv.size(); i++) {
    3434        unsigned shift = 1 << i;
    35         Value * t = iBuilder->simd_and(s, mv[i]);
    36         s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
     35        Value * t = kb->simd_and(s, mv[i]);
     36        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
    3737    }
    3838    return s;
     
    4343// Outputs: the deleted streams, plus a partial sum popcount
    4444
    45 void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    46     Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
    47     const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
     45void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
     46    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
     47    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
    4848    for (unsigned j = 0; j < mStreamCount; ++j) {
    49         Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
    50         Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
    51         iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    52     }
    53     Value * unitCount = iBuilder->simd_popcount(mDeletionFieldWidth, iBuilder->simd_not(delMask));
    54     iBuilder->storeOutputStreamBlock("unitCounts", iBuilder->getInt32(0), iBuilder->bitCast(unitCount));
    55 }
    56 
    57 void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
    58     IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
    59     Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
    60     Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    61     Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
    62     const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
     49        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
     50        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
     51        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
     52    }
     53    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
     54    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
     55}
     56
     57void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
     58    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
     59    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
     60    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
     61    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
     62    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
    6363    for (unsigned j = 0; j < mStreamCount; ++j) {
    64         Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
    65         Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
    66         iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    67     }
    68     Value * const unitCount = iBuilder->simd_popcount(mDeletionFieldWidth, iBuilder->simd_not(delMask));
    69     iBuilder->storeOutputStreamBlock("unitCounts", iBuilder->getInt32(0), iBuilder->bitCast(unitCount));
    70 }
    71 
    72 DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
     64        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
     65        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
     66        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
     67    }
     68    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
     69    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
     70}
     71
     72DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
    7373: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
    74                       {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
    75                           Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
    76                       {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
    77                           Binding{iBuilder->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
     74                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
     75                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
     76                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
     77                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
    7878                      {}, {}, {})
    7979, mDeletionFieldWidth(fieldWidth)
     
    8181}
    8282
    83    
    84    
     83void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
     84    BasicBlock * entry = kb->GetInsertBlock();
     85    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
     86    BasicBlock * done = kb->CreateBasicBlock("done");
     87    Constant * const ZERO = kb->getSize(0);
     88    kb->CreateBr(processBlock);
     89    kb->SetInsertPoint(processBlock);
     90    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
     91    blockOffsetPhi->addIncoming(ZERO, entry);
     92    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
     93    Value * delMask = kb->simd_not(extractionMask);
     94    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
     95    for (unsigned j = 0; j < mStreamCount; ++j) {
     96        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
     97        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
     98        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
     99    }
     100    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
     101    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
     102    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
     103    blockOffsetPhi->addIncoming(nextBlk, processBlock);
     104    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
     105    kb->CreateCondBr(moreToDo, processBlock, done);
     106    kb->SetInsertPoint(done);
     107}
     108
     109FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
     110: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
     111                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
     112                          Binding{kb->getStreamSetTy(), "extractionMask"}},
     113                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
     114                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
     115                      {}, {}, {})
     116, mCompressFieldWidth(fieldWidth)
     117, mStreamCount(streamCount) {
     118}
     119
    85120SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
    86121: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
     
    331366// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
    332367
    333 void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    334     Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
    335     generateProcessingLoop(iBuilder, delMask);
    336 }
    337 
    338 void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
    339     IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
    340     Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
    341     Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    342     Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
    343     generateProcessingLoop(iBuilder, delMask);
    344 }
    345 
    346 void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, Value * delMask) {
     368void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
     369    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
     370    generateProcessingLoop(kb, delMask);
     371}
     372
     373void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
     374    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
     375    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
     376    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
     377    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
     378    generateProcessingLoop(kb, delMask);
     379}
     380
     381void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
    347382    Constant * PEXT_func = nullptr;
    348383    if (mPEXTWidth == 64) {
    349         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
     384        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    350385    } else if (mPEXTWidth == 32) {
    351         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
     386        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    352387    }
    353388    std::vector<Value *> masks(mSwizzleFactor);
    354     Value * const m = iBuilder->fwCast(mSwizzleFactor, iBuilder->simd_not(delMask));
     389    Value * const m = kb->fwCast(mSwizzleFactor, kb->simd_not(delMask));
    355390    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    356         masks.push_back(iBuilder->CreateExtractElement(m, i));
     391        masks.push_back(kb->CreateExtractElement(m, i));
    357392    }
    358393
    359394    for (unsigned i = 0; i < mStreamCount; ++i) {
    360         Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i));
    361         Value * value = iBuilder->fwCast(mPEXTWidth, input);
     395        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
     396        Value * value = kb->fwCast(mPEXTWidth, input);
    362397        Value * output = UndefValue::get(value->getType());
    363398        for (unsigned j = 0; j < mSwizzleFactor; j++) {
    364             Value * field = iBuilder->CreateExtractElement(value, j);
    365             Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[j]});
    366             output = iBuilder->CreateInsertElement(output, compressed, j);
    367         }
    368         iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(i), output);
    369     }
    370     Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    371     iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
     399            Value * field = kb->CreateExtractElement(value, j);
     400            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
     401            output = kb->CreateInsertElement(output, compressed, j);
     402        }
     403        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
     404    }
     405    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
     406    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
    372407}
    373408
     
    402437//
    403438
    404 SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
     439SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
    405440: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
    406                      {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
     441                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
    407442, mBitStreamCount(bitStreamCount)
    408443    , mFieldWidth(fieldWidth)
    409     , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
     444    , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
    410445    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
    411446        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
    412447        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
    413         mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
    414         mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    415         addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
     448        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
     449        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
     450        addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
    416451        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
    417             mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
    418             mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
    419             addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    420         }
    421         addScalar(iBuilder->getSizeTy(), "pendingOffset");
     452            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
     453            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
     454            addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
     455        }
     456        addScalar(kb->getSizeTy(), "pendingOffset");
    422457}
    423458   
    424 void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     459void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    425460       
    426     Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
    427     Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
     461    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
     462    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
    428463   
    429464    // Output is written and committed to the output buffer one swizzle at a time.
    430465    //
    431     Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
    432     Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
     466    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
     467    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
    433468   
    434     Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    435     Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
    436     Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
     469    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
     470    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
     471    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
    437472
    438473    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    439     Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
     474    Value * pendingOffset = kb->getScalarField("pendingOffset");
    440475    // There is a separate vector of pending data for each swizzle group.
    441476    std::vector<Value *> pendingData;
    442477    std::vector<Value *> outputStreamPtr;
    443478    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    444         pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
    445         outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
     479        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
     480        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    446481    }
    447482   
     
    449484    // We load the count for the field and process all swizzle groups accordingly.
    450485    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    451         Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
    452         Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
    453         Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
     486        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
     487        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
     488        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
    454489       
    455490        // Data from the ith swizzle pack of each group is processed
    456491        // according to the same newItemCount, pendingSpace, ...
    457492        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    458             Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
     493            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
    459494            // Combine as many of the new items as possible into the pending group.
    460             Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
     495            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
    461496            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
    462             iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
     497            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
    463498            // Any items in excess of the space available in the current pending group overflow for the next group.
    464             Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
     499            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
    465500            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
    466             pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
    467         }
    468         outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
    469         pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
    470     }
    471     iBuilder->setScalarField("pendingOffset", pendingOffset);   
    472     Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
    473     Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
     501            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
     502        }
     503        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
     504        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
     505    }
     506    kb->setScalarField("pendingOffset", pendingOffset);
     507    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
     508    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    474509    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    475         iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    476     }
    477     iBuilder->setProducedItemCount("outputSwizzle0", produced);
    478 }
    479 
    480 void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
    481     CreateDoBlockMethodCall(iBuilder);
    482     Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
    483     Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
     510        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
     511    }
     512    kb->setProducedItemCount("outputSwizzle0", produced);
     513}
     514
     515void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
     516    CreateDoBlockMethodCall(kb);
     517    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
     518    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
    484519   
    485     Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    486     Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
    487     Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
    488     Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
     520    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
     521    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
     522    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
     523    Value * pendingOffset = kb->getScalarField("pendingOffset");
    489524
    490525    // Write the pending data.
    491526    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    492         Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
    493         Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
    494         iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
    495     }
    496     iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
    497 }
    498 }
     527        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
     528        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
     529        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
     530    }
     531    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
     532}
     533}
  • icGREP/icgrep-devel/icgrep/kernels/deletion.h

    r6004 r6006  
    3434    const unsigned mStreamCount;
    3535};
     36
     37    class FieldCompressKernel final : public MultiBlockKernel {
     38    public:
     39        FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount);
     40        bool isCachable() const override { return true; }
     41        bool hasSignature() const override { return false; }
     42    protected:
     43        void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
     44    private:
     45        const unsigned mCompressFieldWidth;
     46        const unsigned mStreamCount;
     47    };
     48   
    3649
    3750/*
  • icGREP/icgrep-devel/icgrep/u8u16.cpp

    r5985 r6006  
    254254        main.createAssign(main.createExtract(output, i + 8), u16_lo[i]);
    255255    }
    256     main.createAssign(main.createExtract(delmask_out, main.getInteger(0)), delmask);
     256    main.createAssign(main.createExtract(delmask_out, main.getInteger(0)), main.createInFile(main.createNot(delmask)));
    257257}
    258258
     
    333333    } else {
    334334        StreamSetBuffer * DeletionCounts = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), bufferSize);
    335         Kernel * delK = pxDriver.addKernelInstance<DeletionKernel>(iBuilder, iBuilder->getBitBlockWidth()/16, 16);
     335        Kernel * delK = pxDriver.addKernelInstance<FieldCompressKernel>(iBuilder, iBuilder->getBitBlockWidth()/16, 16);
    336336        pxDriver.makeKernelCall(delK, {u8bits, DelMask}, {u16bits, DeletionCounts});
    337337        Kernel * p2sk = pxDriver.addKernelInstance<P2S16KernelWithCompressedOutput>(iBuilder);
Note: See TracChangeset for help on using the changeset viewer.