Ignore:
Timestamp:
Nov 2, 2018, 7:18:31 PM (6 months ago)
Author:
nmedfort
Message:

Initial version of PipelineKernel + revised StreamSet model.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r6094 r6184  
    1313
    1414using namespace llvm;
     15
     16inline size_t ceil_udiv(const size_t n, const size_t m) {
     17    return (n + m - 1) / m;
     18}
    1519
    1620namespace kernel {
     
    113117}
    114118
    115 FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
    116 : MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
    117                       {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
    118                           Binding{kb->getStreamSetTy(), "extractionMask"}},
    119119#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
    120                    {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
     120FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b
     121                                         , StreamSet * inputStreamSet, StreamSet * extractionMask
     122                                         , StreamSet * outputStreamSet)
     123: MultiBlockKernel("fieldCompress" + std::to_string(b->getBitBlockWidth() / inputStreamSet->getNumElements()) + "_" + std::to_string(inputStreamSet->getNumElements()),
     124// inputs
     125{Binding{"inputStreamSet", inputStreamSet},
     126Binding{"extractionMask", extractionMask}},
     127// outputs
     128{Binding{"outputStreamSet", outputStreamSet}},
     129{}, {}, {})
     130, mCompressFieldWidth(b->getBitBlockWidth() / inputStreamSet->getNumElements())
     131, mStreamCount(inputStreamSet->getNumElements()) {
     132
     133}
    121134#else
    122                    {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
    123                        Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
     135FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b
     136                                         , StreamSet * inputStreamSet, StreamSet * extractionMask
     137                                         , StreamSet * outputStreamSet, StreamSet * unitCounts)
     138: MultiBlockKernel("fieldCompress" + std::to_string(b->getBitBlockWidth() / inputStreamSet->getNumElements()) + "_" + std::to_string(inputStreamSet->getNumElements()),
     139// inputs
     140{Binding{"inputStreamSet", inputStreamSet},
     141Binding{"extractionMask", extractionMask}},
     142// outputs
     143{Binding{"outputStreamSet", outputStreamSet},
     144Binding{"unitCounts", unitCounts, FixedRate(), RoundUpTo(b->getBitBlockWidth())}},
     145{}, {}, {})
     146, mCompressFieldWidth(b->getBitBlockWidth() / inputStreamSet->getNumElements())
     147, mStreamCount(inputStreamSet->getNumElements()) {
     148
     149}
    124150#endif
    125                       {}, {}, {})
    126 , mCompressFieldWidth(fieldWidth)
    127 , mStreamCount(streamCount) {
    128 }
    129151
    130152void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
     
    197219}
    198220   
    199 StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
    200 : MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
    201                    {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
     221StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb
     222                                           , StreamSet * source
     223                                           #ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
     224                                           , StreamSet * extractionMask
     225                                           #else
     226                                           , StreamSet * unitCounts
     227                                           #endif
     228                                           , StreamSet * compresedOutput
     229                                           , const unsigned FieldWidth)
     230: MultiBlockKernel("streamCompress" + std::to_string(FieldWidth) + "_" + std::to_string(source->getNumElements()),
     231{Binding{"sourceStreamSet", source},
    202232#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
    203                        Binding{kb->getStreamSetTy(), "extractionMask"}},
     233Binding{"extractionMask", extractionMask}},
    204234#else
    205                    Binding{kb->getStreamSetTy(), "unitCounts"}},
     235Binding{"unitCounts", unitCounts}},
    206236#endif
    207                    {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
    208                    {}, {}, {})
    209 , mCompressedFieldWidth(fieldWidth)
    210 , mStreamCount(streamCount) {
    211     addScalar(kb->getSizeTy(), "pendingItemCount");
    212     for (unsigned i = 0; i < streamCount; i++) {
    213         addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
     237{Binding{"compressedOutput", compresedOutput, BoundedRate(0, 1)}},
     238{}, {}, {})
     239, mCompressedFieldWidth(FieldWidth)
     240, mStreamCount(source->getNumElements()) {
     241    addInternalScalar(kb->getSizeTy(), "pendingItemCount");
     242    for (unsigned i = 0; i < mStreamCount; i++) {
     243        addInternalScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    214244    }
    215245
     
    233263    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
    234264    Constant * const ZERO = b->getSize(0);
    235    
     265
    236266    Value * pendingItemCount = b->getScalarField("pendingItemCount");
    237267    std::vector<Value *> pendingData(mStreamCount);
     
    339369    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
    340370    for (unsigned i = 0; i < mStreamCount; i++) {
    341         Value * outputFwd = b->fwCast(fw, b->mvmd_sll(fw, outputFields[i], pendingFieldIdx));
    342         outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
    343         pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
     371        Value * shiftedField = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
     372
     373        Value * outputFwd = b->fwCast(fw, shiftedField);
     374        shiftedField = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
     375
     376        pendingOutput[i] = b->simd_or(pendingOutput[i], shiftedField);
    344377        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
    345378    }
     
    396429}
    397430
    398 SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
    399 : BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
    400                   {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
    401                   {}, {}, {}, {})
    402 , mStreamCount(streamCount)
    403 , mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
    404 // add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
    405 // to prevent rounding errors.
    406 , mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
    407 , mPEXTWidth(PEXT_width)
    408 {
    409     assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
    410         && "mDelCountFieldWidth must be a power of 2");
     431Bindings makeSwizzledDeleteByPEXTOutputBindings(const std::vector<StreamSet *> & outputStreamSets, const unsigned PEXTWidth) {
     432    const auto n = outputStreamSets.size();
     433    Bindings outputs;
     434    outputs.reserve(n);
     435    outputs.emplace_back("outputSwizzle0", outputStreamSets[0], PopcountOf("selectors"), BlockSize(PEXTWidth)); // PopcountOfNot("delMaskSet")
     436    for (unsigned i = 1; i < n; ++i) {
     437        outputs.emplace_back("outputSwizzle" + std::to_string(i), outputStreamSets[i], RateEqualTo("outputSwizzle0"), BlockSize(PEXTWidth));
     438    }
     439    return outputs;
     440}
     441
     442SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b,
     443                                                       StreamSet * selectors, StreamSet * inputStreamSet,
     444                                                       const std::vector<StreamSet *> & outputStreamSets,
     445                                                       const unsigned PEXTWidth)
     446
     447: MultiBlockKernel("PEXTdel" + std::to_string(PEXTWidth) + "_" + std::to_string(inputStreamSet->getNumElements()),
     448{Binding{"selectors", selectors}, Binding{"inputStreamSet", inputStreamSet}},
     449std::move(makeSwizzledDeleteByPEXTOutputBindings(outputStreamSets, PEXTWidth)),
     450{}, {}, {})
     451, mStreamCount(inputStreamSet->getNumElements())
     452, mSwizzleFactor(b->getBitBlockWidth() / PEXTWidth)
     453, mSwizzleSetCount(ceil_udiv(mStreamCount, mSwizzleFactor))
     454, mPEXTWidth(PEXTWidth) {
     455
     456    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0) && "mDelCountFieldWidth must be a power of 2");
    411457    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
    412458    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
    413 
    414     // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
    415     Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
    416 
    417     mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
    418     addScalar(b->getBitBlockType(), "pendingSwizzleData0");
    419     for (unsigned i = 1; i < mSwizzleSetCount; i++) {
    420         mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
    421         addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    422     }
    423     addScalar(b->getSizeTy(), "pendingOffset");
    424 }
    425 
    426 void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
     459    assert (mSwizzleSetCount);
     460    assert (outputStreamSets.size() == mSwizzleSetCount);
     461    assert (outputStreamSets[0]->getNumElements() == mSwizzleFactor);
     462
     463    addInternalScalar(b->getBitBlockType(), "pendingSwizzleData0");
     464    for (unsigned i = 1; i < outputStreamSets.size(); ++i) {
     465        assert (outputStreamSets[i]->getNumElements() == mSwizzleFactor);
     466        addInternalScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
     467    }
     468}
     469
     470void SwizzledDeleteByPEXTkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
    427471    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
    428     Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
    429     generateProcessingLoop(b, delMask, false);
    430 }
    431 
    432 void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
    433     IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
    434     Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
    435     Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    436     Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
    437     generateProcessingLoop(b, delMask, true);
    438 }
    439 
    440 /*
    441 What this function does in pseudo code:
    442 for (mSwizzleFactor)
    443     create a swizzle set containing mSwizzleFactor blocks
    444     apply PEXT to each block in the swizzle set
    445     store the swizzleSet in PEXTedSwizzleSets vector
    446 
    447 for (each swizzle row i)
    448     for (each swizzle set j)
    449         processes row i in swizzle set j
    450         store output in pendingData[j]
    451 */
    452 
    453 void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
    454 
    455     // selectors marks the positions we want to keep
    456     Value * const selectors = b->CreateNot(delMask);
    457 
    458     const auto swizzleSets = makeSwizzleSets(b, selectors);
    459 
    460     // Compress the PEXTedSwizzleSets
    461     // Output is written and committed to the output buffer one swizzle at a time.
     472
     473    BasicBlock * const entry = b->GetInsertBlock();
     474    BasicBlock * const beginLoop = b->CreateBasicBlock("beginLoop");
     475
     476    ConstantInt * const ZERO = b->getSize(0);
    462477    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    463478    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
     
    467482
    468483    // All output groups have the same count.
    469     Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
    470     outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
    471     Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
    472     Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
     484    Value * const baseOutputProduced = b->getProducedItemCount("outputSwizzle0");
     485    Value * const baseProducedOffset = b->CreateAnd(baseOutputProduced, BLOCK_WIDTH_MASK);
    473486
    474487    // There is a separate vector of pending data for each swizzle group.
    475     std::vector<Value *> pendingData;
     488    std::vector<Value *> pendingData(mSwizzleSetCount);
    476489    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    477         pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
    478     }
     490        pendingData[i] = b->getScalarField("pendingSwizzleData" + std::to_string(i));
     491    }
     492    b->CreateBr(beginLoop);
     493
     494    b->SetInsertPoint(beginLoop);
     495    PHINode * const strideIndex = b->CreatePHI(numOfBlocks->getType(), 2);
     496    strideIndex->addIncoming(ZERO, entry);
     497    PHINode * const producedOffsetPhi = b->CreatePHI(numOfBlocks->getType(), 2);
     498    producedOffsetPhi->addIncoming(baseProducedOffset, entry);
     499    std::vector<PHINode *> pendingDataPhi(mSwizzleSetCount);
     500    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     501        pendingDataPhi[i] = b->CreatePHI(pendingData[i]->getType(), 2);
     502        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
     503        pendingData[i] = pendingDataPhi[i];
     504    }
     505
     506    Value * const selectors = b->loadInputStreamBlock("selectors", strideIndex);
     507
     508    const auto swizzleSets = makeSwizzleSets(b, selectors, strideIndex);
    479509
    480510    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
     511
     512    // Compress the PEXTedSwizzleSets
     513    // Output is written and committed to the output buffer one swizzle at a time.
     514    Value * producedOffset = producedOffsetPhi;
    481515
    482516    // For each row i
     
    485519        // Generate code for each of the mSwizzleFactor fields making up a block.
    486520        // We load the count for the field and process all swizzle groups accordingly.
    487         Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
     521        Value * const pendingOffset = b->CreateAnd(producedOffset, PEXT_WIDTH_MASK);
    488522        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
    489523        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
    490524        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
    491525
     526        Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
     527        Value * const spaceVector = b->simd_fill(mPEXTWidth, pendingSpace);
     528
     529        Value * const outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
    492530        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
    493531        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
     
    498536            Value * const newItems = swizzleSets[j][i];
    499537            // Combine as many of the new items as possible into the pending group.
    500             Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
    501538            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
    502539            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
    503540            // To avoid an unpredictable branch, always store the combined group, whether full or not.
    504             Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
    505             b->CreateBlockAlignedStore(combinedGroup, outputPtr);
     541            b->storeOutputStreamBlock("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset, combinedGroup);
    506542            // Any items in excess of the space available in the current pending group overflow for the next group.
    507             Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
     543            Value * overFlowGroup = b->CreateLShr(newItems, spaceVector);
    508544            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
    509545            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
    510546        }
    511 
    512         Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
    513         outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
    514 
    515         outputProduced = b->CreateAdd(outputProduced, newItemCount);
    516     }
    517 
    518     if (flush) { // incase we selected the overflow group on the final iteration
    519         Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
    520         Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
    521         for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    522             Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
    523             b->CreateBlockAlignedStore(pendingData[i], outputPtr);
    524         }
    525     } else {
    526         for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    527             b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
    528         }
    529         Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
    530         b->setScalarField("pendingOffset", pendingOffset);
    531         // unless this is our final stride, don't report partially written fields.
    532         outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
    533     }
    534 
    535     b->setProducedItemCount("outputSwizzle0", outputProduced);
     547        producedOffset = b->CreateAdd(producedOffset, newItemCount);
     548    }
     549
     550    BasicBlock * const finishedLoop = b->CreateBasicBlock("finishedLoop");
     551    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
     552    BasicBlock * const loopEndBlock = b->GetInsertBlock();
     553    strideIndex->addIncoming(nextStrideIndex, loopEndBlock);
     554    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     555        pendingDataPhi[i]->addIncoming(pendingData[i], loopEndBlock);
     556    }
     557    producedOffsetPhi->addIncoming(producedOffset, loopEndBlock);
     558    Value * const doneLoop = b->CreateICmpEQ(nextStrideIndex, numOfBlocks);
     559
     560    b->CreateUnlikelyCondBr(doneLoop, finishedLoop, beginLoop);
     561
     562    b->SetInsertPoint(finishedLoop);
     563    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     564        b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
     565    }
    536566}
    537567
     
    581611*/
    582612
    583 std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
     613SwizzledDeleteByPEXTkernel::SwizzleSets SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors, Value * const strideIndex) {
    584614
    585615    Constant * pext = nullptr;
     
    598628    }
    599629
    600     std::vector<std::vector<Value *>> swizzleSets;
     630    SwizzleSets swizzleSets;
    601631    swizzleSets.reserve(mSwizzleSetCount);
    602632
     
    612642            const unsigned k = (i * mSwizzleFactor) + j;
    613643            if (k < mStreamCount) {
    614                 input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
     644                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k), strideIndex), vecTy);
    615645            } else {
    616646                input[j] = Constant::getNullValue(vecTy);
     
    633663            }
    634664        }
    635 
    636665        swizzleSets.emplace_back(output);
    637666    }
     
    639668    return swizzleSets;
    640669}
     670
    641671
    642672// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
     
    694724, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
    695725, mPEXTWidth(PEXT_width) {
    696     mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
    697     mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
     726    mOutputStreamSets.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
     727    mOutputStreamSets.emplace_back(b->getStreamSetTy(), "deletionCounts");
    698728}
    699729
     
    719749                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
    720750, mBitStreamCount(bitStreamCount)
    721     , mFieldWidth(fieldWidth)
    722     , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
    723     , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
    724         assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
    725         assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
    726         mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
    727         mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    728         addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
    729         for (unsigned i = 1; i < mSwizzleSetCount; i++) {
    730             mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
    731             mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
    732             addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    733         }
    734         addScalar(kb->getSizeTy(), "pendingOffset");
     751, mFieldWidth(fieldWidth)
     752, mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
     753, mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
     754    assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
     755    assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
     756    mInputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
     757    mOutputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
     758    addInternalScalar(kb->getBitBlockType(), "pendingSwizzleData0");
     759    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
     760        mInputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
     761        mOutputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
     762        addInternalScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
     763    }
     764    addInternalScalar(kb->getSizeTy(), "pendingOffset");
    735765}
    736766   
     
    766796        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
    767797       
     798        Value * const fieldWidths = kb->simd_fill(mFieldWidth, pendingOffset);
     799
    768800        // Data from the ith swizzle pack of each group is processed
    769801        // according to the same newItemCount, pendingSpace, ...
     
    771803            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
    772804            // Combine as many of the new items as possible into the pending group.
    773             Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
     805            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, fieldWidths));
    774806            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
    775807            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
     
    783815    }
    784816    kb->setScalarField("pendingOffset", pendingOffset);
    785     Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
    786     Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
     817//    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
     818//    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    787819    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    788820        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    789821    }
    790     kb->setProducedItemCount("outputSwizzle0", produced);
     822//    kb->setProducedItemCount("outputSwizzle0", produced);
    791823}
    792824
     
    799831    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    800832    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
    801     Value * pendingOffset = kb->getScalarField("pendingOffset");
     833//    Value * pendingOffset = kb->getScalarField("pendingOffset");
    802834
    803835    // Write the pending data.
     
    807839        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
    808840    }
    809     kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
    810 }
    811    
    812 void StreamFilterCompiler::makeCall(parabix::StreamSetBuffer * mask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
    813     if (mBufferBlocks == 0) {
    814         llvm::report_fatal_error("StreamFilterCompiler needs a non-zero bufferBlocks parameter (for now).");
    815     }
    816     auto & iBuilder = mDriver.getBuilder();
    817     unsigned N = IDISA::getNumOfStreams(ssType);
    818     if (IDISA::getStreamFieldWidth(ssType) != 1) {
    819         llvm::report_fatal_error("StreamFilterCompiler only compresses bit streams (for now)");
    820     }
    821     Kernel * compressK = nullptr;
    822     if (AVX2_available()) {
    823         compressK = mDriver.addKernelInstance<PEXTFieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
    824     } else {
    825         compressK = mDriver.addKernelInstance<FieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
    826     }
    827     parabix::StreamSetBuffer * compressedFields = mDriver.addBuffer<parabix::StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(N), mBufferBlocks);
    828 #ifndef STREAM_COMPRESS_USING_EXTRACTION_MASK
    829     parabix::StreamSetBuffer * unitCounts = mDriver.addBuffer<parabix::StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(), mBufferBlocks);
    830    
    831     mDriver.makeKernelCall(compressK, {inputs, mask}, {compressedFields, unitCounts});
    832    
    833     Kernel * streamK = mDriver.addKernelInstance<StreamCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
    834     mDriver.makeKernelCall(streamK, {compressedFields, unitCounts}, {outputs});
    835 #else
    836     mDriver.makeKernelCall(compressK, {inputs, mask}, {compressedFields});
    837    
    838     Kernel * streamK = mDriver.addKernelInstance<StreamCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
    839     mDriver.makeKernelCall(streamK, {compressedFields, mask}, {outputs});
    840 #endif
    841 }
    842 
    843 
    844 }
     841//    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
     842}
     843
     844}
Note: See TracChangeset for help on using the changeset viewer.