Changeset 6013


Ignore:
Timestamp:
May 4, 2018, 3:27:50 PM (4 months ago)
Author:
cameron
Message:

StreamCompressKernel? initial check-in

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r6006 r6013  
    117117, mStreamCount(streamCount) {
    118118}
     119
     120StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
     121: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
     122                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
     123                       Binding{kb->getStreamSetTy(), "unitCounts"}},
     124                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
     125                   {}, {}, {})
     126, mCompressedFieldWidth(fieldWidth)
     127, mStreamCount(streamCount) {
     128    addScalar(kb->getSizeTy(), "pendingItemCount");
     129    for (unsigned i = 0; i < streamCount; i++) {
     130        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
     131    }
     132
     133}
     134   
     135void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
     136    const unsigned fw = mCompressedFieldWidth;
     137    Type * fwTy = b->getIntNTy(fw);
     138    Type * sizeTy = b->getSizeTy();
     139    const unsigned numFields = b->getBitBlockWidth()/fw;
     140    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
     141    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
     142    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
     143    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
     144    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
     145    BasicBlock * entry = b->GetInsertBlock();
     146    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
     147    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
     148    Constant * const ZERO = b->getSize(0);
     149   
     150    Value * pendingItemCount = b->getScalarField("pendingItemCount");
     151    std::vector<Value *> pendingData(mStreamCount);
     152    for (unsigned i = 0; i < mStreamCount; i++) {
     153        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
     154    }
     155   
     156    b->CreateBr(segmentLoop);
     157    // Main Loop
     158    b->SetInsertPoint(segmentLoop);
     159    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
     160    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
     161    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
     162    PHINode * pendingDataPhi[mStreamCount];
     163    blockOffsetPhi->addIncoming(ZERO, entry);
     164    outputBlockPhi->addIncoming(ZERO, entry);
     165    pendingItemsPhi->addIncoming(pendingItemCount, entry);
     166    for (unsigned i = 0; i < mStreamCount; i++) {
     167        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
     168        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
     169    }
     170    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
     171    // For each field determine the (partial) sum popcount of all fields up to and
     172    // including the current field.
     173    Value * partialSum = fieldPopCounts;
     174    for (unsigned i = 1; i < numFields; i *= 2) {
     175        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
     176    }
     177    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
     178    //
     179    // Now determine for each source field the output offset of the first bit.
     180    // Note that this depends on the number of pending bits.
     181    //
     182    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
     183    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
     184    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
     185    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
     186    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
     187   //
     188    // Determine the relative field number for each output field.   Note that the total
     189    // number of fields involved is numFields + 1.   However, the first field always
     190    // be immediately combined into the current pending data field, so we calculate
     191    // field numbers for all subsequent fields, (the fields that receive overflow bits).
     192    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
     193  //
     194    // Now process the input data block of each stream in the input stream set.
     195    //
     196    // First load all the stream set blocks and the pending data.
     197    std::vector<Value *> sourceBlock(mStreamCount);
     198    for (unsigned i = 0; i < mStreamCount; i++) {
     199        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
     200
     201    }
     202    // Now separate the bits of each field into ones that go into the current field
     203    // and ones that go into the overflow field.   Extract the first field separately,
     204    // and then shift and combine subsequent fields.
     205    std::vector<Value *> pendingOutput(mStreamCount);
     206    std::vector<Value *> outputFields(mStreamCount);
     207    for (unsigned i = 0; i < mStreamCount; i++) {
     208        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
     209        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], b->simd_sub(fw, fwSplat, offsets));
     210        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
     211        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
     212        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
     213        // shift back currentFieldBits to combine with nextFieldBits.
     214        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
     215    }
     216    // We may have filled the current field of the pendingOutput update the
     217    // pendingFieldIndex.
     218    Value * newPendingTotal = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldPopCounts, 0), sizeTy);
     219    pendingFieldIdx = b->CreateUDiv(newPendingTotal, ConstantInt::get(sizeTy, fw));
     220    // Now combine forward all fields with the same field number.  This may require
     221    // up to log2 numFields steps.
     222    for (unsigned j = 1; j < numFields; j*=2) {
     223        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
     224        for (unsigned i = 0; i < mStreamCount; i++) {
     225            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
     226            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
     227        }
     228    }
     229    // Now compress the data fields, eliminating all but the last field from
     230    // each run of consecutive field having the same field number.
     231    // same field number as a subsequent field.
     232    Value * eqNext = b->simd_eq(fw, fieldNo, b->mvmd_srli(fw, fieldNo, 1));
     233    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
     234    for (unsigned i = 0; i < mStreamCount; i++) {
     235        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
     236   }
     237    //
     238    // Finally combine the pendingOutput and outputField data.
     239    // (a) shift forward outputField data to fill the pendingOutput values.
     240    // (b) shift back outputField data to clear data added to pendingOutput.
     241    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
     242    for (unsigned i = 0; i < mStreamCount; i++) {
     243        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
     244        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
     245        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
     246  }
     247    //
     248    // Write the pendingOutput data to outputStream.
     249    // Note: this data may be overwritten later, but we avoid branching.
     250    for (unsigned i = 0; i < mStreamCount; i++) {
     251        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
     252    }
     253    // Now determine the total amount of pending items and whether
     254    // the pending data all fits within the pendingOutput.
     255    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
     256    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
     257    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
     258    //
     259    // Prepare Phi nodes for the next iteration.
     260    //
     261    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
     262    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
     263    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
     264    // But don't advance the output if all the data does fit into pendingOutput.
     265    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
     266    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
     267    pendingItemsPhi->addIncoming(newPending, segmentLoop);
     268
     269    for (unsigned i = 0; i < mStreamCount; i++) {
     270        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
     271        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
     272    }
     273    //
     274    // Now continue the loop if there are more blocks to process.
     275    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
     276    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
     277   
     278    b->SetInsertPoint(segmentDone);
     279    // Save kernel state.
     280    b->setScalarField("pendingItemCount", newPending);
     281    for (unsigned i = 0; i < mStreamCount; i++) {
     282        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
     283    }
     284    Value * produced = b->getProducedItemCount("compressedOutput");
     285    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
     286    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, blockPopCount), produced);
     287    b->setProducedItemCount("compressedOutput", produced);
     288}
     289   
     290   
     291   
    119292
    120293SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
  • icGREP/icgrep-devel/icgrep/kernels/deletion.h

    r6006 r6013  
    3535};
    3636
    37     class FieldCompressKernel final : public MultiBlockKernel {
    38     public:
    39         FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount);
    40         bool isCachable() const override { return true; }
    41         bool hasSignature() const override { return false; }
    42     protected:
    43         void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
    44     private:
    45         const unsigned mCompressFieldWidth;
    46         const unsigned mStreamCount;
    47     };
    48    
     37class FieldCompressKernel final : public MultiBlockKernel {
     38public:
     39    FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount);
     40    bool isCachable() const override { return true; }
     41    bool hasSignature() const override { return false; }
     42protected:
     43    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
     44private:
     45    const unsigned mCompressFieldWidth;
     46    const unsigned mStreamCount;
     47};
     48
     49//
     50//  Given streams that are compreseed within fields, produced fully
     51//  compressed streams.
     52class StreamCompressKernel final : public MultiBlockKernel {
     53public:
     54    StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount);
     55    bool isCachable() const override { return true; }
     56    bool hasSignature() const override { return false; }
     57protected:
     58    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) override;
     59private:
     60    const unsigned mCompressedFieldWidth;
     61    const unsigned mStreamCount;
     62};
    4963
    5064/*
Note: See TracChangeset for help on using the changeset viewer.