Changeset 5355


Ignore:
Timestamp:
Mar 2, 2017, 12:18:43 PM (2 years ago)
Author:
cameron
Message:

Swizzled bitstream deletion and -enable-AVX-deletion in u8u16

Location:
icGREP/icgrep-devel/icgrep
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/icgrep.cpp

    r5344 r5355  
    6262static cl::alias FileNamesAlias("files-with-matches", cl::desc("Alias for -l"), cl::aliasopt(FileNamesOnly));
    6363
    64 static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
     64static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of nonmatching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
    6565static cl::alias NonMatchingFileNamesAlias("files-without-match", cl::desc("Alias for -L"), cl::aliasopt(NonMatchingFileNamesOnly));
    6666
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5347 r5355  
    11/*
    2  *  Copyright (c) 2016 International Characters.
     2 *  Copyright (c) 2017 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 */
     
    9393}
    9494
    95    
     95
     96
    9697const unsigned PEXT_width = 64;
    9798
     
    134135        storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    135136    }
    136     Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     137    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     138    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    137139    storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    138140}
     
    149151        storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    150152    }
    151     Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     153    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     154    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    152155    storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    153156}
     
    165168}
    166169
    167 }
     170   
     171//
     172// This kernel performs final stream compression for a set of N bitstreams, given
     173// (a) a set of bitstreams partially compressed within K-bit fields and stored
     174//     in K-bit swizzled form, and
     175// (b) a stream of deletion/extraction counts per K-bit stride.
     176//
     177// Restrictions:  At present, only K=64 is supported.
     178//                At present, N must be an exact multiple of BLOCK_SIZE/K.
     179//
     180// The kernel always consumes full blocks of input and emits data into the output
     181// buffer in swizzles of K items at a time.   Upon completion of a segment,
     182// up to K-1 pending output items per stream may be stored in the kernel state.
     183//
     184// Note that both input streams and output streams are stored in swizzled form.
     185//
     186
     187   
     188
     189SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(IDISA::IDISA_Builder * iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
     190    : BlockOrientedKernel(iBuilder, "swizzled_compress", {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
     191, mBitStreamCount(bitStreamCount)
     192    , mFieldWidth(fieldWidth)
     193    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
     194    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
     195        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
     196        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
     197        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     198            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
     199            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), MaxRatio(1)});
     200            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
     201        }
     202        addScalar(iBuilder->getSizeTy(), "pendingOffset");
     203}
     204
     205   
     206void SwizzledBitstreamCompressByCount::generateDoBlockMethod() {
     207       
     208    Value * countStreamPtr = iBuilder->CreateBitCast(getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0)), iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
     209   
     210    // Output is written and committed to the output buffer one swizzle at a time.
     211    //
     212    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
     213    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
     214   
     215    Value * outputProduced = getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
     216    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
     217    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
     218
     219    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
     220    Value * pendingOffset = getScalarField("pendingOffset");
     221    // There is a separate vector of pending data for each swizzle group.
     222    std::vector<Value *> pendingData;
     223    std::vector<Value *> outputStreamPtr;
     224    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     225        pendingData.push_back(getScalarField("pendingSwizzleData" + std::to_string(i)));
     226        outputStreamPtr.push_back(getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
     227    }
     228   
     229    // Generate code for each of the mSwizzleFactor fields making up a block.
     230    // We load the count for the field and process all swizzle groups accordingly.
     231    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     232        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
     233        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
     234        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
     235       
     236        // Data from the ith swizzle pack of each group is processed
     237        // according to the same newItemCount, pendingSpace, ...
     238        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
     239            Value * newItems = loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
     240            // Combine as many of the new items as possible into the pending group.
     241            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
     242            // To avoid an unpredictable branch, always store the combined group, whether full or not.
     243               
     244            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
     245            // Any items in excess of the space available in the current pending group overflow for the next group.
     246            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
     247            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
     248            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
     249        }
     250        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
     251        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
     252    }
     253    setScalarField("pendingOffset", pendingOffset);
     254   
     255    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
     256    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
     257    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
     258        setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
     259        setProducedItemCount("outputSwizzle" + std::to_string(j), produced);
     260    }
     261}
     262
     263void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(Value * remainingBytes) {
     264    CreateDoBlockMethodCall();
     265    // All the data has been written.  Update the count to include pending data.
     266    Value * pendingOffset = getScalarField("pendingOffset");
     267    Value * produced = iBuilder->CreateAdd(pendingOffset, getProducedItemCount("outputSwizzle0"));
     268    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
     269        setProducedItemCount("outputSwizzle" + std::to_string(j), produced);
     270    }
     271}
     272}
  • icGREP/icgrep-devel/icgrep/kernels/deletion.h

    r5313 r5355  
    5353    const unsigned mStreamCount;
    5454};
     55   
     56class SwizzledBitstreamCompressByCount : public BlockOrientedKernel {
     57public:
     58   
     59    SwizzledBitstreamCompressByCount(IDISA::IDISA_Builder * iBuilder, unsigned bitStreamCount, unsigned fieldWidth = 64);
     60   
     61protected:
     62   
     63    void generateDoBlockMethod() override;
     64    void generateFinalBlockMethod(llvm::Value * remainingBytes) override;
     65   
     66private:
     67    const unsigned mBitStreamCount;
     68    const unsigned mFieldWidth;
     69    const unsigned mSwizzleFactor;
     70    const unsigned mSwizzleSetCount;
     71};
     72
     73   
    5574}
    5675   
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5353 r5355  
    264264
    265265void KernelBuilder::setProducedItemCount(Value * instance, const std::string & name, Value * value) const {
     266    //iBuilder->CallPrintInt(mKernelName + "_" + name + "_produced_count", value);
    266267    setScalarField(instance, name + PRODUCED_ITEM_COUNT_SUFFIX, value);
    267268}
     
    568569    std::vector<Value *> priorProduced;
    569570    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    570         if (isa<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
     571        if (isa<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]) || isa<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
    571572            priorProduced.push_back(getProducedItemCount(mStreamSetOutputs[i].name));
    572573        }
     
    577578    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    578579        unsigned priorIdx = 0;
     580        Value * log2BlockSize = iBuilder->getSize(std::log2(iBuilder->getBitBlockWidth()));
     581        if (auto cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
     582            BasicBlock * copyBack = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
     583            BasicBlock * done = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
     584            Value * newlyProduced = iBuilder->CreateSub(getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
     585            Value * priorBlock = iBuilder->CreateLShr(priorProduced[priorIdx], log2BlockSize);
     586            Value * priorOffset = iBuilder->CreateAnd(priorProduced[priorIdx], iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
     587            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(priorBlock);
     588            Value * accessible = iBuilder->CreateSub(iBuilder->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
     589            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
     590            iBuilder->CreateCondBr(wraparound, copyBack, done);
     591            iBuilder->SetInsertPoint(copyBack);
     592            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
     593            cb->createCopyBack(getStreamSetBufferPtr(mStreamSetOutputs[i].name), copyItems);
     594            iBuilder->CreateBr(done);
     595            iBuilder->SetInsertPoint(done);
     596            priorIdx++;
     597        }
    579598        if (auto cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
    580599            BasicBlock * copyBack = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
  • icGREP/icgrep-devel/icgrep/kernels/streamset.cpp

    r5353 r5355  
    111111}
    112112
     113Value * StreamSetBuffer::getLinearlyAccessibleBlocks(llvm::Value * fromBlock) const {
     114    Constant * bufBlocks = iBuilder->getSize(mBufferBlocks);
     115    return iBuilder->CreateSub(bufBlocks, iBuilder->CreateURem(fromBlock, bufBlocks));
     116}
     117
    113118
    114119// Single Block Buffer
     
    153158
    154159void CircularCopybackBuffer::createCopyBack(Value * self, Value * overFlowItems) const {
     160    Type * size_ty = iBuilder->getSizeTy();
     161    Type * i8ptr = iBuilder->getInt8PtrTy();
     162    Constant * blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
    155163    Function * f = iBuilder->GetInsertBlock()->getParent();
    156164    BasicBlock * wholeBlockCopy = BasicBlock::Create(iBuilder->getContext(), "wholeBlockCopy", f, 0);
    157165    BasicBlock * partialBlockCopy = BasicBlock::Create(iBuilder->getContext(), "partialBlockCopy", f, 0);
    158166    BasicBlock * copyBackDone = BasicBlock::Create(iBuilder->getContext(), "copyBackDone", f, 0);
    159     Type * i8ptr = iBuilder->getInt8PtrTy();
    160167    unsigned numStreams = getType()->getArrayNumElements();
    161168    auto elemTy = getType()->getArrayElementType();
    162169    unsigned fieldWidth = isa<ArrayType>(elemTy) ? elemTy->getArrayNumElements() : 1;
    163     Constant * blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
    164170    Value * overFlowAreaPtr = iBuilder->CreateGEP(self, iBuilder->getSize(mBufferBlocks));
    165171    Value * overFlowBlocks = iBuilder->CreateUDiv(overFlowItems, blockSize);
    166172    Value * partialItems = iBuilder->CreateURem(overFlowItems, blockSize);
     173    Value * partialBlockTargetPtr = iBuilder->CreateGEP(self, overFlowBlocks);
     174    Value * partialBlockSourcePtr = iBuilder->CreateGEP(overFlowAreaPtr, overFlowBlocks);
    167175    iBuilder->CreateCondBr(iBuilder->CreateICmpUGT(overFlowBlocks, iBuilder->getSize(0)), wholeBlockCopy, partialBlockCopy);
    168176    iBuilder->SetInsertPoint(wholeBlockCopy);
    169177    unsigned alignment = iBuilder->getBitBlockWidth() / 8;
    170     Constant * blockBytes = iBuilder->getSize(fieldWidth * iBuilder->getBitBlockWidth()/8);
    171     Value * copyLength = iBuilder->CreateMul(overFlowBlocks, blockBytes);
     178    Value * copyLength = iBuilder->CreateSub(iBuilder->CreatePtrToInt(partialBlockTargetPtr, size_ty), iBuilder->CreatePtrToInt(self, size_ty));
    172179    iBuilder->CreateMemMove(iBuilder->CreateBitCast(self, i8ptr), iBuilder->CreateBitCast(overFlowAreaPtr, i8ptr), copyLength, alignment);
    173180    iBuilder->CreateCondBr(iBuilder->CreateICmpUGT(partialItems, iBuilder->getSize(0)), partialBlockCopy, copyBackDone);
    174181    iBuilder->SetInsertPoint(partialBlockCopy);
    175     Value * partialBlockTargetPtr = iBuilder->CreateGEP(self, overFlowBlocks);
    176     Value * partialBlockSourcePtr = iBuilder->CreateGEP(overFlowAreaPtr, overFlowBlocks);
    177182    Value * copyBits = iBuilder->CreateMul(overFlowItems, iBuilder->getSize(fieldWidth));
    178183    Value * copyBytes = iBuilder->CreateLShr(iBuilder->CreateAdd(copyBits, iBuilder->getSize(7)), iBuilder->getSize(3));
     
    189194    return iBuilder->CreateGEP(self, modByBufferBlocks(blockIndex));
    190195}
     196
     197// SwizzledCopybackBuffer Buffer
     198
     199void SwizzledCopybackBuffer::allocateBuffer() {
     200    mStreamSetBufferPtr = iBuilder->CreateCacheAlignedAlloca(getType(), iBuilder->getSize(mBufferBlocks + mOverflowBlocks));
     201}
     202
     203void SwizzledCopybackBuffer::createCopyBack(Value * self, Value * overFlowItems) const {
     204    Type * size_ty = iBuilder->getSizeTy();
     205    Type * i8ptr = iBuilder->getInt8PtrTy();
     206    Constant * blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
     207    Function * f = iBuilder->GetInsertBlock()->getParent();
     208    BasicBlock * wholeBlockCopy = BasicBlock::Create(iBuilder->getContext(), "wholeBlockCopy", f, 0);
     209    BasicBlock * partialBlockCopy = BasicBlock::Create(iBuilder->getContext(), "partialBlockCopy", f, 0);
     210    BasicBlock * copyBackDone = BasicBlock::Create(iBuilder->getContext(), "copyBackDone", f, 0);
     211    unsigned numStreams = getType()->getArrayNumElements();
     212    unsigned swizzleFactor = iBuilder->getBitBlockWidth()/mFieldWidth;
     213    auto elemTy = getType()->getArrayElementType();
     214    unsigned fieldWidth = isa<ArrayType>(elemTy) ? elemTy->getArrayNumElements() : 1;
     215    Value * overFlowAreaPtr = iBuilder->CreateGEP(self, iBuilder->getSize(mBufferBlocks));
     216    Value * overFlowBlocks = iBuilder->CreateUDiv(overFlowItems, blockSize);
     217    Value * partialItems = iBuilder->CreateURem(overFlowItems, blockSize);
     218    Value * partialBlockTargetPtr = iBuilder->CreateGEP(self, overFlowBlocks);
     219    Value * partialBlockSourcePtr = iBuilder->CreateGEP(overFlowAreaPtr, overFlowBlocks);
     220    iBuilder->CreateCondBr(iBuilder->CreateICmpUGT(overFlowBlocks, iBuilder->getSize(0)), wholeBlockCopy, partialBlockCopy);
     221    iBuilder->SetInsertPoint(wholeBlockCopy);
     222    unsigned alignment = iBuilder->getBitBlockWidth() / 8;
     223    Value * copyLength = iBuilder->CreateSub(iBuilder->CreatePtrToInt(partialBlockTargetPtr, size_ty), iBuilder->CreatePtrToInt(self, size_ty));
     224    iBuilder->CreateMemMove(iBuilder->CreateBitCast(self, i8ptr), iBuilder->CreateBitCast(overFlowAreaPtr, i8ptr), copyLength, alignment);
     225    iBuilder->CreateCondBr(iBuilder->CreateICmpUGT(partialItems, iBuilder->getSize(0)), partialBlockCopy, copyBackDone);
     226    iBuilder->SetInsertPoint(partialBlockCopy);
     227    Value * copyBits = iBuilder->CreateMul(overFlowItems, iBuilder->getSize(fieldWidth * swizzleFactor));
     228    Value * copyBytes = iBuilder->CreateLShr(iBuilder->CreateAdd(copyBits, iBuilder->getSize(7)), iBuilder->getSize(3));
     229    for (unsigned strm = 0; strm < numStreams; strm += swizzleFactor) {
     230        Value * strmTargetPtr = iBuilder->CreateGEP(partialBlockTargetPtr, {iBuilder->getInt32(0), iBuilder->getInt32(strm)});
     231        Value * strmSourcePtr = iBuilder->CreateGEP(partialBlockSourcePtr, {iBuilder->getInt32(0), iBuilder->getInt32(strm)});
     232        iBuilder->CreateMemMove(iBuilder->CreateBitCast(strmTargetPtr, i8ptr), iBuilder->CreateBitCast(strmSourcePtr, i8ptr), copyBytes, alignment);
     233    }
     234    iBuilder->CreateBr(copyBackDone);
     235    iBuilder->SetInsertPoint(copyBackDone);
     236}
     237
     238Value * SwizzledCopybackBuffer::getStreamSetBlockPtr(Value * self, Value * blockIndex) const {
     239    assert (blockIndex->getType()->isIntegerTy());
     240   
     241    Value * offset = nullptr;
     242    if (mBufferBlocks == 1) {
     243        offset = ConstantInt::getNullValue(iBuilder->getSizeTy());
     244    } else if ((mBufferBlocks & (mBufferBlocks - 1)) == 0) { // is power of 2
     245        offset = iBuilder->CreateAnd(blockIndex, ConstantInt::get(blockIndex->getType(), mBufferBlocks - 1));
     246    } else {
     247        offset = iBuilder->CreateURem(blockIndex, ConstantInt::get(blockIndex->getType(), mBufferBlocks));
     248    }
     249    return iBuilder->CreateGEP(self, offset);
     250}
     251
     252SwizzledCopybackBuffer::SwizzledCopybackBuffer(IDISA::IDISA_Builder * b, llvm::Type * type, size_t bufferBlocks, size_t overflowBlocks, unsigned fieldwidth, unsigned AddressSpace)
     253: StreamSetBuffer(BufferKind::SwizzledCopybackBuffer, b, type, resolveStreamSetType(b, type), bufferBlocks, AddressSpace), mOverflowBlocks(overflowBlocks), mFieldWidth(fieldwidth) {
     254   
     255}
     256
     257
    191258
    192259// Expandable Buffer
  • icGREP/icgrep-devel/icgrep/kernels/streamset.h

    r5353 r5355  
    1919public:
    2020
    21     enum class BufferKind : unsigned {BlockBuffer, ExternalFileBuffer, CircularBuffer, CircularCopybackBuffer, ExpandableBuffer};
     21    enum class BufferKind : unsigned {BlockBuffer, ExternalFileBuffer, CircularBuffer, CircularCopybackBuffer, SwizzledCopybackBuffer, ExpandableBuffer};
    2222
    2323    BufferKind getBufferKind() const {
     
    5757    // The number of items that can be linearly accessed from a given logical stream position.
    5858    virtual llvm::Value * getLinearlyAccessibleItems(llvm::Value * fromPosition) const;
    59    
     59    virtual llvm::Value * getLinearlyAccessibleBlocks(llvm::Value * fromBlock) const;
    6060protected:
    6161
     
    143143    void createCopyBack(llvm::Value * self, llvm::Value * overflowItems) const;
    144144   
    145    
     145
     146       
     147       
    146148protected:
    147149    llvm::Value * getStreamSetBlockPtr(llvm::Value * self, llvm::Value * blockIndex) const override;
     
    149151    size_t mOverflowBlocks;
    150152
     153};
     154   
     155class SwizzledCopybackBuffer : public StreamSetBuffer {
     156public:
     157    static inline bool classof(const StreamSetBuffer * b) {return b->getBufferKind() == BufferKind::SwizzledCopybackBuffer;}
     158   
     159    SwizzledCopybackBuffer(IDISA::IDISA_Builder * b, llvm::Type * type, size_t bufferBlocks, size_t overflowBlocks, unsigned fieldwidth = 64, unsigned AddressSpace = 0);
     160   
     161    void allocateBuffer() override;
     162   
     163    // Generate copyback code for the given number of overflowItems.
     164    void createCopyBack(llvm::Value * self, llvm::Value * overflowItems) const;
     165protected:
     166    llvm::Value * getStreamSetBlockPtr(llvm::Value * self, llvm::Value * blockIndex) const override;
     167private:
     168    size_t mOverflowBlocks;
     169    unsigned mFieldWidth;
     170   
    151171};
    152172
  • icGREP/icgrep-devel/icgrep/u8u16.cpp

    r5310 r5355  
    99#include <cc/cc_compiler.h>                        // for CC_Compiler
    1010#include <kernels/deletion.h>                      // for DeletionKernel
     11#include <kernels/swizzle.h>                      // for SwizzleGenerator
    1112#include <kernels/mmap_kernel.h>                   // for MMapSourceKernel
    1213#include <kernels/p2s_kernel.h>                    // for P2S16KernelWithCom...
     
    4647static cl::opt<std::string> outputFile(cl::Positional, cl::desc("<output file>"),  cl::Required, cl::cat(u8u16Options));
    4748static cl::opt<bool> segmentPipelineParallel("enable-segment-pipeline-parallel", cl::desc("Enable multithreading with segment pipeline parallelism."), cl::cat(u8u16Options));
     49static cl::opt<bool> enableAVXdel("enable-AVX-deletion", cl::desc("Enable AVX2 deletion algorithms."), cl::cat(u8u16Options));
    4850static cl::opt<bool> mMapBuffering("mmap-buffering", cl::desc("Enable mmap buffering."), cl::cat(u8u16Options));
    4951static cl::opt<bool> memAlignBuffering("memalign-buffering", cl::desc("Enable posix_memalign buffering."), cl::cat(u8u16Options));
     
    253255}
    254256
    255 Function * u8u16Pipeline(Module * mod, IDISA::IDISA_Builder * iBuilder) {
     257Function * u8u16PipelineAVX2(Module * mod, IDISA::IDISA_Builder * iBuilder) {
    256258
    257259    const unsigned segmentSize = codegen::SegmentSize;
     
    277279    fileSize->setName("fileSize");
    278280
     281    // File data from mmap
    279282    ExternalFileBuffer ByteStream(iBuilder, iBuilder->getStreamSetTy(1, 8));
    280283
     284    MMapSourceKernel mmapK(iBuilder, segmentSize);
     285    mmapK.generateKernel({}, {&ByteStream});
     286    mmapK.setInitialArguments({fileSize});
     287   
     288    // Transposed bits from s2p
    281289    CircularBuffer BasisBits(iBuilder, iBuilder->getStreamSetTy(8), segmentSize * bufferSegments);
    282290
     291    S2PKernel s2pk(iBuilder);
     292    s2pk.generateKernel({&ByteStream}, {&BasisBits});
     293   
     294    // Calculate UTF-16 data bits through bitwise logic on u8-indexed streams.
    283295    CircularBuffer U8u16Bits(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments);
    284296    CircularBuffer DelMask(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments);
    285297    CircularBuffer ErrorMask(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments);
    286298
     299    PabloKernel u8u16k(iBuilder, "u8u16",
     300                       {Binding{iBuilder->getStreamSetTy(8, 1), "u8bit"}},
     301                       {Binding{iBuilder->getStreamSetTy(16, 1), "u16bit"},
     302                           Binding{iBuilder->getStreamSetTy(1, 1), "delMask"},
     303                           Binding{iBuilder->getStreamSetTy(1, 1), "errMask"}}, {});
     304   
     305    u8u16_pablo(&u8u16k);
     306    u8u16k.generateKernel({&BasisBits}, {&U8u16Bits, &DelMask, &ErrorMask});
     307   
     308   
     309    // Apply a deletion algorithm to discard all but the final position of the UTF-8
     310    // sequences for each UTF-16 code unit.
     311    CircularBuffer u16CompressedInFields(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments);
     312    CircularBuffer DeletionCounts(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments);
     313
     314    DeleteByPEXTkernel delK(iBuilder, 64, 16);
     315    delK.generateKernel({&U8u16Bits, &DelMask}, {&u16CompressedInFields, &DeletionCounts});
     316   
     317    // Swizzle for sequential compression within SIMD lanes.
     318    CircularBuffer SwizzleFields0(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments);
     319    CircularBuffer SwizzleFields1(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments);
     320    CircularBuffer SwizzleFields2(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments);
     321    CircularBuffer SwizzleFields3(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments);
     322    SwizzleGenerator swizzleK(iBuilder, 16, 4, 1);
     323    swizzleK.generateKernel({&u16CompressedInFields}, {&SwizzleFields0, &SwizzleFields1, &SwizzleFields2, &SwizzleFields3});
     324   
     325    //  Produce fully compressed swizzled UTF-16 bit streams
     326    SwizzledCopybackBuffer u16Swizzle0(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1);
     327    SwizzledCopybackBuffer u16Swizzle1(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1);
     328    SwizzledCopybackBuffer u16Swizzle2(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1);
     329    SwizzledCopybackBuffer u16Swizzle3(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1);
     330    //
     331    SwizzledBitstreamCompressByCount compressK(iBuilder, 16);
     332    compressK.generateKernel({&DeletionCounts, &SwizzleFields0, &SwizzleFields1, &SwizzleFields2, &SwizzleFields3},
     333                             {&u16Swizzle0, &u16Swizzle1, &u16Swizzle2, &u16Swizzle3});
     334   
     335    // Produce unswizzled UTF-16 bit streams
     336    //
     337    CircularBuffer u16bits(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments);
     338    SwizzleGenerator unSwizzleK(iBuilder, 16, 1, 4);
     339    unSwizzleK.setName("unswizzle");
     340    unSwizzleK.generateKernel({&u16Swizzle0, &u16Swizzle1, &u16Swizzle2, &u16Swizzle3}, {&u16bits});
     341   
     342    // Different choices for the output buffer depending on chosen option.
     343    ExternalFileBuffer U16external(iBuilder, iBuilder->getStreamSetTy(1, 16));
     344    CircularBuffer U16out(iBuilder, iBuilder->getStreamSetTy(1, 16), segmentSize * bufferSegments);
     345
     346    P2S16Kernel p2sk(iBuilder);
     347
     348    //P2S16KernelWithCompressedOutput p2sk(iBuilder);
     349
     350    FileSink outK(iBuilder, 16);
     351    if (mMapBuffering || memAlignBuffering) {
     352        p2sk.generateKernel({&u16bits}, {&U16external});
     353        outK.generateKernel({&U16external}, {});
     354    } else {
     355        p2sk.generateKernel({&u16bits}, {&U16out});
     356        outK.generateKernel({&U16out}, {});
     357    }
     358   
     359    iBuilder->SetInsertPoint(BasicBlock::Create(mod->getContext(), "entry", main,0));
     360
     361    ByteStream.setStreamSetBuffer(inputStream, fileSize);
     362    BasisBits.allocateBuffer();
     363    U8u16Bits.allocateBuffer();
     364    DelMask.allocateBuffer();
     365    ErrorMask.allocateBuffer();
     366    u16CompressedInFields.allocateBuffer();
     367    DeletionCounts.allocateBuffer();
     368    SwizzleFields0.allocateBuffer();
     369    SwizzleFields1.allocateBuffer();
     370    SwizzleFields2.allocateBuffer();
     371    SwizzleFields3.allocateBuffer();
     372    u16Swizzle0.allocateBuffer();
     373    u16Swizzle1.allocateBuffer();
     374    u16Swizzle2.allocateBuffer();
     375    u16Swizzle3.allocateBuffer();
     376    u16bits.allocateBuffer();
     377    if (mMapBuffering || memAlignBuffering) {
     378        U16external.setEmptyBuffer(outputStream);
     379    } else {
     380        U16out.allocateBuffer();
     381    }
     382    Value * fName = iBuilder->CreatePointerCast(iBuilder->CreateGlobalString(outputFile.c_str()), iBuilder->getInt8PtrTy());
     383    outK.setInitialArguments({fName});
     384
     385    if (segmentPipelineParallel){
     386        generateSegmentParallelPipeline(iBuilder, {&mmapK, &s2pk, &u8u16k, &delK, &swizzleK, &compressK, &unSwizzleK, &p2sk, &outK});
     387    } else {
     388        generatePipelineLoop(iBuilder, {&mmapK, &s2pk, &u8u16k, &delK, &swizzleK, &compressK, &unSwizzleK, &p2sk, &outK});
     389    }
     390
     391    iBuilder->CreateRetVoid();
     392    return main;
     393}
     394
     395
     396Function * u8u16Pipeline(Module * mod, IDISA::IDISA_Builder * iBuilder) {
     397   
     398    const unsigned segmentSize = codegen::SegmentSize;
     399    const unsigned bufferSegments = codegen::ThreadNum+1;
     400   
     401    assert (iBuilder);
     402   
     403    Type * const size_ty = iBuilder->getSizeTy();
     404    Type * const voidTy = iBuilder->getVoidTy();
     405    Type * const bitBlockType = iBuilder->getBitBlockType();
     406    Type * const inputType = ArrayType::get(ArrayType::get(bitBlockType, 8), 1)->getPointerTo();
     407    Type * const outputType = ArrayType::get(ArrayType::get(bitBlockType, 16), 1)->getPointerTo();
     408   
     409    Function * const main = cast<Function>(mod->getOrInsertFunction("Main", voidTy, inputType, outputType, size_ty, nullptr));
     410    main->setCallingConv(CallingConv::C);
     411    Function::arg_iterator args = main->arg_begin();
     412   
     413    Value * const inputStream = &*(args++);
     414    inputStream->setName("inputStream");
     415    Value * const outputStream = &*(args++);
     416    outputStream->setName("outputStream");
     417    Value * const fileSize = &*(args++);
     418    fileSize->setName("fileSize");
     419   
     420    ExternalFileBuffer ByteStream(iBuilder, iBuilder->getStreamSetTy(1, 8));
     421   
     422    CircularBuffer BasisBits(iBuilder, iBuilder->getStreamSetTy(8), segmentSize * bufferSegments);
     423   
     424    CircularBuffer U8u16Bits(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments);
     425    CircularBuffer DelMask(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments);
     426    CircularBuffer ErrorMask(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments);
     427   
    287428    CircularBuffer U16Bits(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments);
    288429   
    289430    CircularBuffer DeletionCounts(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments);
    290 
     431   
    291432    // Different choices for the output buffer depending on chosen option.
    292433    ExternalFileBuffer U16external(iBuilder, iBuilder->getStreamSetTy(1, 16));
    293434    CircularCopybackBuffer U16out(iBuilder, iBuilder->getStreamSetTy(1, 16), segmentSize * bufferSegments, 1 /*overflow block*/);
    294 
     435   
    295436    MMapSourceKernel mmapK(iBuilder, segmentSize);
    296437    mmapK.generateKernel({}, {&ByteStream});
     
    298439   
    299440    S2PKernel s2pk(iBuilder);
    300 
     441   
    301442    s2pk.generateKernel({&ByteStream}, {&BasisBits});
    302 
     443   
    303444    PabloKernel u8u16k(iBuilder, "u8u16",
    304445                       {Binding{iBuilder->getStreamSetTy(8, 1), "u8bit"}},
    305446                       {Binding{iBuilder->getStreamSetTy(16, 1), "u16bit"},
    306                         Binding{iBuilder->getStreamSetTy(1, 1), "delMask"},
    307                         Binding{iBuilder->getStreamSetTy(1, 1), "errMask"}}, {});
    308 
     447                           Binding{iBuilder->getStreamSetTy(1, 1), "delMask"},
     448                           Binding{iBuilder->getStreamSetTy(1, 1), "errMask"}}, {});
     449   
    309450    u8u16_pablo(&u8u16k);
    310 
     451   
    311452    u8u16k.generateKernel({&BasisBits}, {&U8u16Bits, &DelMask, &ErrorMask});
    312 
     453   
    313454    DeletionKernel delK(iBuilder, iBuilder->getBitBlockWidth()/16, 16);
    314455    delK.generateKernel({&U8u16Bits, &DelMask}, {&U16Bits, &DeletionCounts});
    315 
     456   
    316457    P2S16KernelWithCompressedOutput p2sk(iBuilder);
    317 
     458   
    318459    FileSink outK(iBuilder, 16);
    319460    if (mMapBuffering || memAlignBuffering) {
     
    325466    }
    326467    iBuilder->SetInsertPoint(BasicBlock::Create(mod->getContext(), "entry", main,0));
    327 
     468   
    328469    ByteStream.setStreamSetBuffer(inputStream, fileSize);
    329470    BasisBits.allocateBuffer();
     
    340481    Value * fName = iBuilder->CreatePointerCast(iBuilder->CreateGlobalString(outputFile.c_str()), iBuilder->getInt8PtrTy());
    341482    outK.setInitialArguments({fName});
    342 
     483   
    343484    if (segmentPipelineParallel){
    344485        generateSegmentParallelPipeline(iBuilder, {&mmapK, &s2pk, &u8u16k, &delK, &p2sk, &outK});
     
    346487        generatePipelineLoop(iBuilder, {&mmapK, &s2pk, &u8u16k, &delK, &p2sk, &outK});
    347488    }
    348 
     489   
    349490    iBuilder->CreateRetVoid();
    350491    return main;
    351492}
    352 
    353 
    354493
    355494
     
    364503    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
    365504
    366     llvm::Function * main_IR = u8u16Pipeline(M, idb);
    367    
     505    llvm::Function * main_IR = (enableAVXdel && AVX2_available() && codegen::BlockSize==256) ? u8u16PipelineAVX2(M, idb) : u8u16Pipeline(M, idb);
     506
    368507    verifyModule(*M, &dbgs());
    369508    u8u16Engine = JIT_to_ExecutionEngine(M);   
Note: See TracChangeset for help on using the changeset viewer.