Ignore:
Timestamp:
Jun 30, 2017, 2:07:34 PM (2 years ago)
Author:
cameron
Message:

Integrated AVX deletion kernel

Location:
icGREP/icgrep-devel/icgrep
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5440 r5540  
    4646    }
    4747    return field;
     48}
     49
     50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width)
     51: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount),
     52                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"}, Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
     53                  {}, {}, {}, {})
     54, mDelCountFieldWidth(fw)
     55, mStreamCount(streamCount)
     56, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
     57// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
     58// to prevent rounding errors.
     59, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
     60, mPEXTWidth(PEXT_width)
     61{
     62    assert((mDelCountFieldWidth > 0) && ((mDelCountFieldWidth & (mDelCountFieldWidth - 1)) == 0)
     63        && "mDelCountFieldWidth must be a power of 2");
     64    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
     65    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
     66   
     67    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", MaxRatio(1)});
     68    addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
     69    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
     70        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
     71            "outputSwizzle" + std::to_string(i), FixedRatio(1, 1, "outputSwizzle0")});
     72        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
     73    }
     74    addScalar(iBuilder->getSizeTy(), "pendingOffset");
     75}
     76
     77void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     78    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
     79    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
     80    const auto masks = get_PEXT_masks(iBuilder, delMask);
     81    generateProcessingLoop(iBuilder, masks, delMask);
     82}
     83
     84void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
     85    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
     86    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
     87    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
     88    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
     89    const auto masks = get_PEXT_masks(iBuilder, delMask);
     90    generateProcessingLoop(iBuilder, masks, delMask);
     91    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
     92    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
     93   
     94    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
     95    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
     96    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
     97    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
     98
     99    // Write the pending data.
     100    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     101        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
     102        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
     103        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
     104    }
     105    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
     106}
     107
     108std::vector<Value *> SwizzledDeleteByPEXTkernel::get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
     109    // Del mask marks locations of bits we want to delete with 1 bits. Delete marked bits by extracting only the bits not marked in this way.
     110    // Apply the PEXT operation mPEXTWidth bits at a time (e.g. if the block is 256 bits and mPEXTWidth is 64, apply 4 PEXT ops to fully process the block).
     111    Value * m = iBuilder->fwCast(mPEXTWidth, iBuilder->simd_not(del_mask));
     112    std::vector<Value *> masks;
     113    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
     114        masks.push_back(iBuilder->CreateExtractElement(m, i));
     115    }
     116    return masks;
     117}
     118
     119void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
     120                                                Value * delMask) {
     121    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask)); // delMask marks the positions we want to extract
     122    std::vector<Value *> counts;
     123    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/ mPEXTWidth; i++) {
     124        // Store the deletion counts for each PEXT field
     125        counts.push_back(iBuilder->CreateExtractElement(delCount, i)); // Extract field i from SIMD register delCount
     126    }
     127
     128    generatePEXTAndSwizzleLoop(iBuilder, masks, counts);
     129}
     130
     131/*
     132What this function does in pseudo code:
     133for (each of the mSwizzleSetCount swizzle sets)
     134        create a swizzle set containing mSwizzleFactor blocks
     135        apply PEXT to each block in the swizzle set
     136        store the swizzleSet in PEXTedSwizzleSets vector
     137       
     138for (each swizzle row i)
     139        for (each swizzle set j)
     140                processes row i in swizzle set j
     141                store output in pendingData[j]
     142*/
     143void SwizzledDeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
     144                                                    std::vector<Value *> counts) {
     145    // For each of the k swizzle sets required to apply PEXT to all input streams
     146    std::vector<std::vector<Value *>> PEXTedSwizzleSets;
     147    for (unsigned j = 0; j < mSwizzleSetCount; ++j) {
     148    // Group input blocks together into an input swizzle set. The input set should contain mSwizzleFactor blocks (e.g. for U8U16 16/4=4).
     149    // Each block belongs to a different input stream.
     150        std::vector<Value *> input;
     151        unsigned streamSelectionIndex = j * mSwizzleFactor;
     152        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
     153                // Check if i >= mStreamCount. If it is, add null streams until we get mSwizzleFactor streams in the input vector
     154            if ( i >= mStreamCount) {
     155                                input.push_back(iBuilder->allZeroes());
     156            } else {
     157                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
     158            }
     159        }
     160        // each partiallyCompressedSwizzleSet is obtained by applying PEXT to each of the blocks in input
     161        PEXTedSwizzleSets.push_back(apply_PEXT_deletion_with_swizzle(iBuilder, masks, input));
     162    }
     163        // Compress the PEXTedSwizzleSets
     164    // Output is written and committed to the output buffer one swizzle at a time.
     165    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
     166    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
     167   
     168    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
     169    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
     170    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
     171
     172    // There may be pending data in the kernel state, for up to mDelCountFieldWidth-1 bits per stream.
     173    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
     174    // There is a separate vector of pending data for each swizzle group.
     175    std::vector<Value *> pendingData;
     176    std::vector<Value *> outputStreamPtr;
     177
     178    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     179        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
     180        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
     181    }
     182
     183    // For each row i
     184    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     185        // Generate code for each of the mSwizzleFactor fields making up a block.
     186        // We load the count for the field and process all swizzle groups accordingly.
     187        Value * newItemCount = counts[i];
     188        //iBuilder->CallPrintInt("NeW ITeM COUNT!", newItemCount); //TODO remove
     189        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mDelCountFieldWidth), pendingOffset);
     190        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
     191       
     192        // Data from the ith swizzle pack of each group is processed
     193        // according to the same newItemCount, pendingSpace, ...
     194        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
     195            Value * newItems = PEXTedSwizzleSets[j][i];
     196            //iBuilder->CallPrintRegister("NeW ITeMS!", newItems); //TODO remove
     197            // Combine as many of the new items as possible into the pending group.
     198            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mDelCountFieldWidth,
     199                pendingOffset)));
     200            //iBuilder->CallPrintRegister("ComBineDGROUP", combinedGroup);
     201            // To avoid an unpredictable branch, always store the combined group, whether full or not.             
     202            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
     203           
     204            // Any items in excess of the space available in the current pending group overflow for the next group.
     205            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mDelCountFieldWidth, pendingSpace));
     206            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
     207            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
     208        }
     209        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
     210        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mDelCountFieldWidth-1));
     211    }
     212   
     213    iBuilder->setScalarField("pendingOffset", pendingOffset);
     214    //iBuilder->CallPrintInt("pendingOffset", pendingOffset);
     215   
     216    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
     217    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
     218    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
     219        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
     220        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
     221    }
     222    iBuilder->setProducedItemCount("outputSwizzle0", produced);
     223}
     224
     225/*
     226Apply PEXT deletion to the blocks in strms and swizzle the result.
     227
     228Q: Why is it advantageous to swizzle the PEXTed streams?
     229
     230A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
     231
     232Example:
     233Input:     11101101
     234PEXT mask: 11110000
     235Output:    00001110
     236
     237PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
     238cleared. This has implications if we're working with multiple streams.
     239
     240For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
     241(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
     242Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
     243Stream 2:   qrstuv00 wxy00000 z1000000 23456000
     244Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
     245Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
     246
     247If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
     248should be compressed by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
     249time:
     250
     251The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
     252
     253Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
     254Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
     255Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
     256Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
     257
     258Now we can compress each 32-bit segment of swizzle 1 by 2, each 32-bit segment of swizzle 2 by 5, etc. Once we've completed the
     259compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
     260
     261Args:
     262    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
     263    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
     264        sequentially to PEXT a full block.)
     265
     266Returns:
     267    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
     268*/
     269std::vector<Value *> SwizzledDeleteByPEXTkernel::apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder,
     270                                                             const std::vector<Value *> & masks, std::vector<Value *> strms) {
     271    Value * PEXT_func = nullptr;
     272    if (mPEXTWidth == 64) {
     273        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
     274    } else if (mPEXTWidth == 32) {
     275        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
     276    }
     277   
     278    std::vector<Value *> output;     
     279    for (unsigned i = 0; i < strms.size(); i++) {
     280        Value * v = iBuilder->fwCast(mPEXTWidth, strms[i]);
     281        output.push_back(Constant::getNullValue(v->getType()));
     282    }
     283
     284    // For each of the input streams
     285    for (unsigned j = 0; j < strms.size(); j++) {
     286        Value * v = iBuilder->fwCast(mPEXTWidth, strms[j]); // load stream j
     287        // Process the stream's block mPEXTWidth bits at a time (a PEXT operation can't do more than 64 bits at a time)
     288        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
     289            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (load mPEXTWidth bits)
     290            Value * PEXTed_field = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the segment we just loaded
     291            /*
     292             We loaded from input at index i within stream j's block. We store the result in output within stream i's block at position j. This swizzles the output blocks.
     293             E.g.:
     294
     295               *i*
     296            *j* a b c d strms[0]
     297                e f g h
     298                i j k l
     299                m n o p
     300
     301             Apply pext deletion at each position, then swizzle results:
     302               *j*
     303            *i* a` e` i` m` output[0]
     304                b` f` j` n`
     305                c` g` k` o` 
     306                d` h` l` p`         
     307            */   
     308            output[i] = iBuilder->CreateInsertElement(output[i], PEXTed_field, j);
     309            /*
     310            numCompressedBits = 0
     311
     312            for (each swizzleField position j)
     313                for (each input swizzle i)
     314                    get PEXTed_field
     315                    Shift PEXTed_field left by "numCompressedBits" (in output[i])
     316                    OR PEXTed_field into output[i] (output[i] is output swizzle buffer for input swizzle i)
     317                numCompressedBits += popcount(mask[i])
     318            */
     319        }
     320    }
     321   
     322    return output;
     323}
     324
     325Value * SwizzledDeleteByPEXTkernel::apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
     326    Value * PEXT_func = nullptr;
     327    if (mPEXTWidth == 64) {
     328        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
     329    } else if (mPEXTWidth == 32) {
     330        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
     331    }
     332       
     333    Value * v = iBuilder->fwCast(mPEXTWidth, strm);
     334    Value * output = Constant::getNullValue(v->getType());
     335    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
     336        Value * field = iBuilder->CreateExtractElement(v, i);
     337        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
     338        output = iBuilder->CreateInsertElement(output, compressed, i);
     339    }
     340    return output;
    48341}
    49342
     
    309602    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    310603        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
     604        iBuilder->CallPrintInt("newItemCount", newItemCount);
    311605        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
    312606        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
     
    316610        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    317611            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
     612            iBuilder->CallPrintRegister("newItems", newItems);
    318613            // Combine as many of the new items as possible into the pending group.
    319614            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
     615            //iBuilder->CallPrintRegister("combinedGroup", combinedGroup);
    320616            // To avoid an unpredictable branch, always store the combined group, whether full or not.
    321617               
     
    330626    }
    331627    iBuilder->setScalarField("pendingOffset", pendingOffset);
     628    iBuilder->CallPrintInt("pendingOffset", pendingOffset);
     629
    332630   
    333631    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
     
    335633    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    336634        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
     635        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
     636
    337637    }
    338638    iBuilder->setProducedItemCount("outputSwizzle0", produced);
  • icGREP/icgrep-devel/icgrep/kernels/deletion.h

    r5464 r5540  
    2222
    2323namespace kernel {
     24
     25/*
     26Input: a set of bitstreams
     27Output: swizzles containing the input bitstreams with the specified bits deleted
     28*/
     29class SwizzledDeleteByPEXTkernel final : public BlockOrientedKernel {
     30public:
     31    SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width = 64);
     32    bool isCachable() const override { return true; }
     33    bool hasSignature() const override { return false; }
     34protected:
     35    void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
     36    void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * remainingBytes) override;
     37    std::vector<llvm::Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * del_mask);
     38    void generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks,
     39                                llvm::Value * delMask);
     40    void generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks, std::vector<llvm::Value *> counts);
     41    std::vector<llvm::Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder,
     42                                                                const std::vector<llvm::Value *> & masks, std::vector<llvm::Value *> strms);
     43    llvm::Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks,
     44                                      llvm::Value * strm);
     45private:
     46    const unsigned mDelCountFieldWidth;
     47    const unsigned mStreamCount;
     48    const unsigned mSwizzleFactor;
     49    const unsigned mSwizzleSetCount;
     50    const unsigned mPEXTWidth;
     51    static constexpr const char* mOutputSwizzleNameBase = "outputStreamSet";
     52};
    2453
    2554class DeletionKernel final : public BlockOrientedKernel {
  • icGREP/icgrep-devel/icgrep/u8u16.cpp

    r5486 r5540  
    1212#include <kernels/p2s_kernel.h>                    // for P2S16KernelWithCom...
    1313#include <kernels/s2p_kernel.h>                    // for S2PKernel
    14 #include <kernels/stdout_kernel.h>                 // for StdOutKernel
     14#include <kernels/stdout_kernel.h>                 // for StdOutKernel_
    1515#include <llvm/ExecutionEngine/ExecutionEngine.h>  // for ExecutionEngine
    1616#include <llvm/IR/Function.h>                      // for Function, Function...
     
    310310    pxDriver.makeKernelCall(u8u16k, {BasisBits}, {U8u16Bits, DelMask, ErrorMask});
    311311   
    312     // Apply a deletion algorithm to discard all but the final position of the UTF-8
    313     // sequences for each UTF-16 code unit. Swizzle the results.
    314     StreamSetBuffer * SwizzleFields0 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
    315     StreamSetBuffer * SwizzleFields1 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
    316     StreamSetBuffer * SwizzleFields2 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
    317     StreamSetBuffer * SwizzleFields3 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
    318     StreamSetBuffer * DeletionCounts = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
    319    
    320     Kernel * delK = pxDriver.addKernelInstance(make_unique<DeleteByPEXTkernel>(iBuilder, 64, 16, true));
    321     pxDriver.makeKernelCall(delK, {U8u16Bits, DelMask}, {SwizzleFields0, SwizzleFields1, SwizzleFields2, SwizzleFields3, DeletionCounts});
    322    
    323     //  Produce fully compressed swizzled UTF-16 bit streams
     312    // Allocate space for fully compressed swizzled UTF-16 bit streams
    324313    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
    325314    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
     
    327316    StreamSetBuffer * u16Swizzle3 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
    328317   
    329     Kernel * compressK = pxDriver.addKernelInstance(make_unique<SwizzledBitstreamCompressByCount>(iBuilder, 16));
    330     pxDriver.makeKernelCall(compressK, {DeletionCounts, SwizzleFields0, SwizzleFields1, SwizzleFields2, SwizzleFields3},
    331                            {u16Swizzle0, u16Swizzle1, u16Swizzle2, u16Swizzle3});
    332    
     318    // Apply a deletion algorithm to discard all but the final position of the UTF-8
     319    // sequences (bit streams) for each UTF-16 code unit. Also compresses and swizzles the result.
     320    Kernel * delK = pxDriver.addKernelInstance(make_unique<SwizzledDeleteByPEXTkernel>(iBuilder, 64, 16));
     321    pxDriver.makeKernelCall(delK, {U8u16Bits, DelMask}, {u16Swizzle0, u16Swizzle1, u16Swizzle2, u16Swizzle3});
     322
    333323    // Produce unswizzled UTF-16 bit streams
    334324    StreamSetBuffer * u16bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
     
    483473
    484474                       
     475
Note: See TracChangeset for help on using the changeset viewer.