Ignore:
Timestamp:
Apr 24, 2018, 2:57:34 PM (13 months ago)
Author:
nmedfort
Message:

Restructured MultiBlock? kernel. Removal of Swizzled buffers. Inclusion of PopCount? rates / non-linear access. Modifications to several kernels to better align them with the kernel and pipeline changes.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5926 r5985  
    4848}
    4949
    50 SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width)
    51 : BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount),
    52                   {Binding{iBuilder->getStreamSetTy(), "delMaskSet"}, Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"}},
     50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
     51: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
     52                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
    5353                  {}, {}, {}, {})
    54 , mDelCountFieldWidth(fw)
    5554, mStreamCount(streamCount)
    56 , mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
     55, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
    5756// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
    5857// to prevent rounding errors.
     
    6059, mPEXTWidth(PEXT_width)
    6160{
    62     assert((mDelCountFieldWidth > 0) && ((mDelCountFieldWidth & (mDelCountFieldWidth - 1)) == 0)
     61    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
    6362        && "mDelCountFieldWidth must be a power of 2");
    6463    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
     
    6665
    6766    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
    68     mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    69     addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
     67    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
     68
     69    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
     70    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
    7071    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
    71         mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
    72             "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
    73         addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    74     }
    75     addScalar(iBuilder->getSizeTy(), "pendingOffset");
    76 }
    77 
    78 void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     72        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
     73        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
     74    }
     75    addScalar(b->getSizeTy(), "pendingOffset");
     76}
     77
     78void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    7979    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
    80     Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
    81     const auto masks = get_PEXT_masks(iBuilder, delMask);
    82     generateProcessingLoop(iBuilder, masks, delMask);
    83 }
    84 
    85 void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
    86     const auto originalProducedItemCount = iBuilder->getProducedItemCount("outputSwizzle0");
    87     IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
    88     Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
    89     Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    90     Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
    91     const auto masks = get_PEXT_masks(iBuilder, delMask);
    92     generateProcessingLoop(iBuilder, masks, delMask);
    93 
    94     const auto newProducedItemCount = iBuilder->getProducedItemCount("outputSwizzle0");
    95     Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
    96     Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
    97    
    98     Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    99     Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
    100     Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
    101 
    102     const auto deltaOutputIndex = iBuilder->CreateSub(
    103             iBuilder->CreateUDiv(newProducedItemCount, iBuilder->getSize(iBuilder->getBitBlockWidth())),
    104             iBuilder->CreateUDiv(originalProducedItemCount, iBuilder->getSize(iBuilder->getBitBlockWidth()))
    105     );
    106     outputIndex = iBuilder->CreateAdd(outputIndex, iBuilder->CreateMul(deltaOutputIndex, iBuilder->getSize(iBuilder->getBitBlockWidth() / mDelCountFieldWidth)));
    107 
    108     Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
    109 
    110     // Write the pending data.
    111     for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    112         Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
    113         Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0), outputIndex);
    114         iBuilder->CreateBlockAlignedStore(pendingData, outputStreamPtr);
    115     }
    116     iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
    117 }
    118 
    119 std::vector<Value *> SwizzledDeleteByPEXTkernel::get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
    120     // Del mask marks locations of bits we want to delete with 1 bits. Delete marked bits by extracting only the bits not marked in this way.
    121     // Apply the PEXT operation mPEXTWidth bits at a time (e.g. if block is 256 bits and mPEXTWidth is 64, apply 4 PEXT ops to full process block.
    122     Value * m = iBuilder->fwCast(mPEXTWidth, iBuilder->simd_not(del_mask));
    123     std::vector<Value *> masks;
    124     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
    125         masks.push_back(iBuilder->CreateExtractElement(m, i));
    126     }
    127     return masks;
    128 }
    129 
    130 void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
    131                                                 Value * delMask) {
    132     Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask)); // delMask marks the positions we want to extract
    133     std::vector<Value *> counts;
    134     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/ mPEXTWidth; i++) {
    135         // Store the deletion counts for each PEXT field
    136         counts.push_back(iBuilder->CreateExtractElement(delCount, i)); // Extract field i from SIMD register delCount
    137     }
    138 
    139     generatePEXTAndSwizzleLoop(iBuilder, masks, counts);
     80    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
     81    generateProcessingLoop(b, delMask, false);
     82}
     83
     84void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
     85    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
     86    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
     87    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
     88    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
     89    generateProcessingLoop(b, delMask, true);
    14090}
    14191
     
    14393What this function does in pseudo code:
    14494for (mSwizzleFactor)
    145         create a swizzle set containing mSwizzleFactor blocks
    146         apply PEXT to each block in the swizzle set
    147         store the swizzleSet in PEXTedSwizzleSets vector
    148        
     95    create a swizzle set containing mSwizzleFactor blocks
     96    apply PEXT to each block in the swizzle set
     97    store the swizzleSet in PEXTedSwizzleSets vector
     98
    14999for (each swizzle row i)
    150         for (each swizzle set j)
    151                 processes row i in swizzle set j
    152                 store output in pendingData[j]
     100    for (each swizzle set j)
     101        processes row i in swizzle set j
     102        store output in pendingData[j]
    153103*/
    154 void SwizzledDeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
    155                                                     std::vector<Value *> counts) {
    156     // For each of the k swizzle sets required to apply PEXT to all input streams
    157     std::vector<std::vector<Value *>> PEXTedSwizzleSets;
    158     for (unsigned j = 0; j < mSwizzleSetCount; ++j) {
    159     // Group input blocks together into input swizzle set. Input set should contain mSwizzleSetCount blocks (e.g. for U8U16 16/4=4).
    160     // Each block belongs to a different input stream.
    161         std::vector<Value *> input;
    162         unsigned streamSelectionIndex = j * mSwizzleFactor;
    163         for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
    164                 // Check if i > mStreamCount. If it is, add null streams until we get mSwizzleSetCount streams in the input vector
    165             if ( i >= mStreamCount) {
    166                                 input.push_back(iBuilder->allZeroes());
    167             } else {
    168                 input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
    169             }
    170         }
    171         // each partiallyCompressedSwizzleSet is obtained by applying PEXT to each of the blocks in input
    172         PEXTedSwizzleSets.push_back(apply_PEXT_deletion_with_swizzle(iBuilder, masks, input));
    173     }
    174         // Compress the PEXTedSwizzleSets
     104
     105void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
     106
     107    // selectors marks the positions we want to keep
     108    Value * const selectors = b->CreateNot(delMask);
     109
     110    const auto swizzleSets = makeSwizzleSets(b, selectors);
     111
     112    // Compress the PEXTedSwizzleSets
    175113    // Output is written and committed to the output buffer one swizzle at a time.
    176     Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
    177     Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
    178    
    179     Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    180     Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
    181     Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
    182 
    183     // There may be pending data in the kernel state, for up to mDelCountFieldWidth-1 bits per stream.
    184     Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
     114    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
     115    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
     116    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
     117    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
     118    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
     119
     120    // All output groups have the same count.
     121    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
     122    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
     123    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
     124    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
     125
    185126    // There is a separate vector of pending data for each swizzle group.
    186127    std::vector<Value *> pendingData;
    187 
    188128    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    189         pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
    190     }
     129        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
     130    }
     131
     132    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
    191133
    192134    // For each row i
    193135    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     136
    194137        // Generate code for each of the mSwizzleFactor fields making up a block.
    195138        // We load the count for the field and process all swizzle groups accordingly.
    196         Value * newItemCount = counts[i];
    197         //iBuilder->CallPrintInt("NeW ITeM COUNT!", newItemCount); //TODO remove
    198         Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mDelCountFieldWidth), pendingOffset);
    199         Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
    200        
     139        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
     140        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
     141        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
     142        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
     143
     144        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
     145        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
     146
    201147        // Data from the ith swizzle pack of each group is processed
    202148        // according to the same newItemCount, pendingSpace, ...
    203149        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    204             Value * newItems = PEXTedSwizzleSets[j][i];
    205             //iBuilder->CallPrintRegister("NeW ITeMS!", newItems); //TODO remove
     150            Value * const newItems = swizzleSets[j][i];
    206151            // Combine as many of the new items as possible into the pending group.
    207             Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mDelCountFieldWidth,
    208                 pendingOffset)));
    209             //iBuilder->CallPrintRegister("ComBineDGROUP", combinedGroup);
     152            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
     153            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
     154            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
    210155            // To avoid an unpredictable branch, always store the combined group, whether full or not.
    211             iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), outputIndex));
    212            
     156            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
     157            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
    213158            // Any items in excess of the space available in the current pending group overflow for the next group.
    214             Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mDelCountFieldWidth, pendingSpace));
     159            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
    215160            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
    216             pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
    217         }
    218         outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
    219         pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mDelCountFieldWidth-1));
    220     }
    221    
    222     iBuilder->setScalarField("pendingOffset", pendingOffset);
    223     //iBuilder->CallPrintInt("pendingOffset", pendingOffset);
    224    
    225     Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
    226     Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
    227     for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    228         iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    229         //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
    230     }
    231     iBuilder->setProducedItemCount("outputSwizzle0", produced);
     161            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
     162        }
     163
     164        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
     165        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
     166
     167        outputProduced = b->CreateAdd(outputProduced, newItemCount);
     168    }
     169
     170    if (flush) { // incase we selected the overflow group on the final iteration
     171        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
     172        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
     173        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     174            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
     175            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
     176        }
     177    } else {
     178        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     179            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
     180        }
     181        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
     182        b->setScalarField("pendingOffset", pendingOffset);
     183        // unless this is our final stride, don't report partially written fields.
     184        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
     185    }
     186
     187    b->setProducedItemCount("outputSwizzle0", outputProduced);
    232188}
    233189
     
    265221Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
    266222
    267 Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the 
     223Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
    268224compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
    269225
    270226Args:
    271227    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
    272     masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces 
     228    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
    273229        sequentially to PEXT a full block.)
    274230
     
    276232    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
    277233*/
    278 std::vector<Value *> SwizzledDeleteByPEXTkernel::apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder,
    279                                                              const std::vector<Value *> & masks, std::vector<Value *> strms) {
    280     Value * PEXT_func = nullptr;
     234
     235std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
     236
     237    Constant * pext = nullptr;
    281238    if (mPEXTWidth == 64) {
    282         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
     239        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
    283240    } else if (mPEXTWidth == 32) {
    284         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    285     }
    286    
    287     std::vector<Value *> output;     
    288     for (unsigned i = 0; i < strms.size(); i++) {
    289         Value * v = iBuilder->fwCast(mPEXTWidth, strms[i]);
    290         output.push_back(Constant::getNullValue(v->getType()));
    291     }
    292 
    293     // For each of the input streams
    294     for (unsigned j = 0; j < strms.size(); j++) {
    295         Value * v = iBuilder->fwCast(mPEXTWidth, strms[j]); // load stream j
    296         // Process the stream's block mPEXTWidth bits at a time (a PEXT operation can't do more than 64 bits at a time)
    297         for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
    298             Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (load mPEXTWidth bits)
    299             Value * PEXTed_field = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the segment we just loaded
    300             /*
    301              We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks.
    302              E.g.:
    303 
    304                *i*
    305             *j* a b c d strms[0]
    306                 e f g h
    307                 i j k l
    308                 m n o p
    309 
    310              Apply pext deletion at each position, then swizzle results:
    311                *j*
    312             *i* a` e` i` m` output[0]
    313                 b` f` j` n`
    314                 c` g` k` o` 
    315                 d` i` l` p`         
    316             */   
    317             output[i] = iBuilder->CreateInsertElement(output[i], PEXTed_field, j);
    318             /*
    319             numCompressedBits = 0
    320 
    321             for (each swizzleField position j)
    322                 for (each input swizzle i)
    323                     get PEXTed_field
    324                     Shift PEXTed_field left by "numCompressedBits" (in output[i])
    325                     OR PEXTed_field into output[i] (output[i] is output swizzle buffer for input swizzle i)
    326                 numCompressedBits += popcount(mask[i])
    327             */
    328         }
    329     }
    330    
    331     return output;
    332 }
    333 
    334 Value * SwizzledDeleteByPEXTkernel::apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
    335     Value * PEXT_func = nullptr;
    336     if (mPEXTWidth == 64) {
    337         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
    338     } else if (mPEXTWidth == 32) {
    339         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    340     }
    341        
    342     Value * v = iBuilder->fwCast(mPEXTWidth, strm);
    343     Value * output = Constant::getNullValue(v->getType());
    344     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
    345         Value * field = iBuilder->CreateExtractElement(v, i);
    346         Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
    347         output = iBuilder->CreateInsertElement(output, compressed, i);
    348     }
    349     return output;
     241        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
     242    }
     243
     244    Value * const m = b->fwCast(mPEXTWidth, selectors);
     245
     246    std::vector<Value *> masks(mSwizzleFactor);
     247    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     248        masks[i] = b->CreateExtractElement(m, i);
     249
     250    }
     251
     252    std::vector<std::vector<Value *>> swizzleSets;
     253    swizzleSets.reserve(mSwizzleSetCount);
     254
     255    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
     256
     257    UndefValue * const outputInitializer = UndefValue::get(vecTy);
     258
     259    std::vector<Value *> input(mSwizzleFactor);
     260    // For each of the k swizzle sets required to apply PEXT to all input streams
     261    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
     262
     263        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
     264            const unsigned k = (i * mSwizzleFactor) + j;
     265            if (k < mStreamCount) {
     266                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
     267            } else {
     268                input[j] = Constant::getNullValue(vecTy);
     269            }
     270        }
     271
     272        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
     273        // then splat the pext mask and apply it to each output row
     274
     275        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
     276        // For each of the input streams
     277        for (unsigned j = 0; j < mSwizzleFactor; j++) {
     278            for (unsigned k = 0; k < mSwizzleFactor; k++) {
     279                // Load block j,k
     280                Value * const field = b->CreateExtractElement(input[j], k);
     281                // Apply PEXT deletion
     282                Value * const selected = b->CreateCall(pext, {field, masks[k]});
     283                // Then store it as our k,j-th output
     284                output[k] = b->CreateInsertElement(output[k], selected, j);
     285            }
     286        }
     287
     288        swizzleSets.emplace_back(output);
     289    }
     290
     291    return swizzleSets;
    350292}
    351293
     
    392334}
    393335
    394 const unsigned PEXT_width = 64;
    395 
    396 inline std::vector<Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
    397     Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
    398     std::vector<Value *> masks;
    399     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
    400         masks.push_back(iBuilder->CreateExtractElement(m, i));
    401     }
    402     return masks;
    403 }
    404 
    405 // Apply PEXT deletion to a collection of blocks and swizzle the result.
    406 // strms contains the blocks to process
    407 inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {
    408     Value * PEXT_func = nullptr;
    409     if (PEXT_width == 64) {
    410         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
    411     } else if (PEXT_width == 32) {
    412         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    413     }
    414    
    415     std::vector<Value *> output;     
    416     for (unsigned i = 0; i < strms.size(); i++) {
    417         Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
    418         output.push_back(Constant::getNullValue(v->getType()));
    419     }
    420 
    421     // For each of the input streams
    422     for (unsigned j = 0; j < strms.size(); j++) {
    423         Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
    424         // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
    425         for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
    426             Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
    427             Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
    428             /*
    429              We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
    430 
    431              a b c d
    432              e f g h
    433              i j k l
    434              m n o p
    435 
    436              Apply pext deletion at each position, then swizzle results:
    437 
    438              a` e` i` m`
    439              b` f` j` n`
    440              c` g` k` o` 
    441              d` i` l` p`         
    442             */     
    443             output[i] = iBuilder->CreateInsertElement(output[i], compressed, j);
    444         }
    445     }
    446    
    447     return output;
    448 }
    449 
    450 inline Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
    451     Value * PEXT_func = nullptr;
    452     if (PEXT_width == 64) {
    453         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
    454     } else if (PEXT_width == 32) {
    455         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    456     }
    457        
    458     Value * v = iBuilder->fwCast(PEXT_width, strm);
    459     Value * output = Constant::getNullValue(v->getType());
    460     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
    461         Value * field = iBuilder->CreateExtractElement(v, i);
    462         Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
    463         output = iBuilder->CreateInsertElement(output, compressed, i);
    464     }
    465     return output;
    466 }
    467 
    468336// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
    469337// Kernel inputs: stream_count data streams plus one del_mask stream
     
    472340void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    473341    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
    474     const auto masks = get_PEXT_masks(iBuilder, delMask);
    475     generateProcessingLoop(iBuilder, masks, delMask);
     342    generateProcessingLoop(iBuilder, delMask);
    476343}
    477344
     
    481348    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    482349    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
    483     const auto masks = get_PEXT_masks(iBuilder, delMask);
    484     generateProcessingLoop(iBuilder, masks, delMask);
    485 }
    486 
    487 void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * delMask) {
    488     if (mShouldSwizzle) {
    489         generatePEXTAndSwizzleLoop(iBuilder, masks);
    490     } else {
    491         generatePEXTLoop(iBuilder, masks);
    492     }
    493     //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     350    generateProcessingLoop(iBuilder, delMask);
     351}
     352
     353void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, Value * delMask) {
     354    Constant * PEXT_func = nullptr;
     355    if (mPEXTWidth == 64) {
     356        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
     357    } else if (mPEXTWidth == 32) {
     358        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
     359    }
     360    std::vector<Value *> masks(mSwizzleFactor);
     361    Value * const m = iBuilder->fwCast(mSwizzleFactor, iBuilder->simd_not(delMask));
     362    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     363        masks.push_back(iBuilder->CreateExtractElement(m, i));
     364    }
     365
     366    for (unsigned i = 0; i < mStreamCount; ++i) {
     367        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i));
     368        Value * value = iBuilder->fwCast(mPEXTWidth, input);
     369        Value * output = UndefValue::get(value->getType());
     370        for (unsigned j = 0; j < mSwizzleFactor; j++) {
     371            Value * field = iBuilder->CreateExtractElement(value, j);
     372            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[j]});
     373            output = iBuilder->CreateInsertElement(output, compressed, j);
     374        }
     375        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(i), output);
     376    }
    494377    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    495378    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    496379}
    497380
    498 void DeleteByPEXTkernel::generatePEXTLoop(const std::unique_ptr<KernelBuilder> &iBuilder, const std::vector<Value *> & masks) {
    499     for (unsigned j = 0; j < mStreamCount; ++j) {
    500         Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
    501         Value * output = apply_PEXT_deletion(iBuilder, masks, input);
    502         iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    503     }
    504 }
    505 
    506 void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks) {
    507     // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
    508     // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
    509     for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
    510         std::vector<Value *> input;
    511         unsigned streamSelectionIndex = j * mSwizzleFactor;
    512         for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
    513                 // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
    514             if ( i >= mStreamCount) {
    515                                 input.push_back(iBuilder->allZeroes());
    516             } else {
    517                 input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
    518             }
    519         }
    520         std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
    521         for (unsigned i = 0; i < mSwizzleFactor; i++) {
    522              iBuilder->storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
    523         }
    524     }
    525 }
    526 
    527 DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
    528 : BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + (shouldSwizzle ? "swiz" : "noswiz"),
    529                   {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
    530                       Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
    531                   {}, {}, {}, {})
     381DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
     382: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
     383              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
     384                  Binding{b->getStreamSetTy(), "delMaskSet"}},
     385              {}, {}, {}, {})
    532386, mDelCountFieldWidth(fw)
    533387, mStreamCount(streamCount)
    534 , mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
    535 , mShouldSwizzle(shouldSwizzle)
    536 {
    537     if(mShouldSwizzle) {       
    538         for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) {
    539             mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
    540         }
    541     } else {
    542         // No swizzling. Output results as single stream set
    543         mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
    544     }
    545     mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
     388, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
     389, mPEXTWidth(PEXT_width) {
     390    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
     391    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
    546392}
    547393
     
    611457    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    612458        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
    613     //iBuilder->CallPrintInt("newItemCount", newItemCount);
    614459        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
    615460        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
     
    619464        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    620465            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
    621         //iBuilder->CallPrintRegister("newItems", newItems);
    622466            // Combine as many of the new items as possible into the pending group.
    623467            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
    624             //iBuilder->CallPrintRegister("combinedGroup", combinedGroup);
    625             // To avoid an unpredictable branch, always store the combined group, whether full or not.
    626                
     468            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
    627469            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
    628470            // Any items in excess of the space available in the current pending group overflow for the next group.
     
    639481    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    640482        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    641         //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
    642 
    643483    }
    644484    iBuilder->setProducedItemCount("outputSwizzle0", produced);
Note: See TracChangeset for help on using the changeset viewer.