Changeset 6004


Ignore:
Timestamp:
Apr 30, 2018, 7:47:31 AM (3 weeks ago)
Author:
cameron
Message:

Restructuring step for DeletionKernel?: move partial sum popcount in p2sWithCompress

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5985 r6004  
    3939}
    4040
    41 inline Value * partial_sum_popcount(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * mask) {
    42     Value * field = iBuilder->simd_popcount(fw, mask);
    43     const auto count = iBuilder->getBitBlockWidth() / fw;
    44     for (unsigned move = 1; move < count; move *= 2) {
    45         field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
    46     }
    47     return field;
    48 }
    49 
     41// Apply deletion to a set of stream_count input streams to produce a set of output streams.
     42// Kernel inputs: stream_count data streams plus one del_mask stream
     43// Outputs: the deleted streams, plus a partial sum popcount
     44
     45void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     46    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
     47    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
     48    for (unsigned j = 0; j < mStreamCount; ++j) {
     49        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
     50        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
     51        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
     52    }
     53    Value * unitCount = iBuilder->simd_popcount(mDeletionFieldWidth, iBuilder->simd_not(delMask));
     54    iBuilder->storeOutputStreamBlock("unitCounts", iBuilder->getInt32(0), iBuilder->bitCast(unitCount));
     55}
     56
     57void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
     58    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
     59    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
     60    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
     61    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
     62    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
     63    for (unsigned j = 0; j < mStreamCount; ++j) {
     64        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
     65        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
     66        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
     67    }
     68    Value * const unitCount = iBuilder->simd_popcount(mDeletionFieldWidth, iBuilder->simd_not(delMask));
     69    iBuilder->storeOutputStreamBlock("unitCounts", iBuilder->getInt32(0), iBuilder->bitCast(unitCount));
     70}
     71
     72DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
     73: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
     74                      {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
     75                          Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
     76                      {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
     77                          Binding{iBuilder->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
     78                      {}, {}, {})
     79, mDeletionFieldWidth(fieldWidth)
     80, mStreamCount(streamCount) {
     81}
     82
     83   
     84   
    5085SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
    5186: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
     
    290325
    291326    return swizzleSets;
    292 }
    293 
    294 // Apply deletion to a set of stream_count input streams to produce a set of output streams.
    295 // Kernel inputs: stream_count data streams plus one del_mask stream
    296 // Outputs: the deleted streams, plus a partial sum popcount
    297 
    298 void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    299     Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
    300     const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
    301     for (unsigned j = 0; j < mStreamCount; ++j) {
    302         Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
    303         Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
    304         iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    305     }
    306     Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
    307     iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    308 }
    309 
    310 void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
    311     IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
    312     Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
    313     Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    314     Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
    315     const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
    316     for (unsigned j = 0; j < mStreamCount; ++j) {
    317         Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
    318         Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
    319         iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    320     }
    321     Value * const delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
    322     iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    323 }
    324 
    325 DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
    326 : BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
    327               {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
    328                Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
    329               {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
    330                Binding{iBuilder->getStreamSetTy(), "deletionCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
    331               {}, {}, {})
    332 , mDeletionFieldWidth(fieldWidth)
    333 , mStreamCount(streamCount) {
    334327}
    335328
  • icGREP/icgrep-devel/icgrep/kernels/deletion.h

    r5985 r6004  
    1010namespace IDISA { class IDISA_Builder; }
    1111
     12namespace kernel {
     13
    1214//
    13 // Parallel Prefix Deletion
     15// Parallel Prefix Deletion Kernel
    1416// see Parallel Prefix Compress in Henry S. Warren, Hacker's Delight, Chapter 7
    15 // 
     17//
    1618// Given that we want to delete bits within fields of width fw, moving
    1719// nondeleted bits to the right, the parallel prefix compress method can
    18 // be applied.   This requires a preprocessing step to compute log2(fw) 
     20// be applied.   This requires a preprocessing step to compute log2(fw)
    1921// masks that can be used to select bits to be moved in each step of the
    2022// algorithm.
    2123//
    22 
    23 namespace kernel {
     24class DeletionKernel final : public BlockOrientedKernel {
     25public:
     26    DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount);
     27    bool isCachable() const override { return true; }
     28    bool hasSignature() const override { return false; }
     29protected:
     30    void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
     31    void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * remainingBytes) override;
     32private:
     33    const unsigned mDeletionFieldWidth;
     34    const unsigned mStreamCount;
     35};
    2436
    2537/*
     
    4355    const unsigned mSwizzleSetCount;
    4456    const unsigned mPEXTWidth;
    45 };
    46 
    47 class DeletionKernel final : public BlockOrientedKernel {
    48 public:
    49     DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount);
    50     bool isCachable() const override { return true; }
    51     bool hasSignature() const override { return false; }
    52 protected:
    53     void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
    54     void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * remainingBytes) override;
    55 private:
    56     const unsigned mDeletionFieldWidth;
    57     const unsigned mStreamCount;
    5857};
    5958
  • icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.cpp

    r5771 r6004  
    5151}
    5252
     53inline Value * partial_sum_popcounts(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * popcounts) {
     54    Value * summed_counts = popcounts;
     55    const auto count = iBuilder->getBitBlockWidth() / fw;
     56    for (unsigned move = 1; move < count; move *= 2) {
     57        summed_counts = iBuilder->simd_add(fw, summed_counts, iBuilder->mvmd_slli(fw, summed_counts, move));
     58    }
     59    return summed_counts;
     60}
     61
    5362void P2SKernelWithCompressedOutput::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    5463    IntegerType * i32 = b->getInt32Ty();
    5564    PointerType * bitBlockPtrTy = PointerType::get(b->getBitBlockType(), 0);
     65    unsigned const unitsPerRegister = b->getBitBlockWidth()/8;
    5666
    5767    Value * basisBits[8];
     
    6272    p2s(b, basisBits, bytePack);
    6373
    64     unsigned units_per_register = b->getBitBlockWidth()/8;
    65     Value * delCountBlock_ptr = b->getInputStreamBlockPtr("deletionCounts", b->getInt32(0));
    66     Value * unit_counts = b->fwCast(units_per_register, b->CreateBlockAlignedLoad(delCountBlock_ptr));
     74    Value * const fieldCounts = b->loadInputStreamBlock("fieldCounts", b->getInt32(0));
     75    Value * unitCounts = partial_sum_popcounts(b, unitsPerRegister, fieldCounts);
    6776
    6877    Value * output_ptr = b->getOutputStreamBlockPtr("byteStream", b->getInt32(0));
     
    7180    for (unsigned j = 0; j < 8; ++j) {
    7281        b->CreateStore(bytePack[j], b->CreateBitCast(b->CreateGEP(output_ptr, offset), bitBlockPtrTy));
    73         offset = b->CreateZExt(b->CreateExtractElement(unit_counts, b->getInt32(j)), i32);
     82        offset = b->CreateZExt(b->CreateExtractElement(unitCounts, b->getInt32(j)), i32);
    7483    }
    7584
     
    99108    }
    100109}
    101        
     110   
    102111void P2S16KernelWithCompressedOutput::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    103112    IntegerType * i32Ty = b->getInt32Ty();
     
    105114    PointerType * bitBlockPtrTy = b->getBitBlockType()->getPointerTo();
    106115    ConstantInt * blockMask = b->getSize(b->getBitBlockWidth() - 1);
    107 
     116    unsigned const unitsPerRegister = b->getBitBlockWidth()/16;
     117   
    108118    Value * hi_input[8];
    109119    for (unsigned j = 0; j < 8; ++j) {
     
    120130    p2s(b, lo_input, lo_bytes);
    121131
    122     Value * const delCount = b->loadInputStreamBlock("deletionCounts", b->getInt32(0));
    123     Value * const unitCounts = b->fwCast(b->getBitBlockWidth() / 16, delCount);
     132    Value * const fieldCounts = b->loadInputStreamBlock("fieldCounts", b->getInt32(0));
     133    Value * unitCounts = partial_sum_popcounts(b, unitsPerRegister, fieldCounts);
     134   
    124135    Value * outputPtr = b->getOutputStreamBlockPtr("i16Stream", b->getInt32(0));
    125136    outputPtr = b->CreatePointerCast(outputPtr, int16PtrTy);
     
    158169P2SKernelWithCompressedOutput::P2SKernelWithCompressedOutput(const std::unique_ptr<kernel::KernelBuilder> & b)
    159170: BlockOrientedKernel("p2s_compress",
    160               {Binding{b->getStreamSetTy(8, 1), "basisBits"}, Binding{b->getStreamSetTy(1, 1), "deletionCounts"}},
     171              {Binding{b->getStreamSetTy(8, 1), "basisBits"}, Binding{b->getStreamSetTy(1, 1), "fieldCounts"}},
    161172              {Binding{b->getStreamSetTy(1, 8), "byteStream", BoundedRate(0, 1)}},
    162173              {}, {}, {}) {
     
    173184P2S16KernelWithCompressedOutput::P2S16KernelWithCompressedOutput(const std::unique_ptr<kernel::KernelBuilder> & b)
    174185: BlockOrientedKernel("p2s_16_compress",
    175               {Binding{b->getStreamSetTy(16, 1), "basisBits"}, Binding{b->getStreamSetTy(1, 1), "deletionCounts"}},
     186              {Binding{b->getStreamSetTy(16, 1), "basisBits"}, Binding{b->getStreamSetTy(1, 1), "fieldCounts"}},
    176187              {Binding{b->getStreamSetTy(1, 16), "i16Stream", BoundedRate(0, 1)}},
    177188              {},
Note: See TracChangeset for help on using the changeset viewer.