Ignore:
Timestamp:
Mar 2, 2017, 12:18:43 PM (2 years ago)
Author:
cameron
Message:

Swizzled bitstream deletion and -enable-AVX-deletion in u8u16

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5347 r5355  
    11/*
    2  *  Copyright (c) 2016 International Characters.
     2 *  Copyright (c) 2017 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 */
     
    9393}
    9494
    95    
     95
     96
    9697const unsigned PEXT_width = 64;
    9798
     
    134135        storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    135136    }
    136     Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     137    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     138    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    137139    storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    138140}
     
    149151        storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    150152    }
    151     Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     153    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     154    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    152155    storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    153156}
     
    165168}
    166169
    167 }
     170   
     171//
     172// This kernel performs final stream compression for a set of N bitstreams, given
     173// (a) a set of bitstreams partially compressed within K-bit fields and stored
     174//     in K-bit swizzled form, and
     175// (b) a stream of deletion/extraction counts per K-bit stride.
     176//
     177// Restrictions:  At present, only K=64 is supported.
     178//                At present, N must be an exact multiple of BLOCK_SIZE/K.
     179//
     180// The kernel always consumes full blocks of input and emits data into the output
     181// buffer in swizzles of K items at a time.   Upon completion of a segment,
     182// up to K-1 pending output items per stream may be stored in the kernel state.
     183//
     184// Note that both the input streams and the output streams are stored in swizzled form.
     185//
     186
     187   
     188
     189SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(IDISA::IDISA_Builder * iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
     190    : BlockOrientedKernel(iBuilder, "swizzled_compress", {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
     191, mBitStreamCount(bitStreamCount)
     192    , mFieldWidth(fieldWidth)
     193    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
     194    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
     195        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
     196        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
     197        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     198            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
     199            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), MaxRatio(1)});
     200            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
     201        }
     202        addScalar(iBuilder->getSizeTy(), "pendingOffset");
     203}
     204
     205   
     206void SwizzledBitstreamCompressByCount::generateDoBlockMethod() {
     207       
     208    Value * countStreamPtr = iBuilder->CreateBitCast(getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0)), iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
     209   
     210    // Output is written and committed to the output buffer one swizzle at a time.
     211    //
     212    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
     213    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
     214   
     215    Value * outputProduced = getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
     216    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
     217    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
     218
     219    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
     220    Value * pendingOffset = getScalarField("pendingOffset");
     221    // There is a separate vector of pending data for each swizzle group.
     222    std::vector<Value *> pendingData;
     223    std::vector<Value *> outputStreamPtr;
     224    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     225        pendingData.push_back(getScalarField("pendingSwizzleData" + std::to_string(i)));
     226        outputStreamPtr.push_back(getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
     227    }
     228   
     229    // Generate code for each of the mSwizzleFactor fields making up a block.
     230    // We load the count for the field and process all swizzle groups accordingly.
     231    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     232        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
     233        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
     234        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
     235       
     236        // Data from the ith swizzle pack of each group is processed
     237        // according to the same newItemCount, pendingSpace, ...
     238        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
     239            Value * newItems = loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
     240            // Combine as many of the new items as possible into the pending group.
     241            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
     242            // To avoid an unpredictable branch, always store the combined group, whether full or not.
     243               
     244            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
     245            // Any items in excess of the space available in the current pending group overflow for the next group.
     246            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
     247            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
     248            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
     249        }
     250        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
     251        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
     252    }
     253    setScalarField("pendingOffset", pendingOffset);
     254   
     255    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
     256    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
     257    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
     258        setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
     259        setProducedItemCount("outputSwizzle" + std::to_string(j), produced);
     260    }
     261}
     262
     263void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(Value * remainingBytes) {
     264    CreateDoBlockMethodCall();
     265    // All the data has been written.  Update the count to include pending data.
     266    Value * pendingOffset = getScalarField("pendingOffset");
     267    Value * produced = iBuilder->CreateAdd(pendingOffset, getProducedItemCount("outputSwizzle0"));
     268    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
     269        setProducedItemCount("outputSwizzle" + std::to_string(j), produced);
     270    }
     271}
     272}
Note: See TracChangeset for help on using the changeset viewer.