Changeset 6013 for icGREP/icgrepdevel/icgrep/kernels/deletion.cpp
 Timestamp:
 May 4, 2018, 3:27:50 PM (17 months ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

icGREP/icgrepdevel/icgrep/kernels/deletion.cpp
r6006 r6013 117 117 , mStreamCount(streamCount) { 118 118 } 119 120 StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount) 121 : MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount), 122 {Binding{kb>getStreamSetTy(streamCount), "sourceStreamSet"}, 123 Binding{kb>getStreamSetTy(), "unitCounts"}}, 124 {Binding{kb>getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}}, 125 {}, {}, {}) 126 , mCompressedFieldWidth(fieldWidth) 127 , mStreamCount(streamCount) { 128 addScalar(kb>getSizeTy(), "pendingItemCount"); 129 for (unsigned i = 0; i < streamCount; i++) { 130 addScalar(kb>getBitBlockType(), "pendingOutputBlock_" + std::to_string(i)); 131 } 132 133 } 134 135 void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) { 136 const unsigned fw = mCompressedFieldWidth; 137 Type * fwTy = b>getIntNTy(fw); 138 Type * sizeTy = b>getSizeTy(); 139 const unsigned numFields = b>getBitBlockWidth()/fw; 140 Constant * zeroSplat = Constant::getNullValue(b>fwVectorType(fw)); 141 Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw)); 142 Constant * numFieldConst = ConstantInt::get(sizeTy, numFields); 143 Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw1)); 144 Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b>getBitBlockWidth()); 145 BasicBlock * entry = b>GetInsertBlock(); 146 BasicBlock * segmentLoop = b>CreateBasicBlock("segmentLoop"); 147 BasicBlock * segmentDone = b>CreateBasicBlock("segmentDone"); 148 Constant * const ZERO = b>getSize(0); 149 150 Value * pendingItemCount = b>getScalarField("pendingItemCount"); 151 std::vector<Value *> pendingData(mStreamCount); 152 for (unsigned i = 0; i < mStreamCount; i++) { 153 pendingData[i] = b>getScalarField("pendingOutputBlock_" + std::to_string(i)); 154 } 155 156 b>CreateBr(segmentLoop); 157 // Main Loop 158 b>SetInsertPoint(segmentLoop); 159 PHINode * blockOffsetPhi = b>CreatePHI(b>getSizeTy(), 2); 160 PHINode * outputBlockPhi = b>CreatePHI(b>getSizeTy(), 2); 161 PHINode * pendingItemsPhi = b>CreatePHI(b>getSizeTy(), 2); 162 PHINode * pendingDataPhi[mStreamCount]; 163 blockOffsetPhi>addIncoming(ZERO, entry); 164 outputBlockPhi>addIncoming(ZERO, entry); 165 pendingItemsPhi>addIncoming(pendingItemCount, entry); 166 for (unsigned i = 0; i < mStreamCount; i++) { 167 pendingDataPhi[i] = b>CreatePHI(b>getBitBlockType(), 2); 168 pendingDataPhi[i]>addIncoming(pendingData[i], entry); 169 } 170 Value * fieldPopCounts = b>loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi); 171 // For each field determine the (partial) sum popcount of all fields up to and 172 // including the current field. 173 Value * partialSum = fieldPopCounts; 174 for (unsigned i = 1; i < numFields; i *= 2) { 175 partialSum = b>simd_add(fw, partialSum, b>mvmd_slli(fw, partialSum, i)); 176 } 177 Value * blockPopCount = b>CreateZExtOrTrunc(b>CreateExtractElement(partialSum, numFields1), sizeTy); 178 // 179 // Now determine for each source field the output offset of the first bit. 180 // Note that this depends on the number of pending bits. 181 // 182 Value * pendingOffset = b>CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw)); 183 Value * splatPending = b>simd_fill(fw, b>CreateZExtOrTrunc(pendingOffset, fwTy)); 184 Value * pendingFieldIdx = b>CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw)); 185 Value * offsets = b>simd_add(fw, b>mvmd_slli(fw, partialSum, 1), splatPending); 186 offsets = b>simd_and(offsets, fwMaskSplat); // parallel URem fw 187 // 188 // Determine the relative field number for each output field. Note that the total 189 // number of fields involved is numFields + 1. However, the first field always 190 // be immediately combined into the current pending data field, so we calculate 191 // field numbers for all subsequent fields, (the fields that receive overflow bits). 192 Value * fieldNo = b>simd_srli(fw, b>simd_add(fw, partialSum, splatPending), std::log2(fw)); 193 // 194 // Now process the input data block of each stream in the input stream set. 195 // 196 // First load all the stream set blocks and the pending data. 197 std::vector<Value *> sourceBlock(mStreamCount); 198 for (unsigned i = 0; i < mStreamCount; i++) { 199 sourceBlock[i] = b>loadInputStreamBlock("sourceStreamSet", b>getInt32(i), blockOffsetPhi); 200 201 } 202 // Now separate the bits of each field into ones that go into the current field 203 // and ones that go into the overflow field. Extract the first field separately, 204 // and then shift and combine subsequent fields. 205 std::vector<Value *> pendingOutput(mStreamCount); 206 std::vector<Value *> outputFields(mStreamCount); 207 for (unsigned i = 0; i < mStreamCount; i++) { 208 Value * currentFieldBits = b>simd_sllv(fw, sourceBlock[i], offsets); 209 Value * nextFieldBits = b>simd_srlv(fw, sourceBlock[i], b>simd_sub(fw, fwSplat, offsets)); 210 Value * firstField = b>mvmd_extract(fw, currentFieldBits, 0); 211 Value * vec1 = b>CreateInsertElement(zeroSplat, firstField, pendingFieldIdx); 212 pendingOutput[i] = b>simd_or(pendingDataPhi[i], vec1); 213 // shift back currentFieldBits to combine with nextFieldBits. 214 outputFields[i] = b>simd_or(b>mvmd_srli(fw, currentFieldBits, 1), nextFieldBits); 215 } 216 // We may have filled the current field of the pendingOutput update the 217 // pendingFieldIndex. 218 Value * newPendingTotal = b>CreateZExtOrTrunc(b>mvmd_extract(fw, fieldPopCounts, 0), sizeTy); 219 pendingFieldIdx = b>CreateUDiv(newPendingTotal, ConstantInt::get(sizeTy, fw)); 220 // Now combine forward all fields with the same field number. This may require 221 // up to log2 numFields steps. 222 for (unsigned j = 1; j < numFields; j*=2) { 223 Value * select = b>simd_eq(fw, fieldNo, b>mvmd_slli(fw, fieldNo, j)); 224 for (unsigned i = 0; i < mStreamCount; i++) { 225 Value * fields_fwd = b>mvmd_slli(fw, outputFields[i], j); 226 outputFields[i] = b>simd_or(outputFields[i], b>simd_and(select, fields_fwd)); 227 } 228 } 229 // Now compress the data fields, eliminating all but the last field from 230 // each run of consecutive field having the same field number. 231 // same field number as a subsequent field. 232 Value * eqNext = b>simd_eq(fw, fieldNo, b>mvmd_srli(fw, fieldNo, 1)); 233 Value * compressMask = b>hsimd_signmask(fw, b>simd_not(eqNext)); 234 for (unsigned i = 0; i < mStreamCount; i++) { 235 outputFields[i] = b>mvmd_compress(fw, outputFields[i], compressMask); 236 } 237 // 238 // Finally combine the pendingOutput and outputField data. 239 // (a) shift forward outputField data to fill the pendingOutput values. 240 // (b) shift back outputField data to clear data added to pendingOutput. 241 Value * shftBack = b>CreateSub(numFieldConst, pendingFieldIdx); 242 for (unsigned i = 0; i < mStreamCount; i++) { 243 Value * outputFwd = b>mvmd_sll(fw, outputFields[i], pendingFieldIdx); 244 pendingOutput[i] = b>simd_or(pendingOutput[i], outputFwd); 245 outputFields[i] = b>mvmd_srl(fw, outputFields[i], shftBack); 246 } 247 // 248 // Write the pendingOutput data to outputStream. 249 // Note: this data may be overwritten later, but we avoid branching. 250 for (unsigned i = 0; i < mStreamCount; i++) { 251 b>storeOutputStreamBlock("compressedOutput", b>getInt32(i), outputBlockPhi, pendingOutput[i]); 252 } 253 // Now determine the total amount of pending items and whether 254 // the pending data all fits within the pendingOutput. 255 Value * newPending = b>CreateAdd(pendingItemsPhi, blockPopCount); 256 Value * doesFit = b>CreateICmpULT(newPending, bitBlockWidthConst); 257 newPending = b>CreateSelect(doesFit, newPending, b>CreateSub(newPending, bitBlockWidthConst)); 258 // 259 // Prepare Phi nodes for the next iteration. 260 // 261 Value * nextBlk = b>CreateAdd(blockOffsetPhi, b>getSize(1)); 262 blockOffsetPhi>addIncoming(nextBlk, segmentLoop); 263 Value * nextOutputBlk = b>CreateAdd(outputBlockPhi, b>getSize(1)); 264 // But don't advance the output if all the data does fit into pendingOutput. 265 nextOutputBlk = b>CreateSelect(doesFit, outputBlockPhi, nextOutputBlk); 266 outputBlockPhi>addIncoming(nextOutputBlk, segmentLoop); 267 pendingItemsPhi>addIncoming(newPending, segmentLoop); 268 269 for (unsigned i = 0; i < mStreamCount; i++) { 270 pendingOutput[i] = b>CreateSelect(doesFit, b>fwCast(fw, pendingOutput[i]), b>fwCast(fw, outputFields[i])); 271 pendingDataPhi[i]>addIncoming(b>bitCast(pendingOutput[i]), segmentLoop); 272 } 273 // 274 // Now continue the loop if there are more blocks to process. 275 Value * moreToDo = b>CreateICmpNE(nextBlk, numOfBlocks); 276 b>CreateCondBr(moreToDo, segmentLoop, segmentDone); 277 278 b>SetInsertPoint(segmentDone); 279 // Save kernel state. 280 b>setScalarField("pendingItemCount", newPending); 281 for (unsigned i = 0; i < mStreamCount; i++) { 282 b>setScalarField("pendingOutputBlock_" + std::to_string(i), b>bitCast(pendingOutput[i])); 283 } 284 Value * produced = b>getProducedItemCount("compressedOutput"); 285 produced = b>CreateAdd(produced, b>CreateMul(nextOutputBlk, bitBlockWidthConst)); 286 produced = b>CreateSelect(mIsFinal, b>CreateAdd(produced, blockPopCount), produced); 287 b>setProducedItemCount("compressedOutput", produced); 288 } 289 290 291 119 292 120 293 SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
Note: See TracChangeset
for help on using the changeset viewer.