source: icGREP/icgrep-devel/icgrep/pablo/carrypack_manager.cpp @ 5613

Last change on this file since 5613 was 5510, checked in by nmedfort, 2 years ago

Back up check-in. Should have no effect on current programs.

File size: 57.7 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "carrypack_manager.h"
8#include <pablo/carry_data.h>
9#include <pablo/codegenstate.h>
10#include <llvm/IR/BasicBlock.h>
11#include <llvm/IR/DerivedTypes.h>
12#include <pablo/branch.h>
13#include <pablo/pe_advance.h>
14#include <pablo/pe_scanthru.h>
15#include <pablo/pe_matchstar.h>
16#include <pablo/pe_var.h>
17#include <kernels/kernel_builder.h>
18#include <toolchain/toolchain.h>
19#include <array>
20
21#include <llvm/Support/CommandLine.h>
22
23#include <llvm/Support/raw_ostream.h>
24
25using namespace llvm;
26
27namespace pablo {
28
29inline static bool is_power_2(const unsigned n) {
30    return (n && ((n & (n - 1)) == 0));
31}
32
33inline static unsigned ceil_log2(const unsigned v) {
34    assert ("log2(0) is undefined!" && v != 0);
35    return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1U);
36}
37
38inline static unsigned floor_log2(const unsigned v) {
39    assert ("log2(0) is undefined!" && v != 0);
40    return ((sizeof(unsigned) * CHAR_BIT) - 1U) - __builtin_clz(v);
41}
42
43inline static unsigned nearest_pow2(const unsigned v) {
44    assert (v > 0 && v < (UINT32_MAX / 2));
45    return (v < 2) ? 1 : (1U << ceil_log2(v));
46}
47
48inline static unsigned nearest_multiple(const unsigned n, const unsigned m) {
49    assert (is_power_2(m));
50    return (n + m - 1U) & ~(m - 1U);
51}
52
53inline static bool is_multiple_of(const unsigned n, const unsigned m) {
54    return nearest_multiple(n, m) == n;
55}
56
57inline static unsigned udiv(const unsigned x, const unsigned y) {
58    assert (is_power_2(y));
59    const unsigned z = x >> floor_log2(y);
60    assert (z == (x / y));
61    return z;
62}
63
64inline static unsigned ceil_udiv(const unsigned x, const unsigned y) {
65    assert (is_power_2(x) && is_power_2(y));
66    return udiv(((x - 1U) | (y - 1U)) + 1U, y);
67}
68
69inline static unsigned getElementWidth(Type * ty) {
70    if (LLVM_LIKELY(isa<VectorType>(ty))) {
71        ty = cast<IntegerType>(ty->getVectorElementType());
72    }
73    return cast<IntegerType>(ty)->getBitWidth();
74}
75
76inline static unsigned gcd(unsigned a, unsigned b) {
77  while (a) {
78     assert (is_power_2(a));
79     const unsigned c = a;
80     a = b & (a - 1);
81     b = c;
82  }
83  return b;
84}
85
86inline static unsigned getPackingSize(const unsigned elementWidth, const unsigned depth) {
87    return std::max<unsigned>(elementWidth / nearest_pow2(depth + 1), 1);
88}
89
90using TypeId = PabloAST::ClassTypeId;
91
92#define ELEMENTS_IN_DYNAMICALLY_ALLOCATED_CARRY_STRUCT 3
93
94/** ------------------------------------------------------------------------------------------------------------- *
95 * @brief initializeCarryData
96 ** ------------------------------------------------------------------------------------------------------------- */
97void CarryManager::initializeCarryData(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, PabloKernel * const kernel) {
98
99    // Each scope constructs its own CarryData struct, which will be added to the final "carries" struct
100    // that is added to the Kernel. The scope index will indicate which struct to access.
101
102    // A CarryData struct either contains an array of CarryPackBlocks or an integer indicating the capacity of
103    // the variable length CarryData struct and pointer. A variable length CarryData struct is required whenever
104    // the streams accessed by a loop could vary between iterations. When resizing a CarryData struct for a
105    // particular loop, the current loop struct and all nested structs need to be resized. This accommodates
106    // the fact every pablo While loop must be executed at least once.
107
108    // A nested loop may also contain a variable length CarryData struct
109
110    // To determine whether we require a variable length CarryData struct, we test the escaped variables of
111    // each loop branch to see whether they are used as the index parameter of a nested Extract statement.
112    // Any scope that requires variable length CarryData, requires that all nested branches have a unique
113    // set of carries for that iteration.
114
115    assert (mKernel == nullptr);
116
117    mCurrentScope = kernel->getEntryBlock();
118    mKernel = kernel;
119
120    Type * const carryTy = iBuilder->getBitBlockType();
121    mVectorWidth = carryTy->getPrimitiveSizeInBits();
122    mElementWidth = getElementWidth(carryTy);
123    assert (is_power_2(mElementWidth));
124
125    mCarryScopes = 0;
126    mCarryMetadata.resize(getScopeCount(mCurrentScope));
127    mCarryGroup.resize(assignDefaultCarryGroups(kernel->getEntryBlock()));
128
129    Type * const carryStateTy = analyse(iBuilder, kernel->getEntryBlock());
130
131    kernel->addScalar(carryStateTy, "carries");
132
133//    iBuilder->CallPrintInt("carry state size:", ConstantExpr::getSizeOf(carryStateTy));
134
135    if (mHasLoop) {
136        kernel->addScalar(iBuilder->getInt32Ty(), "selector");
137    }
138    if (mHasLongAdvance) {
139        kernel->addScalar(iBuilder->getSizeTy(), "CarryBlockIndex");
140    }
141}
142
143bool isDynamicallyAllocatedType(const Type * const ty) {
144    if (isa<StructType>(ty) && ty->getStructNumElements() == ELEMENTS_IN_DYNAMICALLY_ALLOCATED_CARRY_STRUCT) {
145        return (ty->getStructElementType(1)->isPointerTy() && ty->getStructElementType(2)->isPointerTy() && ty->getStructElementType(0)->isIntegerTy());
146    }
147    return false;
148}
149
150bool containsDynamicallyAllocatedType(const Type * const ty) {
151    if (isa<StructType>(ty)) {
152        for (unsigned i = 0; i < ty->getStructNumElements(); ++i) {
153            if (isDynamicallyAllocatedType(ty->getStructElementType(i))) {
154                return true;
155            }
156        }
157    }
158    return false;
159}
160
161void freeDynamicallyAllocatedMemory(const std::unique_ptr<kernel::KernelBuilder> & idb, Value * const frame) {
162    StructType * const ty = cast<StructType>(frame->getType()->getPointerElementType());
163    std::array<Value *, 3> indices;
164    indices[0] = idb->getInt32(0);
165    for (unsigned i = 0; i < ty->getStructNumElements(); ++i) {
166        if (isDynamicallyAllocatedType(ty->getStructElementType(i))) {
167            indices[1] = idb->getInt32(i);
168            indices[2] = idb->getInt32(1);
169            Value * const innerFrame = idb->CreateLoad(idb->CreateGEP(frame, ArrayRef<Value*>(indices.data(), 3)));
170            if (containsDynamicallyAllocatedType(innerFrame->getType())) {
171                indices[2] = indices[0];
172                Value *  const count = idb->CreateLoad(idb->CreateGEP(frame, ArrayRef<Value*>(indices.data(), 3)));
173                BasicBlock * const entry = idb->GetInsertBlock();
174                BasicBlock * const cond = idb->CreateBasicBlock("freeCarryDataCond");
175                BasicBlock * const body = idb->CreateBasicBlock("freeCarryDataLoop");
176                BasicBlock * const exit = idb->CreateBasicBlock("freeCarryDataExit");
177                idb->CreateBr(cond);
178                idb->SetInsertPoint(cond);
179                PHINode * const index = idb->CreatePHI(count->getType(), 2);
180                index->addIncoming(ConstantInt::getNullValue(count->getType()), entry);
181                Value * test = idb->CreateICmpNE(index, count);
182                idb->CreateCondBr(test, body, exit);
183                idb->SetInsertPoint(body);
184                freeDynamicallyAllocatedMemory(idb, idb->CreateGEP(innerFrame, index));
185                index->addIncoming(idb->CreateAdd(index, ConstantInt::get(count->getType(), 1)), body);
186                idb->CreateBr(cond);
187                idb->SetInsertPoint(exit);
188            }
189            idb->CreateFree(innerFrame);
190            indices[2] = idb->getInt32(2);
191            Value *  const summary = idb->CreateLoad(idb->CreateGEP(frame, ArrayRef<Value*>(indices.data(), 3)));
192            idb->CreateFree(summary);
193        }
194    }
195}
196
197/** ------------------------------------------------------------------------------------------------------------- *
198 * @brief releaseCarryData
199 ** ------------------------------------------------------------------------------------------------------------- */
200void CarryManager::releaseCarryData(const std::unique_ptr<kernel::KernelBuilder> & idb) {
201    if (mHasNonCarryCollapsingLoops) {
202        freeDynamicallyAllocatedMemory(idb, idb->getScalarFieldPtr("carries"));
203    }
204}
205
206/** ------------------------------------------------------------------------------------------------------------- *
207 * @brief initializeCodeGen
208 ** ------------------------------------------------------------------------------------------------------------- */
209void CarryManager::initializeCodeGen(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
210
211    assert(!mCarryMetadata.empty());
212    mCarryInfo = &mCarryMetadata[0];
213    assert (!mCarryInfo->hasSummary());
214
215    mCurrentFrame = iBuilder->getScalarFieldPtr("carries");
216    mCurrentFrameIndex = 0;
217    mCarryScopes = 0;
218    mCarryScopeIndex.push_back(0);
219
220    assert (mCurrentFrameOffset.empty());
221
222    mCurrentFrameOffset.push_back(iBuilder->getInt32(0));
223
224    assert (mNonCarryCollapsingLoopCarryFrameStack.empty());
225
226    assert (mCarrySummaryStack.empty());
227
228    Type * const carryTy = iBuilder->getBitBlockType();
229
230    mCarrySummaryStack.push_back(Constant::getNullValue(carryTy));
231
232    if (mHasLoop) {       
233        mLoopSelector[0] = iBuilder->getScalarField("selector");
234        mLoopSelector[1] = iBuilder->CreateXor(mLoopSelector[0], ConstantInt::get(mLoopSelector[0]->getType(), 1));
235    }
236
237}
238
239/** ------------------------------------------------------------------------------------------------------------- *
240 * @brief finalizeCodeGen
241 ** ------------------------------------------------------------------------------------------------------------- */
242void CarryManager::finalizeCodeGen(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
243    if (mHasLoop) {
244        iBuilder->setScalarField("selector", mLoopSelector[1]);
245    }
246    if (mHasLongAdvance) {
247        Value * idx = iBuilder->getScalarField("CarryBlockIndex");
248        idx = iBuilder->CreateAdd(idx, iBuilder->getSize(1));
249        iBuilder->setScalarField("CarryBlockIndex", idx);
250    }
251    assert (mNonCarryCollapsingLoopCarryFrameStack.empty());
252    assert ("base summary value was deleted!" && mCarrySummaryStack.size() == 1);
253    assert ("base summary value was overwritten with non-zero value!" && isa<Constant>(mCarrySummaryStack[0]) && cast<Constant>(mCarrySummaryStack[0])->isNullValue());
254    mCarrySummaryStack.clear();
255    assert (mCarryScopeIndex.size() == 1);
256    mCarryScopeIndex.clear();
257}
258
259/** ------------------------------------------------------------------------------------------------------------- *
260 * @brief enterLoopScope
261 ** ------------------------------------------------------------------------------------------------------------- */
262void CarryManager::enterLoopScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const PabloBlock * const scope) {
263    assert (scope);
264    assert (mHasLoop);
265    ++mLoopDepth;
266    enterScope(iBuilder, scope);
267}
268
269/** ------------------------------------------------------------------------------------------------------------- *
270 * @brief enterLoopBody
271 ** ------------------------------------------------------------------------------------------------------------- */
272void CarryManager::enterLoopBody(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * const entryBlock) {
273    if (mCarryInfo->hasSummary()) {
274        Type * const carryTy = iBuilder->getBitBlockType();
275        PHINode * phiCarryOutSummary = iBuilder->CreatePHI(carryTy, 2, "summary");
276        assert (!mCarrySummaryStack.empty());
277        phiCarryOutSummary->addIncoming(mCarrySummaryStack.back(), entryBlock);
278        // Replace the incoming carry summary with the phi node and add the phi node to the stack  so that we can
279        // properly OR it into the outgoing summary value.
280        // NOTE: this may change the base summary value; when exiting to the base scope, replace this summary with
281        // a null value to prevent subsequent nested scopes from inheriting the summary of this scope.
282        mCarrySummaryStack.back() = phiCarryOutSummary;
283        mCarrySummaryStack.push_back(phiCarryOutSummary);
284    }
285    if (LLVM_UNLIKELY(mCarryInfo->nonCarryCollapsingMode())) {
286
287        assert (mCarryInfo->hasSummary());
288
289        Type * const int8PtrTy = iBuilder->getInt8PtrTy();
290        Type * const carryTy = iBuilder->getBitBlockType();
291        PointerType * const carryPtrTy = carryTy->getPointerTo();
292
293        // Check whether we need to resize the carry state
294        PHINode * index = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
295        mLoopIndicies.push_back(index);
296        index->addIncoming(iBuilder->getSize(0), entryBlock);
297        Value * capacityPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(0)});
298        Value * capacity = iBuilder->CreateLoad(capacityPtr, "capacity");
299        Constant * const ONE = ConstantInt::get(capacity->getType(), 1);
300        Value * arrayPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(1)});
301        Value * array = iBuilder->CreateLoad(arrayPtr, "array");
302        BasicBlock * const entry = iBuilder->GetInsertBlock();
303        BasicBlock * const resizeCarryState = iBuilder->CreateBasicBlock("ResizeCarryState");
304        BasicBlock * const reallocExisting = iBuilder->CreateBasicBlock("ReallocExisting");
305        BasicBlock * const createNew = iBuilder->CreateBasicBlock("CreateNew");
306        BasicBlock * const resumeKernel = iBuilder->CreateBasicBlock("ResumeKernel");
307        iBuilder->CreateLikelyCondBr(iBuilder->CreateICmpNE(index, capacity), resumeKernel, resizeCarryState);
308
309        // RESIZE CARRY BLOCK
310        iBuilder->SetInsertPoint(resizeCarryState);
311        const auto BlockWidth = iBuilder->getBitBlockWidth() / 8;
312        const auto Log2BlockWidth = floor_log2(BlockWidth);
313        Constant * const carryStateWidth = ConstantExpr::getIntegerCast(ConstantExpr::getSizeOf(array->getType()->getPointerElementType()), iBuilder->getSizeTy(), false);
314        Value * const summaryPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(2)});
315        Value * const hasCarryState = iBuilder->CreateICmpNE(array, ConstantPointerNull::get(cast<PointerType>(array->getType())));
316        iBuilder->CreateLikelyCondBr(hasCarryState, reallocExisting, createNew);
317
318        // REALLOCATE EXISTING
319        iBuilder->SetInsertPoint(reallocExisting);
320        Value * const capacitySize = iBuilder->CreateMul(capacity, carryStateWidth);
321        Value * const newCapacitySize = iBuilder->CreateShl(capacitySize, 1); // x 2
322        Value * const newArray = iBuilder->CreateCacheAlignedMalloc(newCapacitySize);
323        iBuilder->CreateMemCpy(newArray, array, capacitySize, iBuilder->getCacheAlignment());
324        iBuilder->CreateFree(array);
325        iBuilder->CreateStore(newArray, arrayPtr);
326        Value * const startNewArrayPtr = iBuilder->CreateGEP(iBuilder->CreatePointerCast(newArray, int8PtrTy), capacitySize);
327        iBuilder->CreateMemZero(startNewArrayPtr, capacitySize, BlockWidth);
328        Value * const newCapacity = iBuilder->CreateShl(capacity, 1);
329        iBuilder->CreateStore(newCapacity, capacityPtr);
330        Value * const summary = iBuilder->CreateLoad(summaryPtr, false);
331        Value * const summarySize = iBuilder->CreateShl(iBuilder->CreateAdd(iBuilder->CreateCeilLog2(capacity), ONE), Log2BlockWidth + 1);
332        Constant * const additionalSpace = iBuilder->getSize(2 * BlockWidth);
333        Value * const newSummarySize = iBuilder->CreateAdd(summarySize, additionalSpace);
334        Value * const newSummary = iBuilder->CreateBlockAlignedMalloc(newSummarySize);
335        iBuilder->CreateMemCpy(newSummary, summary, summarySize, BlockWidth);
336        iBuilder->CreateFree(summary);
337        iBuilder->CreateStore(iBuilder->CreatePointerCast(newSummary, carryPtrTy), summaryPtr);
338        Value * const startNewSummaryPtr = iBuilder->CreateGEP(iBuilder->CreatePointerCast(newSummary, int8PtrTy), summarySize);
339        iBuilder->CreateMemZero(startNewSummaryPtr, additionalSpace, BlockWidth);
340        iBuilder->CreateBr(resumeKernel);
341
342        // CREATE NEW
343        iBuilder->SetInsertPoint(createNew);
344        Constant * const initialLog2Capacity = iBuilder->getInt64(4);
345        Constant * const initialCapacity = ConstantExpr::getShl(ONE, initialLog2Capacity);
346        iBuilder->CreateStore(initialCapacity, capacityPtr);
347        Constant * const initialCapacitySize = ConstantExpr::getMul(initialCapacity, carryStateWidth);
348        Value * initialArray = iBuilder->CreateCacheAlignedMalloc(initialCapacitySize);
349        iBuilder->CreateMemZero(initialArray, initialCapacitySize, BlockWidth);
350        initialArray = iBuilder->CreatePointerCast(initialArray, array->getType());
351        iBuilder->CreateStore(initialArray, arrayPtr);
352        Constant * initialSummarySize = ConstantExpr::getShl(ConstantExpr::getAdd(initialLog2Capacity, iBuilder->getInt64(1)), iBuilder->getInt64(Log2BlockWidth + 1));
353        Value * initialSummary = iBuilder->CreateBlockAlignedMalloc(initialSummarySize);
354        iBuilder->CreateMemZero(initialSummary, initialSummarySize, BlockWidth);
355        initialSummary = iBuilder->CreatePointerCast(initialSummary, carryPtrTy);
356        iBuilder->CreateStore(initialSummary, summaryPtr);
357        iBuilder->CreateBr(resumeKernel);
358
359        // RESUME KERNEL
360        iBuilder->SetInsertPoint(resumeKernel);
361        PHINode * phiArrayPtr = iBuilder->CreatePHI(array->getType(), 3);
362        phiArrayPtr->addIncoming(array, entry);
363        phiArrayPtr->addIncoming(initialArray, createNew);
364        phiArrayPtr->addIncoming(newArray, reallocExisting);
365        mNonCarryCollapsingLoopCarryFrameStack.emplace_back(mCurrentFrame, std::vector<Value *>{mCurrentFrameOffset.begin(), mCurrentFrameOffset.end()});
366        mCurrentFrame = iBuilder->CreateGEP(phiArrayPtr, index);
367        assert (&std::get<1>(mNonCarryCollapsingLoopCarryFrameStack.back()) != &mCurrentFrameOffset);
368        mCurrentFrameOffset.resize(1);
369    }
370}
371
372/** ------------------------------------------------------------------------------------------------------------- *
373 * @brief leaveLoopBody
374 ** ------------------------------------------------------------------------------------------------------------- */
375void CarryManager::leaveLoopBody(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * /* exitBlock */) {
376
377    Type * const carryTy = iBuilder->getBitBlockType();
378
379    if (LLVM_UNLIKELY(mCarryInfo->nonCarryCollapsingMode())) {
380
381        assert (mCarryInfo->hasSummary());
382
383        ConstantInt * const summaryIndex = iBuilder->getInt32(mCarryInfo->hasExplicitSummary() ? mCurrentFrameIndex : (mCurrentFrameIndex - 1));
384
385        Value * const carryInAccumulator = readCarryInSummary(iBuilder, summaryIndex);
386        Value * const carryOutAccumulator = mCarrySummaryStack.back();
387
388        if (mCarryInfo->hasExplicitSummary()) {
389            writeCarryOutSummary(iBuilder, carryOutAccumulator, summaryIndex);
390        }
391        std::tie(mCurrentFrame, mCurrentFrameOffset) = mNonCarryCollapsingLoopCarryFrameStack.back();
392        mNonCarryCollapsingLoopCarryFrameStack.pop_back();
393
394        // In non-carry-collapsing mode, we cannot rely on the fact that performing a single iteration of this
395        // loop will consume all of the incoming carries from the prior block. We need to subtract the carries
396        // consumed by this iteration from our carry summary state. To do so in parallel, we use the the half-
397        // subtractor circuit to do it in ceil log2 steps. Similarly, we compute our carry out summary state
398        // (for the subsequent block to subtract) using a half-adder circuit.
399
400        // NOTE: this requires that, for all loop iterations, i, and all block iterations, j, the carry in
401        // summary, CI_i,j, matches the carry out summary of the prior block iteration, CO_i,j - 1.
402        // Otherwise we will end up with an incorrect result or being trapped in an infinite loop.
403
404        Value * capacityPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(0)});
405        Value * capacity = iBuilder->CreateLoad(capacityPtr, false);
406        Value * summaryPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(2)});
407        Value * summary = iBuilder->CreateLoad(summaryPtr, false);
408
409        Constant * const ONE = ConstantInt::get(capacity->getType(), 1);
410
411        Value * loopSelector = iBuilder->CreateZExt(mLoopSelector[0], capacity->getType());
412
413        BasicBlock * entry = iBuilder->GetInsertBlock();
414        BasicBlock * update = iBuilder->CreateBasicBlock("UpdateNonCarryCollapsingSummary");
415        BasicBlock * resume = iBuilder->CreateBasicBlock("ResumeAfterUpdatingNonCarryCollapsingSummary");
416
417        iBuilder->CreateBr(update);
418
419        iBuilder->SetInsertPoint(update);
420        PHINode * i = iBuilder->CreatePHI(capacity->getType(), 2);
421        i->addIncoming(ConstantInt::getNullValue(capacity->getType()), entry);
422        PHINode * const borrow = iBuilder->CreatePHI(carryInAccumulator->getType(), 2);
423        borrow->addIncoming(carryInAccumulator, entry);
424        PHINode * const carry = iBuilder->CreatePHI(carryOutAccumulator->getType(), 2);
425        carry->addIncoming(carryOutAccumulator, entry);
426        // OR the updated carry in summary later for the summaryTest
427        PHINode * const carryInSummary = iBuilder->CreatePHI(carryTy, 2);
428        carryInSummary->addIncoming(Constant::getNullValue(carryTy), entry);
429
430        // half subtractor
431        Value * const carryInOffset = iBuilder->CreateOr(iBuilder->CreateShl(i, 1), loopSelector);
432        Value * const carryInPtr = iBuilder->CreateGEP(summary, carryInOffset);
433        Value * const carryIn = iBuilder->CreateBlockAlignedLoad(carryInPtr);
434        Value * const nextCarryIn = iBuilder->CreateXor(carryIn, borrow);
435        Value * const nextSummary = iBuilder->CreateOr(carryInSummary, nextCarryIn);
436        iBuilder->CreateBlockAlignedStore(nextCarryIn, carryInPtr);
437        carryInSummary->addIncoming(nextSummary, update);
438        Value * finalBorrow = iBuilder->CreateAnd(iBuilder->CreateNot(carryIn), borrow);
439        borrow->addIncoming(finalBorrow, update);
440
441        // half adder
442        Value * const carryOutOffset = iBuilder->CreateXor(carryInOffset, ConstantInt::get(carryInOffset->getType(), 1));
443        Value * const carryOutPtr = iBuilder->CreateGEP(summary, carryOutOffset);
444        Value * const carryOut = iBuilder->CreateBlockAlignedLoad(carryOutPtr);
445        Value * const nextCarryOut = iBuilder->CreateXor(carryOut, carry);
446        iBuilder->CreateBlockAlignedStore(nextCarryOut, carryOutPtr);
447        Value * finalCarry = iBuilder->CreateAnd(carryOut, carry);
448        carry->addIncoming(finalCarry, update);
449
450        // loop condition
451        i->addIncoming(iBuilder->CreateAdd(i, ONE), update);
452        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(iBuilder->CreateShl(ONE, i), capacity), update, resume);
453
454        iBuilder->SetInsertPoint(resume);
455
456        if (codegen::EnableAsserts) {
457            iBuilder->CreateAssertZero(iBuilder->CreateOr(finalBorrow, finalCarry),
458                                       "CarryPackManager: loop post-condition violated: final borrow and carry must be zero!");
459        }
460
461        assert (!mLoopIndicies.empty());
462        PHINode * index = mLoopIndicies.back();
463        index->addIncoming(iBuilder->CreateAdd(index, iBuilder->getSize(1)), resume);
464        mLoopIndicies.pop_back();
465
466        mNextSummaryTest = nextSummary;
467    }
468    if (mCarryInfo->hasSummary()) {
469        const auto n = mCarrySummaryStack.size(); assert (n > 1);
470        Value * carryOut = mCarrySummaryStack.back();
471        mCarrySummaryStack.pop_back();
472        PHINode * phiCarryOut = cast<PHINode>(mCarrySummaryStack.back());
473        phiCarryOut->addIncoming(carryOut, iBuilder->GetInsertBlock());
474        // If we're returning to the base scope, reset our accumulated summary value.
475        if (n == 2) {
476            carryOut = Constant::getNullValue(carryTy);
477        }
478        mCarrySummaryStack.back() = carryOut;
479    }
480}
481
482/** ------------------------------------------------------------------------------------------------------------- *
483 * @brief leaveLoopScope
484 ** ------------------------------------------------------------------------------------------------------------- */
485void CarryManager::leaveLoopScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * const /* entryBlock */, BasicBlock * const /* exitBlock */) {
486    assert (mLoopDepth > 0);
487    --mLoopDepth;
488    leaveScope(iBuilder);
489}
490
491/** ------------------------------------------------------------------------------------------------------------- *
492 * @brief enterIfScope
493 ** ------------------------------------------------------------------------------------------------------------- */
494void CarryManager::enterIfScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const PabloBlock * const scope) {
495    ++mIfDepth;
496    enterScope(iBuilder, scope);
497    // We zero-initialized the nested summary value and later OR in the current summary into the escaping summary
498    // so that upon processing the subsequent block iteration, we branch into this If scope iff a carry out was
499    // generated by a statement within this If scope and not by a dominating statement in the outer scope.
500    if (LLVM_LIKELY(mCarryInfo->hasSummary())) {
501        assert (mCurrentFrameIndex == 0);
502        mNextSummaryTest = readCarryInSummary(iBuilder, iBuilder->getInt32(0));
503        if (mCarryInfo->hasExplicitSummary()) {
504            mCurrentFrameIndex = 1;
505        }
506    }
507    Type * const carryTy = iBuilder->getBitBlockType();
508    mCarrySummaryStack.push_back(Constant::getNullValue(carryTy));
509}
510
511/** ------------------------------------------------------------------------------------------------------------- *
512 * @brief generateSummaryTest
513 ** ------------------------------------------------------------------------------------------------------------- */
514Value * CarryManager::generateSummaryTest(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, Value * condition) {
515    if (LLVM_LIKELY(mCarryInfo->hasSummary())) {
516        assert ("summary test was not generated" && mNextSummaryTest);
517        condition = iBuilder->simd_or(condition, mNextSummaryTest);
518        mNextSummaryTest = nullptr;
519    }
520    assert ("summary test was not consumed" && (mNextSummaryTest == nullptr));
521    return condition;
522}
523
524/** ------------------------------------------------------------------------------------------------------------- *
525 * @brief enterIfBody
526 ** ------------------------------------------------------------------------------------------------------------- */
527void CarryManager::enterIfBody(const std::unique_ptr<kernel::KernelBuilder> & /* iBuilder */, BasicBlock * const entryBlock) {
528    assert (entryBlock);
529}
530
531/** ------------------------------------------------------------------------------------------------------------- *
532 * @brief leaveIfBody
533 ** ------------------------------------------------------------------------------------------------------------- */
534void CarryManager::leaveIfBody(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * const exitBlock) {
535    assert (exitBlock);
536    const auto n = mCarrySummaryStack.size();
537    if (LLVM_LIKELY(mCarryInfo->hasExplicitSummary())) {
538        writeCarryOutSummary(iBuilder, mCarrySummaryStack[n - 1], iBuilder->getInt32(0));
539    }
540    if (n > 2) {
541        mCarrySummaryStack[n - 1] = iBuilder->CreateOr(mCarrySummaryStack[n - 1], mCarrySummaryStack[n - 2], "summary");
542    }
543}
544
545/** ------------------------------------------------------------------------------------------------------------- *
546 * @brief leaveIfScope
547 ** ------------------------------------------------------------------------------------------------------------- */
548void CarryManager::leaveIfScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * const entryBlock, BasicBlock * const exitBlock) {
549    assert (mIfDepth > 0);
550    if (LLVM_LIKELY(mCarryInfo->hasSummary())) {
551        const auto n = mCarrySummaryStack.size(); assert (n > 0);
552        if (n > 2) {
553            // When leaving a nested If scope with a summary value, phi out the summary to ensure the
554            // appropriate summary is stored in the outer scope.
555            Value * nested = mCarrySummaryStack[n - 1];
556            Value * outer = mCarrySummaryStack[n - 2];
557            assert (nested->getType() == outer->getType());
558            PHINode * const phi = iBuilder->CreatePHI(nested->getType(), 2, "summary");
559            phi->addIncoming(outer, entryBlock);
560            phi->addIncoming(nested, exitBlock);
561            mCarrySummaryStack[n - 2] = phi;
562        }
563    }
564    --mIfDepth;
565    leaveScope(iBuilder);
566    mCarrySummaryStack.pop_back();
567}
568
569/** ------------------------------------------------------------------------------------------------------------ *
570 * @brief enterScope
571 ** ------------------------------------------------------------------------------------------------------------- */
572void CarryManager::enterScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const PabloBlock * const scope) {
573    assert (scope);
574    // Store the state of the current frame and update the scope state
575    mCurrentFrameOffset.push_back(iBuilder->getInt32(mCurrentFrameIndex));
576    mCurrentScope = scope;
577    mCarryScopeIndex.push_back(++mCarryScopes);
578    mCarryInfo = &mCarryMetadata[mCarryScopes];
579    mCurrentFrameIndex = 0;
580}
581
582/** ------------------------------------------------------------------------------------------------------------- *
583 * @brief leaveScope
584 ** ------------------------------------------------------------------------------------------------------------- */
585void CarryManager::leaveScope(const std::unique_ptr<kernel::KernelBuilder> & /* iBuilder */) {
586
587    assert (mCurrentFrameOffset.size() > 1);
588    ConstantInt * const carryIndex = cast<ConstantInt>(mCurrentFrameOffset.back());
589    mCurrentFrameIndex = carryIndex->getLimitedValue() + 1;
590    mCurrentFrameOffset.pop_back();
591
592    mCarryScopeIndex.pop_back();
593    assert (!mCarryScopeIndex.empty());
594    mCurrentScope = mCurrentScope->getPredecessor();
595    assert (mCurrentScope);
596    mCarryInfo = &mCarryMetadata[mCarryScopeIndex.back()];
597}
598
599/** ------------------------------------------------------------------------------------------------------------- *
600 * @brief addCarryInCarryOut
601 ** ------------------------------------------------------------------------------------------------------------- */
602Value * CarryManager::addCarryInCarryOut(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const CarryProducingStatement * const op, Value * const e1, Value * const e2) {
603    assert (op && !isa<Advance>(op));
604    Value * const carryIn = getCarryIn(iBuilder, op);
605    Value * carryOut, * result;
606    std::tie(carryOut, result) = iBuilder->bitblock_add_with_carry(e1, e2, carryIn);
607    setCarryOut(iBuilder, op, carryOut);
608    assert (result->getType() == iBuilder->getBitBlockType());
609    return result;
610}
611
612/** ------------------------------------------------------------------------------------------------------------- *
613 * @brief advanceCarryInCarryOut
614 ** ------------------------------------------------------------------------------------------------------------- */
615Value * CarryManager::advanceCarryInCarryOut(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const Advance * const advance, Value * const value) {
616    const auto shiftAmount = advance->getAmount();
617    if (LLVM_LIKELY(shiftAmount < mElementWidth)) {
618        Value * const carryIn = getCarryIn(iBuilder, advance);
619        Value * carryOut, * result;
620        std::tie(carryOut, result) = iBuilder->bitblock_advance(value, carryIn, shiftAmount);
621        setCarryOut(iBuilder, advance, carryOut);
622        assert (result->getType() == iBuilder->getBitBlockType());
623        return result;
624    } else {
625        return longAdvanceCarryInCarryOut(iBuilder, value, shiftAmount);
626    }
627}
628
629/** ------------------------------------------------------------------------------------------------------------- *
630 * @brief longAdvanceCarryInCarryOut
631 ** ------------------------------------------------------------------------------------------------------------- */
632inline Value * CarryManager::longAdvanceCarryInCarryOut(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, Value * const value, const unsigned shiftAmount) {
633
634    assert (mHasLongAdvance);
635    assert (shiftAmount >= mElementWidth);
636
637    Type * const streamTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
638
639    if (mIfDepth > 0) {
640        if (shiftAmount > iBuilder->getBitBlockWidth()) {
641
642            mCurrentFrameOffset.push_back(iBuilder->getInt32(mCurrentFrameIndex++));
643            mCurrentFrameOffset.push_back(nullptr);
644
645            Value * carry = iBuilder->CreateZExt(iBuilder->bitblock_any(value), streamTy);
646            const unsigned summarySize = ceil_udiv(shiftAmount, iBuilder->getBitBlockWidth() * iBuilder->getBitBlockWidth());
647            for (unsigned i = 0;;++i) {
648
649                mCurrentFrameOffset.back() = iBuilder->getInt32(i);
650
651                Value * const ptr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
652                Value * const prior = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(ptr), streamTy);
653                Value * const stream = iBuilder->CreateBitCast(iBuilder->CreateOr(iBuilder->CreateShl(prior, 1), carry), iBuilder->getBitBlockType());
654                if (LLVM_LIKELY(i == summarySize)) {
655                    Value * const maskedStream = iBuilder->CreateAnd(stream, iBuilder->bitblock_mask_from(iBuilder->getInt32(summarySize % iBuilder->getBitBlockWidth())));
656                    addToCarryOutSummary(iBuilder, maskedStream);
657                    iBuilder->CreateBlockAlignedStore(maskedStream, ptr);
658                    break;
659                }
660                addToCarryOutSummary(iBuilder, stream);
661                iBuilder->CreateBlockAlignedStore(stream, ptr);
662                carry = iBuilder->CreateLShr(prior, iBuilder->getBitBlockWidth() - 1);
663            }
664
665            mCurrentFrameOffset.pop_back();
666            mCurrentFrameOffset.pop_back();
667
668        } else if (LLVM_LIKELY(mCarryInfo->hasExplicitSummary())) {
669            addToCarryOutSummary(iBuilder, value);
670        }
671    }
672    mCurrentFrameOffset.push_back(iBuilder->getInt32(mCurrentFrameIndex++));
673    Value * result = nullptr;
674    // special case using a single buffer entry and the carry_out value.
675    if (LLVM_LIKELY((shiftAmount < iBuilder->getBitBlockWidth()) && (mLoopDepth == 0))) {
676        mCurrentFrameOffset.push_back(iBuilder->getInt32(0));
677        Value * const buffer = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
678        assert (buffer->getType()->getPointerElementType() == iBuilder->getBitBlockType());
679        Value * carryIn = iBuilder->CreateBlockAlignedLoad(buffer);
680        iBuilder->CreateBlockAlignedStore(value, buffer);
681        /* Very special case - no combine */
682        if (LLVM_UNLIKELY(shiftAmount == iBuilder->getBitBlockWidth())) {
683            return iBuilder->CreateBitCast(carryIn, iBuilder->getBitBlockType());
684        }
685        Value * block0_shr = iBuilder->CreateLShr(iBuilder->CreateBitCast(carryIn, streamTy), iBuilder->getBitBlockWidth() - shiftAmount);
686        Value * block1_shl = iBuilder->CreateShl(iBuilder->CreateBitCast(value, streamTy), shiftAmount);
687        result = iBuilder->CreateBitCast(iBuilder->CreateOr(block1_shl, block0_shr), iBuilder->getBitBlockType());
688    } else { //
689        const unsigned blockShift = shiftAmount % iBuilder->getBitBlockWidth();
690        const unsigned blocks = ceil_udiv(shiftAmount, iBuilder->getBitBlockWidth());
691        // Create a mask to implement circular buffer indexing
692        Value * const indexMask = iBuilder->getSize(nearest_pow2(blocks + ((mLoopDepth != 0) ? 1 : 0)) - 1);
693        Value * const blockIndex = iBuilder->getScalarField("CarryBlockIndex");
694        Value * const carryIndex0 = iBuilder->CreateSub(blockIndex, iBuilder->getSize(blocks));
695        Value * const loadIndex0 = iBuilder->CreateAnd(carryIndex0, indexMask);
696        mCurrentFrameOffset.push_back(loadIndex0);
697
698        Value * const carryInPtr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
699        Value * const carryIn = iBuilder->CreateBlockAlignedLoad(carryInPtr);
700        Value * const storeIndex = iBuilder->CreateAnd(blockIndex, indexMask);
701        mCurrentFrameOffset.back() = storeIndex;
702
703        Value * const carryOutPtr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
704        assert (carryIn->getType() == iBuilder->getBitBlockType());
705        // If the long advance is an exact multiple of BitBlockWidth, we simply return the oldest
706        // block in the long advance carry data area.
707        if (LLVM_UNLIKELY(blockShift == 0)) {
708            iBuilder->CreateBlockAlignedStore(value, carryOutPtr);
709            result = carryIn;
710        } else { // Otherwise we need to combine data from the two oldest blocks.
711            Value * carryIndex1 = iBuilder->CreateSub(blockIndex, iBuilder->getSize(blocks - 1));
712            Value * loadIndex1 = iBuilder->CreateAnd(carryIndex1, indexMask);
713            mCurrentFrameOffset.back() = loadIndex1;
714            Value * const carryInPtr2 = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
715            Value * carry_block1 = iBuilder->CreateBlockAlignedLoad(carryInPtr2);
716            Value * block0_shr = iBuilder->CreateLShr(iBuilder->CreateBitCast(carryIn, streamTy), iBuilder->getBitBlockWidth() - blockShift);
717            Value * block1_shl = iBuilder->CreateShl(iBuilder->CreateBitCast(carry_block1, streamTy), blockShift);
718            iBuilder->CreateBlockAlignedStore(value, carryOutPtr);
719            result = iBuilder->CreateBitCast(iBuilder->CreateOr(block1_shl, block0_shr), iBuilder->getBitBlockType());
720        }
721    }
722    mCurrentFrameOffset.pop_back();
723    mCurrentFrameOffset.pop_back();
724    return result;
725}
726
727/** ------------------------------------------------------------------------------------------------------------- *
728 * @brief getNextCarryIn
729 ** ------------------------------------------------------------------------------------------------------------- */
730Value * CarryManager::getCarryIn(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const CarryProducingStatement * const op) {
731
732    VectorType * const carryTy = iBuilder->getBitBlockType();
733
734    const auto carryGroup = op->getCarryGroup();
735    assert (carryGroup < mCarryGroup.size());
736    CarryGroup & group = mCarryGroup[carryGroup];
737
738    Value * carryIn = group.carryIn;
739    if (carryIn) {
740        // if this carry crosses a non-carry-collapsing loop frame, compute the address for this carry in the 'new' frame.
741        Value * const frame = cast<GetElementPtrInst>(cast<LoadInst>(carryIn)->getPointerOperand())->getPointerOperand();
742        carryIn = (frame == mCurrentFrame) ? carryIn : nullptr;
743    }
744
745    if (carryIn == nullptr) {
746        mCurrentFrameOffset.push_back(iBuilder->getInt32(mCurrentFrameIndex));
747        if (LLVM_UNLIKELY(mLoopDepth != 0)) {
748            assert ("Loop selector was not initialized!" && mLoopSelector[0]);
749            mCurrentFrameOffset.push_back(mLoopSelector[0]);
750        }
751        Value * ptr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
752        carryIn = iBuilder->CreateBlockAlignedLoad(ptr);
753        carryIn = iBuilder->CreateBitCast(carryIn, carryTy);
754
755        group.carryIn = carryIn;
756        group.carryOut = nullptr;
757        group.frameIndex = mCurrentFrameIndex++;
758        group.packedSize = 0;
759
760        // If we're within a loop, we need to clear out the carry in value for the subsequent loop iteration
761        if (LLVM_UNLIKELY(mLoopDepth != 0)) {
762            iBuilder->CreateBlockAlignedStore(Constant::getNullValue(carryTy), ptr);
763            mCurrentFrameOffset.pop_back();
764        }
765        mCurrentFrameOffset.pop_back();
766        if (LLVM_UNLIKELY(group.groupSize == 1)) {
767            return carryIn;
768        }
769    }
770
771    // Initially, our carry in is densely packed. Our goal is to move the carry to the 0-th position and leave
772    // all other values zeroed out. We can do so easily using ShuffleVector by moving the appropriate element
773    // and selecting 0s for all other elements but this produces poor code by LLVM.
774
775    // Instead we divide this process into two stages:
776
777    // First use the blend instruction to select only the elements of the carry we want
778
779
780
781    const auto width = std::max<unsigned>(gcd(std::min<unsigned>(op->getCarryWidth(), mElementWidth), group.packedSize), 8);
782    const auto n = udiv(iBuilder->getBitBlockWidth(), width);
783
784    VectorType * packTy = VectorType::get(iBuilder->getIntNTy(width), n);
785
786    Constant * const zero = Constant::getNullValue(packTy);
787
788    carryIn = iBuilder->CreateBitCast(carryIn, packTy);
789
790    if (LLVM_UNLIKELY(group.groupSize > 1)) {
791        int blend_mask[n];
792        for (unsigned i = 0; i < n; ++i) {
793            const auto w = i * width;
794            if (w >= group.packedSize && w < (group.packedSize + op->getCarryWidth())) {
795                blend_mask[i] = i;
796            } else {
797                blend_mask[i] = i + n;
798            }
799        }
800        carryIn = iBuilder->CreateShuffleVector(carryIn, zero, ArrayRef<int>(blend_mask, n));
801    }
802
803    // Then use a byte shift to move them to the 0-th position
804
805    const auto offset = udiv(group.packedSize, width);
806
807    if (offset) {
808        int shift[n];
809        for (unsigned i = 0; i < n; ++i) {
810            shift[i] = offset + i;
811        }
812        carryIn = iBuilder->CreateShuffleVector(carryIn, zero, ArrayRef<int>(shift, n));
813    }
814
815    // if this element contains more than one carry, we need to mask mask off any subsequent carries
816    const unsigned nextPackedSize = group.packedSize + op->getCarryWidth();
817    const unsigned subsequentOffset = (-nextPackedSize) & (width - 1U);
818    if (LLVM_UNLIKELY(subsequentOffset != 0)) {
819        carryIn = iBuilder->CreateShl(carryIn, subsequentOffset);
820    }
821
822    // then shift out the prior carries while moving the current carry to the correct position
823    const unsigned trailingOffset = subsequentOffset + (group.packedSize & (width - 1U));
824    assert (trailingOffset < width);
825    if (LLVM_UNLIKELY(trailingOffset != 0)) {
826        carryIn = iBuilder->CreateLShr(carryIn, trailingOffset);
827    }
828
829    carryIn = iBuilder->CreateBitCast(carryIn, carryTy, "CarryIn_" + op->getName());
830
831    return carryIn;
832}
833
834/** ------------------------------------------------------------------------------------------------------------- *
835 * @brief setNextCarryOut
836 ** ------------------------------------------------------------------------------------------------------------- */
837void CarryManager::setCarryOut(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const CarryProducingStatement * const op, Value * carryOut) {
838
839    VectorType * const carryTy = iBuilder->getBitBlockType();
840
841    assert ("Unexpected carry out value type!" && carryOut->getType()->canLosslesslyBitCastTo(carryTy));
842
843    const auto carryGroup = op->getCarryGroup();
844    assert (carryGroup < mCarryGroup.size());
845    CarryGroup & group = mCarryGroup[carryGroup];
846
847    const auto width = std::max<unsigned>(gcd(op->getCarryWidth(), group.packedSize), 8);
848    const auto count = udiv(iBuilder->getBitBlockWidth(), width);
849    VectorType * packTy = VectorType::get(iBuilder->getIntNTy(width), count);
850
851    carryOut = iBuilder->CreateBitCast(carryOut, packTy);
852
853    // shuffle the carryOut to have the 0-th slot being in the correct packed offset
854    const auto index = udiv(group.packedSize, width);
855
856    if (index) {
857        assert (index < count);
858        int position[count];
859        for (unsigned i = 0; i < count; ++i) {
860            position[i] = ((count - index) + i);
861        }
862        Constant * const zero = Constant::getNullValue(packTy);
863        carryOut = iBuilder->CreateShuffleVector(zero, carryOut, ArrayRef<int>(position, count));
864    }
865
866    const auto offset = group.packedSize & (width - 1U);
867    if (LLVM_UNLIKELY(offset != 0)) {
868        carryOut = iBuilder->CreateShl(carryOut, offset);
869    }
870    carryOut = iBuilder->CreateBitCast(carryOut, carryTy);
871
872    assert (op->getCarryWidth() > 0);
873
874    if (group.carryOut == nullptr) {
875        group.carryOut = carryOut;
876    } else {
877        #ifndef NDEBUG
878        iBuilder->CreateAssertZero(iBuilder->CreateAnd(group.carryOut, carryOut),
879                               "CarryPackManager: bit collision detected when combining partial carry outs");
880        #endif
881        group.carryOut = iBuilder->CreateOr(group.carryOut, carryOut);
882    }
883
884    group.packedSize += op->getCarryWidth();
885
886    if (LLVM_UNLIKELY(group.groupSize == 1)) {
887        Value * ptr = nullptr;
888        if (LLVM_LIKELY(mLoopDepth == 0)) {
889            ptr = cast<LoadInst>(group.carryIn)->getPointerOperand();
890        } else {
891            mCurrentFrameOffset.push_back(iBuilder->getInt32(group.frameIndex));
892            mCurrentFrameOffset.push_back(mLoopSelector[1]);
893            ptr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
894            mCurrentFrameOffset.pop_back();
895            mCurrentFrameOffset.pop_back();
896            if (LLVM_LIKELY(!mCarryInfo->nonCarryCollapsingMode())) {
897                group.carryOut = iBuilder->CreateOr(group.carryOut, iBuilder->CreateBlockAlignedLoad(ptr));
898            }
899        }
900
901        if (mCarryInfo->hasSummary()) {
902            // TODO: if this group is all within a single scope, only add the packed carry to the summary
903            addToCarryOutSummary(iBuilder, group.carryOut);
904        }
905
906        iBuilder->CreateBlockAlignedStore(group.carryOut, ptr);
907    }
908
909    group.groupSize -= 1;
910
911}
912
913/** ------------------------------------------------------------------------------------------------------------- *
914 * @brief readCarryInSummary
915 ** ------------------------------------------------------------------------------------------------------------- */
916Value * CarryManager::readCarryInSummary(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, ConstantInt * index) {
917    assert (mCarryInfo->hasSummary());
918    const auto size = mCurrentFrameOffset.size();
919    // If we have a borrowed summary, we cannot be sure how deeply nested it is but do know that it will be the first
920    // element of a nested struct that is not a struct. Traverse inwards until we find it.
921    if (LLVM_UNLIKELY(mCarryInfo->hasBorrowedSummary())) {
922        Type * frameTy = mCurrentFrame->getType()->getPointerElementType();
923        for (unsigned i = 1; i < mCurrentFrameOffset.size(); ++i) {
924            frameTy = frameTy->getStructElementType(cast<ConstantInt>(mCurrentFrameOffset[i])->getLimitedValue());
925        }
926        assert (frameTy->isStructTy());
927        unsigned count = 1;
928        for (;;) {
929            frameTy = frameTy->getStructElementType(0);
930            if (!frameTy->isStructTy()) {
931                break;
932            }
933            ++count;
934        }
935        assert (count > 0);
936        mCurrentFrameOffset.insert(mCurrentFrameOffset.end(), count - 1, iBuilder->getInt32(0));
937    }
938    mCurrentFrameOffset.push_back(index);
939    if (mLoopDepth != 0) {
940        mCurrentFrameOffset.push_back(mLoopSelector[0]);
941    }
942    Value * const summaryPtr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
943    mCurrentFrameOffset.resize(size);
944    Value * const summary = iBuilder->CreateBlockAlignedLoad(summaryPtr);
945    if (mLoopDepth != 0 && mCarryInfo->hasExplicitSummary()) {
946        Type * const carryTy = iBuilder->getBitBlockType();
947        iBuilder->CreateBlockAlignedStore(Constant::getNullValue(carryTy), summaryPtr);
948    }
949    return summary;
950}
951
952/** ------------------------------------------------------------------------------------------------------------- *
953 * @brief writeCarryOutSummary
954 ** ------------------------------------------------------------------------------------------------------------- */
955inline void CarryManager::writeCarryOutSummary(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, Value * const summary, ConstantInt * index) {
956    assert (mCarryInfo->hasExplicitSummary());
957    mCurrentFrameOffset.push_back(index);
958    if (mLoopDepth != 0) {
959        mCurrentFrameOffset.push_back(mLoopSelector[1]);
960    }
961    Value * const summaryPtr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
962    iBuilder->CreateBlockAlignedStore(summary, summaryPtr);
963    if (mLoopDepth != 0) {
964        mCurrentFrameOffset.pop_back();
965    }
966    mCurrentFrameOffset.pop_back();
967}
968
969/** ------------------------------------------------------------------------------------------------------------- *
970 * @brief addToCarryOutSummary
971 ** ------------------------------------------------------------------------------------------------------------- */
972inline void CarryManager::addToCarryOutSummary(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, Value * const value) {
973    assert ("cannot add null summary value!" && value);   
974    assert ("summary stack is empty!" && !mCarrySummaryStack.empty());
975    assert ("current scope does not have a summary!" && mCarryInfo->hasSummary());
976    Value * const currentSummary = mCarrySummaryStack.back();
977    mCarrySummaryStack.back() = iBuilder->CreateOr(value, currentSummary);
978}
979
980/** ------------------------------------------------------------------------------------------------------------- *
981 * @brief enumerate
982 ** ------------------------------------------------------------------------------------------------------------- */
983unsigned CarryManager::getScopeCount(const PabloBlock * const scope, unsigned index) {
984    for (const Statement * stmt : *scope) {
985        if (LLVM_UNLIKELY(isa<Branch>(stmt))) {
986            index = getScopeCount(cast<Branch>(stmt)->getBody(), index);
987        }
988    }
989    return index + 1;
990}
991
992/** ------------------------------------------------------------------------------------------------------------- *
993 * @brief hasIterationSpecificAssignment
994 ** ------------------------------------------------------------------------------------------------------------- */
995bool CarryManager::hasIterationSpecificAssignment(const PabloBlock * const scope) {
996#if 0
997    return dyn_cast_or_null<While>(scope->getBranch()) != nullptr;
998#else
999    if (const While * const br = dyn_cast_or_null<While>(scope->getBranch())) {
1000        for (const Var * var : br->getEscaped()) {
1001            for (const PabloAST * user : var->users()) {
1002                if (const Extract * e = dyn_cast<Extract>(user)) {
1003                    if (LLVM_UNLIKELY(e->getIndex() == var)) {
1004                        // If we assign this Var a value and read the value as the index parameter
1005                        // of a nested Extract statement, then we cannot collapse the carries.
1006                        const PabloBlock * parent = e->getParent();
1007                        for (;;) {
1008                            if (parent == scope) {
1009                                return true;
1010                            }
1011                            parent = parent->getPredecessor();
1012                            if (parent == nullptr) {
1013                                break;
1014                            }
1015                        }
1016                    }
1017                }
1018            }
1019        }
1020    }
1021    return false;
1022#endif
1023}
1024
1025/** ------------------------------------------------------------------------------------------------------------- *
1026 * @brief assignDefaultCarryGroups
1027 ** ------------------------------------------------------------------------------------------------------------- */
1028unsigned CarryManager::assignDefaultCarryGroups(PabloBlock * const scope, const unsigned ifDepth, const unsigned loopDepth, unsigned carryGroups) {
1029    const auto packingSize = getPackingSize(mVectorWidth, (ifDepth + loopDepth));
1030    unsigned packedWidth = 0;
1031    for (Statement * stmt : *scope) {
1032        if (LLVM_UNLIKELY(isa<CarryProducingStatement>(stmt))) {
1033            unsigned amount = 1;
1034            bool canPack = true;
1035            if (LLVM_LIKELY(isa<Advance>(stmt))) {
1036                amount = cast<Advance>(stmt)->getAmount();
1037                canPack = (amount < mElementWidth);
1038            }
1039            if (packedWidth == 0) {
1040                ++carryGroups;
1041            }
1042            if (LLVM_LIKELY(canPack)) {
1043                amount = nearest_multiple(amount, packingSize);
1044                // if we cannot insert this carry into the current pack, start a new group
1045                if ((packedWidth + amount) > mVectorWidth) {
1046                    ++carryGroups;
1047                    packedWidth = amount;
1048                } else {
1049                    packedWidth += amount;
1050                }
1051            } else if (LLVM_LIKELY(packedWidth != 0)) {
1052                ++carryGroups;
1053                packedWidth = 0;
1054            }
1055            assert (carryGroups > 0);
1056            cast<CarryProducingStatement>(stmt)->setCarryGroup(carryGroups - 1);
1057            cast<CarryProducingStatement>(stmt)->setCarryWidth(amount);
1058        } else if (LLVM_UNLIKELY(isa<Branch>(stmt))) {
1059            const auto nestedIfDepth = ifDepth + isa<If>(stmt) ? 1 : 0;
1060            const auto nestedLoopDepth = loopDepth + isa<If>(stmt) ? 0 : 1;
1061            carryGroups = assignDefaultCarryGroups(cast<Branch>(stmt)->getBody(), nestedIfDepth, nestedLoopDepth, carryGroups);
1062            packedWidth = 0;
1063        }
1064    }
1065    return carryGroups;
1066}
1067
1068
1069/** ------------------------------------------------------------------------------------------------------------- *
1070 * @brief analyse
1071 ** ------------------------------------------------------------------------------------------------------------- */
1072StructType * CarryManager::analyse(const std::unique_ptr<kernel::KernelBuilder> & iBuilder,
1073                                   const PabloBlock * const scope,
1074                                   const unsigned ifDepth, const unsigned loopDepth,
1075                                   const bool isNestedWithinNonCarryCollapsingLoop) {
1076
1077    assert ("scope cannot be null!" && scope);
1078    assert ("the entry scope -- and only the entry scope -- must be in carry scope 0"
1079            && (mCarryScopes == 0 ? (scope == mKernel->getEntryBlock()) : (scope != mKernel->getEntryBlock())));
1080    assert (mCarryScopes < mCarryMetadata.size());
1081
1082    Type * const carryTy = iBuilder->getBitBlockType();
1083
1084    CarryData & cd = mCarryMetadata[mCarryScopes++];
1085    const bool nonCarryCollapsingMode = hasIterationSpecificAssignment(scope);
1086    Type * const carryPackTy = (loopDepth == 0) ? carryTy : ArrayType::get(carryTy, 2);
1087    std::vector<Type *> state;
1088
1089    for (const Statement * stmt : *scope) {
1090        if (LLVM_UNLIKELY(isa<CarryProducingStatement>(stmt))) {
1091            const auto index = cast<CarryProducingStatement>(stmt)->getCarryGroup();
1092            assert (index < mCarryGroup.size());
1093            CarryGroup & carryGroup = mCarryGroup[index];
1094            if (carryGroup.groupSize == 0) {
1095                Type * packTy = carryPackTy;
1096                if (LLVM_LIKELY(isa<Advance>(stmt))) {
1097                    const auto amount = cast<Advance>(stmt)->getAmount();
1098                    if (LLVM_UNLIKELY(amount >= mElementWidth)) {
1099                        if (LLVM_UNLIKELY(ifDepth > 0 && amount > iBuilder->getBitBlockWidth())) {
1100                            // 1 bit will mark the presense of any bit in each block.
1101                            state.push_back(ArrayType::get(carryTy, ceil_udiv(amount, std::pow(iBuilder->getBitBlockWidth(), 2))));
1102                        }
1103                        mHasLongAdvance = true;
1104                        const auto blocks = ceil_udiv(amount, iBuilder->getBitBlockWidth());
1105                        packTy = ArrayType::get(carryTy, nearest_pow2(blocks + ((loopDepth != 0) ? 1 : 0)));
1106                    }
1107                }
1108                state.push_back(packTy);
1109            }
1110            carryGroup.groupSize += 1;
1111        } else if (LLVM_UNLIKELY(isa<Branch>(stmt))) {
1112            unsigned nestedIfDepth = ifDepth;
1113            unsigned nestedLoopDepth = loopDepth;
1114            if (LLVM_LIKELY(isa<If>(stmt))) {
1115                ++nestedIfDepth;
1116            } else {
1117                mHasLoop = true;
1118                ++nestedLoopDepth;
1119            }
1120            state.push_back(analyse(iBuilder, cast<Branch>(stmt)->getBody(),
1121                                    nestedIfDepth, nestedLoopDepth, nonCarryCollapsingMode | isNestedWithinNonCarryCollapsingLoop));
1122        }
1123    }
1124    // Build the carry state struct and add the summary pack if needed.
1125    StructType * carryState = nullptr;
1126    CarryData::SummaryType summaryType = CarryData::NoSummary;
1127    if (LLVM_UNLIKELY(state.empty())) {
1128        carryState = StructType::get(iBuilder->getContext());
1129    } else {
1130        if (dyn_cast_or_null<If>(scope->getBranch()) || nonCarryCollapsingMode || isNestedWithinNonCarryCollapsingLoop) {
1131            if (LLVM_LIKELY(state.size() > 1)) {
1132                summaryType = CarryData::ExplicitSummary;
1133                // NOTE: summaries are stored differently depending whether we're entering an If or While branch. With an If branch, they
1134                // preceed the carry state data and with a While loop they succeed it. This is to help cache prefectching performance.
1135                state.insert(isa<If>(scope->getBranch()) ? state.begin() : state.end(), carryPackTy);
1136            } else {
1137                summaryType = CarryData::ImplicitSummary;
1138                if (state[0]->isStructTy()) {
1139                    summaryType = CarryData::BorrowedSummary;
1140                }
1141            }           
1142        }
1143        carryState = StructType::get(iBuilder->getContext(), state);
1144        // If we're in a loop and cannot use collapsing carry mode, convert the carry state struct into a capacity,
1145        // carry state pointer, and summary pointer struct.
1146        if (LLVM_UNLIKELY(nonCarryCollapsingMode)) {
1147            mHasNonCarryCollapsingLoops = true;
1148            carryState = StructType::get(iBuilder->getSizeTy(), carryState->getPointerTo(), carryTy->getPointerTo(), nullptr);
1149            assert (isDynamicallyAllocatedType(carryState));
1150        }
1151        cd.setNonCollapsingCarryMode(nonCarryCollapsingMode);
1152    }
1153    cd.setSummaryType(summaryType);
1154
1155    return carryState;
1156}
1157
1158/** ------------------------------------------------------------------------------------------------------------- *
1159 * @brief constructor
1160 ** ------------------------------------------------------------------------------------------------------------- */
1161CarryManager::CarryManager() noexcept
1162: mKernel(nullptr)
1163, mVectorWidth(0)
1164, mElementWidth(0)
1165, mCurrentFrame(nullptr)
1166, mCurrentFrameIndex(0)
1167, mCurrentScope(nullptr)
1168, mCarryInfo(nullptr)
1169, mNextSummaryTest(nullptr)
1170, mIfDepth(0)
1171, mHasLongAdvance(false)
1172, mHasNonCarryCollapsingLoops(false)
1173, mHasLoop(false)
1174, mLoopDepth(0)
1175, mLoopSelector{nullptr, nullptr}
1176, mCarryScopes(0) {
1177
1178}
1179
1180}
Note: See TracBrowser for help on using the repository browser.