source: icGREP/icgrep-devel/icgrep/kernels/kernel.h @ 5985

Last change on this file since 5985 was 5985, checked in by nmedfort, 12 months ago

Restructured MultiBlock? kernel. Removal of Swizzled buffers. Inclusion of PopCount? rates / non-linear access. Modifications to several kernels to better align them with the kernel and pipeline changes.

File size: 21.9 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#ifndef KERNEL_H
7#define KERNEL_H
8
9#include "interface.h"
10#include <boost/container/flat_map.hpp>
11
12namespace llvm { class AllocaInst; }
13namespace llvm { class BasicBlock; }
14namespace llvm { class Constant; }
15namespace llvm { class Function; }
16namespace llvm { class IntegerType; }
17namespace llvm { class IndirectBrInst; }
18namespace llvm { class PHINode; }
19namespace llvm { class LoadInst; }
20namespace llvm { class Type; }
21namespace llvm { class Value; }
22namespace parabix { class StreamSetBuffer; }
23
24namespace kernel {
25   
26class KernelBuilder;
27
28class Kernel : public KernelInterface {
29    friend class KernelBuilder;
30public:
31   
32    enum class Port { Input, Output };
33    using StreamPort = std::pair<Port, unsigned>;
34    using StreamMap = boost::container::flat_map<std::string, StreamPort>;
35    using KernelFieldMap = boost::container::flat_map<std::string, unsigned>;
36    using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
37
38    // Kernel Signatures and Module IDs
39    //
40    // A kernel signature uniquely identifies a kernel and its full functionality.
41    // In the event that a particular kernel instance is to be generated and compiled
42    // to produce object code, and we have a cached kernel object code instance with
43    // the same signature and targetting the same IDISA architecture, then the cached
44    // object code may safely be used to avoid recompilation.
45    //
46    // A kernel signature is a byte string of arbitrary length.
47    //
48    // Kernel developers should take responsibility for designing appropriate signature
49    // mechanisms that are short, inexpensive to compute and guarantee uniqueness
50    // based on the semantics of the kernel.
51    //
52    // If no other mechanism is available, the default makeSignature() method uses the
53    // full LLVM IR (before optimization) of the kernel instance.
54    //
55    // A kernel Module ID is short string that is used as a name for a particular kernel
56    // instance.  Kernel Module IDs are used to look up and retrieve cached kernel
57    // instances and so should be highly likely to uniquely identify a kernel instance.
58    //
59    // The ideal case is that a kernel Module ID serves as a full kernel signature thus
60    // guaranteeing uniqueness.  In this case, hasSignature() should return false.
61    //
62
63    //
64    // Kernel builder subtypes define their logic of kernel construction
65    // in terms of 3 virtual methods for
66    // (a) preparing the Kernel state data structure
67    // (c) defining the logic of the finalBlock function.
68    //
69    // Note: the kernel state data structure must only be finalized after
70    // all scalar fields have been added.   If there are no fields to
71    // be added, the default method for preparing kernel state may be used.
72
73
74    std::string makeSignature(const std::unique_ptr<KernelBuilder> & idb) override;
75
76    // Can the module ID itself serve as the unique signature?
77    virtual bool hasSignature() const { return true; }
78
79    bool isCachable() const override { return false; }
80
81    void bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs);
82
83    llvm::Module * setModule(llvm::Module * const module);
84
85    llvm::Module * makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb);
86
87    llvm::Module * getModule() const {
88        return mModule;
89    }
90
91    void generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb);
92   
93    llvm::Value * createInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
94
95    void initializeInstance(const std::unique_ptr<KernelBuilder> & idb) final;
96
97    void finalizeInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
98
99    StreamPort getStreamPort(const std::string & name) const;
100
101    const Binding & getBinding(const std::string & name) const;
102
103    ProcessingRate::RateValue getLowerBound(const ProcessingRate &rate) const;
104
105    ProcessingRate::RateValue getUpperBound(const ProcessingRate & rate) const;
106
107    bool requiresCopyBack(const Binding & binding) const;
108
109    bool strideOffsetIsTriviallyCalculable(const Binding & binding) const;
110
111    bool permitsNonLinearAccess(const Binding & binding) const;
112
113    bool requiresLinearAccess(const Binding & binding) const;
114
115    bool anyBindingRequiresLinearSpace() const;
116
117    const StreamSetBuffers & getStreamSetInputBuffers() const {
118        return mStreamSetInputBuffers;
119    }
120
121    const parabix::StreamSetBuffer * getStreamSetInputBuffer(const unsigned i) const {
122        assert (i < mStreamSetInputBuffers.size());
123        assert (mStreamSetInputBuffers[i]);
124        return mStreamSetInputBuffers[i];
125    }
126
127    const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
128        const auto port = getStreamPort(name);
129        assert (port.first == Port::Input);
130        return getStreamSetInputBuffer(port.second);
131    }
132
133    const StreamSetBuffers & getStreamSetOutputBuffers() const {
134        return mStreamSetOutputBuffers;
135    }
136
137    const Binding & getStreamInput(const unsigned i) const {
138        return KernelInterface::getStreamInput(i);
139    }
140
141    const Binding & getStreamInput(const std::string & name) const {
142        const auto port = getStreamPort(name);
143        assert (port.first == Port::Input);
144        return KernelInterface::getStreamInput(port.second);
145    }
146
147    const Binding & getStreamInput(const parabix::StreamSetBuffer * const buffer) const {
148        for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
149            if (mStreamSetInputBuffers[i] == buffer) {
150                return getStreamInput(i);
151            }
152        }
153        throw std::runtime_error("no output binding found given buffer");
154    }
155
156    const parabix::StreamSetBuffer * getStreamSetOutputBuffer(const unsigned i) const {
157        assert (i < mStreamSetOutputBuffers.size());
158        assert (mStreamSetOutputBuffers[i]);
159        return mStreamSetOutputBuffers[i];
160    }
161
162    const parabix::StreamSetBuffer * getOutputStreamSetBuffer(const std::string & name) const {
163        const auto port = getStreamPort(name);
164        assert (port.first == Port::Output);
165        return getStreamSetOutputBuffer(port.second);
166    }
167
168    const Binding & getStreamOutput(const unsigned i) const {
169        return KernelInterface::getStreamOutput(i);
170    }
171
172    const Binding & getStreamOutput(const parabix::StreamSetBuffer * const buffer) const {
173        for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
174            if (mStreamSetOutputBuffers[i] == buffer) {
175                return getStreamOutput(i);
176            }
177        }
178        throw std::runtime_error("no output binding found given buffer");
179    }
180
181    const Binding & getStreamOutput(const std::string & name) const {
182        const auto port = getStreamPort(name);
183        assert (port.first == Port::Output);
184        return KernelInterface::getStreamOutput(port.second);
185    }
186   
187    // Kernels typically perform block-at-a-time processing, but some kernels may require
188    // a different stride.   In the case of multiblock kernels, the stride attribute
189    // determines the number of minimum number of items that will be provided to the kernel
190    // on each doMultiBlock call.
191    //
192   
193    unsigned getStride() const { return mStride; }
194   
195    virtual ~Kernel() = 0;
196
197    void prepareKernel(const std::unique_ptr<KernelBuilder> & b);
198
199    void prepareCachedKernel(const std::unique_ptr<KernelBuilder> & b);
200
201    std::string getCacheName(const std::unique_ptr<KernelBuilder> & b) const;
202
203    bool mayTerminateEarly() const {
204        return hasAttribute(kernel::Attribute::KindId::CanTerminateEarly);
205    }
206
207    bool mustExplicitlyTerminate() const {
208        return hasAttribute(kernel::Attribute::KindId::MustExplicitlyTerminate);
209    }
210
211protected:
212
213    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> &) { }
214
215    void getDoSegmentFunctionArguments(const std::vector<llvm::Value *> & availItems) const;
216
217    // Constructor
218    Kernel(std::string && kernelName, Bindings && stream_inputs,
219          Bindings && stream_outputs,
220          Bindings && scalar_parameters,
221          Bindings && scalar_outputs,
222          Bindings && internal_scalars);
223
224    unsigned getScalarIndex(const std::string & name) const;
225
226    void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> &) override { }
227
228    virtual void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> &) { }
229   
230    virtual void generateKernelMethod(const std::unique_ptr<KernelBuilder> &) = 0;
231
232    virtual void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> &) { }
233
234    // Add an additional scalar field to the KernelState struct.
235    // Must occur before any call to addKernelDeclarations or createKernelModule.
236    unsigned addScalar(llvm::Type * type, const std::string & name);
237
238    unsigned addUnnamedScalar(llvm::Type * type);
239
240    void callGenerateInitializeMethod(const std::unique_ptr<KernelBuilder> & b);
241
242    void callGenerateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b);
243
244    void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & b);
245
246    const parabix::StreamSetBuffer * getAnyStreamSetBuffer(const std::string & name) const {
247        unsigned index; Port port;
248        std::tie(port, index) = getStreamPort(name);
249        if (port == Port::Input) {
250            return getStreamSetInputBuffer(index);
251        } else {
252            return getStreamSetOutputBuffer(index);
253        }
254    }
255
256    void setStride(unsigned stride) { mStride = stride; }
257
258    bool treatUnsafeKernelOperationsAsErrors() const { return mTreatUnsafeKernelOperationsAsErrors; }
259
260    bool mustClearOverflowPriorToCopyback(const Binding & binding) const;
261
262private:
263
264    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & b);
265
266    llvm::Value * getAvailableItemCount(const unsigned i) const {
267        assert (i < mAvailableItemCount.size());
268        return mAvailableItemCount[i];
269    }
270
271    bool verifyBufferSize(const Binding & binding, const parabix::StreamSetBuffer * const buffer) const;
272
273    void verifyStreamSetDefinitions() const;
274
275protected:
276
277    llvm::Function *                    mCurrentMethod; 
278    unsigned                            mStride;
279    bool                                mTreatUnsafeKernelOperationsAsErrors;
280    llvm::Value *                       mIsFinal;
281    llvm::Value *                       mOutputScalarResult;
282    mutable bool                        mIsGenerated;
283
284    std::vector<llvm::Value *>          mAvailableItemCount;
285
286    KernelFieldMap                      mKernelFieldMap;
287    std::vector<llvm::Type *>           mKernelFields;
288
289    StreamMap                           mStreamMap;
290
291    StreamSetBuffers                    mStreamSetInputBuffers;
292    StreamSetBuffers                    mStreamSetOutputBuffers;
293};
294
295using Kernels = std::vector<Kernel *>;
296
297class SegmentOrientedKernel : public Kernel {
298protected:
299
300    SegmentOrientedKernel(std::string && kernelName,
301                          Bindings && stream_inputs,
302                          Bindings && stream_outputs,
303                          Bindings && scalar_parameters,
304                          Bindings && scalar_outputs,
305                          Bindings && internal_scalars);
306protected:
307
308    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
309
310    virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
311
312};
313
314/*
315The Multi-Block Kernel Builder
316------------------------------
317
318The Multi-Block Kernel Builder is designed to simplify the programming of
319efficient kernels with possibly variable and/or nonaligned output, subject to
320exact or MaxRatio processing constraints.   The following restrictions apply.
321
322#.  The input consists of one or more stream sets, the first of which is
323    known as the principal input stream set.
324
325#.  If there is more than one input stream set, the additional stream sets
326    are first classified as having either a derived processing rate or
327    a variable processing rate.   Stream sets with a derived processing rate
328    have a processing rate defined with respect to the input stream set of one
329    of the following types:  FixedRate, Add1 or RoundUp.    Note that stream sets
330    declared without a processing rate attribute have the FixedRate(1) attribute
331    by default and therefore satisfy this constraint.  All other processing rate
332    types are classified as variable rate.
333
334#.  All output stream sets must be declared with processing rate attributes
335    of one of the following types:
336    *  FixedRate, Add1, Roundup, or MaxRatio with respect to the principal input stream set.
337    *  FixedRate with respect to some other output stream set.
338
339    When using the Multi-Block Kernel Builder to program a new type of kernel,
340    the programmer must implement the generateDoMultiBlockMethod for normal
341    multi-block processing according to the requirements below, as well as
342    providing for special final block processing, if necessary.
343
344#.  The doMultiBlockMethod will be called with the following parameters:
345    * the number of items of the principal input stream to process (itemsToDo),
346    * additional items available parameters for each additional input stream set
347      that is classified as a variable rate stream set
348    * pointers to linear contiguous buffer areas for each of the input stream sets, and
349    * pointers to linear contiguous output buffer areas for each of the output stream sets.
350
351    Notes:
352    * if the kernel has a Lookahead dependency declared on any input stream set, then
353      there will be two buffer pointers for that stream set, one for accessing stream set
354      items without lookahead and one for accessing the items with lookahead.   
355    * pointers are to the beginning of the block corresponding to the
356      processedItemCount or producedItemCount of the given stream set.
357    * the base type of each pointer is the StreamSetBlockType of that streamset
358
359#.  The Multi-Block Kernel Builder will arrange that these input parameters may be
360    processed under the following simplifying assumptions.
361    * the number of itemsToDo will either be an exact multiple of the kernel stride,
362      or, for processing the final block, a value less than the kernel stride
363    * the input buffer of the principal stream set and all input buffers of stream sets
364      with derived processing rates will be safe to access and have data available in
365      accord with their processing rates based on the given number of itemsToDo
366      of the principal input stream set; no further bounds checking is needed. 
367    * input buffers of stream sets with MaxRatio attributes will be safe to access,
368      but will only have valid data as specified by the available items parameter for
369      that stream set.
370    * the kernel programmer is responsible for safe access and bounds checking for any
371      input stream set classified as Unknown rate.   No temporary buffers are used
372      for such stream sets.
373    * all output buffers will be safe to access and have space available
374      for the given maximum output generation rates based on the given number
375      of itemsToDo of the principal input stream set; no further bounds checking
376      is needed.
377    * for final block processing, all input buffers will be extended to be safely
378      treated as containing data corresponding to a full block of the principal
379      input stream set, with the actual data in each buffer padded with null values
380      beyond the end of input.  Similarly, all output buffers will contain space
381      sufficient for the maximum output that can be generated for a full block of
382      input processing.
383    * input and output pointers will be typed to allow convenient and logical access
384      to corresponding streams based on their declared stream set type and processing rate.
385    * for any input pointer p, a GEP instruction with a single int32 index i
386      will produce a pointer to the buffer position corresponding to the ith block of the
387      input stream set, relative to the initial block based on the processedItemCount.
388    * for any output stream set declared with a Fixed or Add1 processing rate with respect
389      to the principal input stream set, a GEP instruction with a single int32 index i
390      will produce a pointer to the buffer position corresponding to the ith block of the
391      stream set, relative to the initial block based on the producedItemCount.
392
393#.  Upon completion of multi-block processing, the Multi-Block Kernel Builder will arrange that
394    processed and produced item counts are updated for all stream sets that have exact
395    processing rate attributes.   Programmers are responsible for updating the counts
396    of any stream set declared with a variable attribute (MaxRatio or Unknown).
397
398#.  An important caveat is that buffer areas may change arbitrarily between
399    calls to the doMultiBlockMethod.   In no case should a kernel store a
400    buffer pointer in its internal state.   Furthermore a kernel must not make
401    any assumptions about the accessibility of stream set data outside of the
402    processing range outside of the block boundaries associated with the given itemsToDo.
403*/
404
405class MultiBlockKernel : public Kernel {
406    friend class BlockOrientedKernel;
407protected:
408
409    MultiBlockKernel(std::string && kernelName,
410                     Bindings && stream_inputs,
411                     Bindings && stream_outputs,
412                     Bindings && scalar_parameters,
413                     Bindings && scalar_outputs,
414                     Bindings && internal_scalars);
415
416    // Each multi-block kernel subtype must provide its own logic for handling
417    // doMultiBlock calls, subject to the requirements laid out above.
418    // The generateMultiBlockLogic must be written to generate this logic, given
419    // a created but empty function.  Upon entry to generateMultiBlockLogic,
420    // the builder insertion point will be set to the entry block; upone
421    // exit the RetVoid instruction will be added to complete the method.
422    //
423    virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
424
425private:
426
427    // Given a kernel subtype with an appropriate interface, the generateDoSegment
428    // method of the multi-block kernel builder makes all the necessary arrangements
429    // to translate doSegment calls into a minimal sequence of doMultiBlock calls.
430    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
431
432    void writeMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b);
433
434    void checkInputStream(const std::unique_ptr<KernelBuilder> & b, const unsigned index);
435
436    void checkOutputStream(const std::unique_ptr<KernelBuilder> & b, const unsigned index);
437
438    llvm::Value * computePopCountRate(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate, llvm::Value * const maxItems);
439
440    llvm::Value * getPopCountRateItems(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate, llvm::Value * const strideIndex);
441
442    void prepareOverflowBuffers(const std::unique_ptr<KernelBuilder> & b);
443
444    void writeCopyBackLogic(const std::unique_ptr<KernelBuilder> & b);
445
446    void calculateDerivedItemCounts(const std::unique_ptr<KernelBuilder> & b);
447
448    void updateDerivedItemCounts(const std::unique_ptr<KernelBuilder> & b);
449
450    llvm::Value * hasAnotherStride(const std::unique_ptr<KernelBuilder> & b);
451
452    void updateFinalDerivedItemCounts(const std::unique_ptr<KernelBuilder> & b);
453
454    void checkTerminationSignal(const std::unique_ptr<KernelBuilder> & b);
455
456    static bool hasDerivedItemCount(const Binding & binding);
457
458protected:
459
460    llvm::Value *                   mInitiallyFinal;
461    llvm::Value *                   mNumOfStrides;
462    llvm::Value *                   mNumOfStridesInFinalSegment;
463
464    std::vector<llvm::Value *>      mInitialAvailableItemCount;
465
466    std::vector<llvm::Value *>      mInitialProcessedItemCount;
467    std::vector<llvm::Value *>      mAccessibleInputItems;
468    std::vector<llvm::Value *>      mInputStrideLength;
469    std::vector<llvm::Value *>      mPopCountRateArray;
470
471    std::vector<llvm::Value *>      mInitialProducedItemCount;
472    std::vector<llvm::Value *>      mWritableOutputItems;
473    std::vector<llvm::Value *>      mOutputStrideLength;
474};
475
476
477class BlockOrientedKernel : public MultiBlockKernel {
478protected:
479
480    void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b);
481
482    // Each kernel builder subtype must provide its own logic for generating
483    // doBlock calls.
484    virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
485
486    // Each kernel builder subtypre must also specify the logic for processing the
487    // final block of stream data, if there is any special processing required
488    // beyond simply calling the doBlock function.   In the case that the final block
489    // processing may be trivially implemented by dispatching to the doBlock method
490    // without additional preparation, the default generateFinalBlockMethod need
491    // not be overridden.
492
493    virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
494
495    BlockOrientedKernel(std::string && kernelName,
496                        Bindings && stream_inputs,
497                        Bindings && stream_outputs,
498                        Bindings && scalar_parameters,
499                        Bindings && scalar_outputs,
500                        Bindings && internal_scalars);
501
502    llvm::Value * getRemainingItems(const std::unique_ptr<KernelBuilder> & b);
503
504private:
505
506    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
507
508    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b);
509
510    void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
511
512    llvm::Value * incrementDerivedItemCounts(const std::unique_ptr<KernelBuilder> & b);
513
514private:
515
516    llvm::Function *            mDoBlockMethod;
517    llvm::BasicBlock *          mStrideLoopBody;
518    llvm::IndirectBrInst *      mStrideLoopBranch;
519    llvm::PHINode *             mStrideLoopTarget;
520    llvm::PHINode *             mStrideBlockIndex;
521};
522
523}
524
525#endif
Note: See TracBrowser for help on using the repository browser.