source: icGREP/icgrep-devel/icgrep/kernels/kernel.h @ 5816

Last change on this file since 5816 was 5793, checked in by nmedfort, 21 months ago

Bug fix for pipeline: it was terminating too early when there was insufficient output space to process all of the input for a kernel.

File size: 21.4 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#ifndef KERNEL_H
7#define KERNEL_H
8
9#include "interface.h"
10#include <boost/container/flat_map.hpp>
11
12namespace llvm { class BasicBlock; }
13namespace llvm { class Constant; }
14namespace llvm { class Function; }
15namespace llvm { class IntegerType; }
16namespace llvm { class IndirectBrInst; }
17namespace llvm { class PHINode; }
18namespace llvm { class LoadInst; }
19namespace llvm { class Type; }
20namespace llvm { class Value; }
21namespace parabix { class StreamSetBuffer; }
22
23namespace kernel {
24   
25class KernelBuilder;
26
27class Kernel : public KernelInterface {
28    friend class KernelBuilder;
29protected:
30
31    static const std::string DO_BLOCK_SUFFIX;
32    static const std::string FINAL_BLOCK_SUFFIX;
33    static const std::string MULTI_BLOCK_SUFFIX;
34    static const std::string LOGICAL_SEGMENT_NO_SCALAR;
35    static const std::string PROCESSED_ITEM_COUNT_SUFFIX;
36    static const std::string CONSUMED_ITEM_COUNT_SUFFIX;
37    static const std::string PRODUCED_ITEM_COUNT_SUFFIX;
38    static const std::string TERMINATION_SIGNAL;
39    static const std::string BUFFER_PTR_SUFFIX;
40    static const std::string CONSUMER_SUFFIX;
41    static const std::string CYCLECOUNT_SCALAR;
42
43public:
44   
45    enum class Port { Input, Output };
46    using StreamPort = std::pair<Port, unsigned>;
47    using StreamMap = boost::container::flat_map<std::string, StreamPort>;
48    using KernelFieldMap = boost::container::flat_map<std::string, unsigned>;
49    using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
50
51    // Kernel Signatures and Module IDs
52    //
53    // A kernel signature uniquely identifies a kernel and its full functionality.
54    // In the event that a particular kernel instance is to be generated and compiled
55    // to produce object code, and we have a cached kernel object code instance with
56    // the same signature and targetting the same IDISA architecture, then the cached
57    // object code may safely be used to avoid recompilation.
58    //
59    // A kernel signature is a byte string of arbitrary length.
60    //
61    // Kernel developers should take responsibility for designing appropriate signature
62    // mechanisms that are short, inexpensive to compute and guarantee uniqueness
63    // based on the semantics of the kernel.
64    //
65    // If no other mechanism is available, the default makeSignature() method uses the
66    // full LLVM IR (before optimization) of the kernel instance.
67    //
68    // A kernel Module ID is short string that is used as a name for a particular kernel
69    // instance.  Kernel Module IDs are used to look up and retrieve cached kernel
70    // instances and so should be highly likely to uniquely identify a kernel instance.
71    //
72    // The ideal case is that a kernel Module ID serves as a full kernel signature thus
73    // guaranteeing uniqueness.  In this case, hasSignature() should return false.
74    //
75
76    //
77    // Kernel builder subtypes define their logic of kernel construction
78    // in terms of 3 virtual methods for
79    // (a) preparing the Kernel state data structure
80    // (c) defining the logic of the finalBlock function.
81    //
82    // Note: the kernel state data structure must only be finalized after
83    // all scalar fields have been added.   If there are no fields to
84    // be added, the default method for preparing kernel state may be used.
85
86
87    std::string makeSignature(const std::unique_ptr<KernelBuilder> & idb) override;
88
89    // Can the module ID itself serve as the unique signature?
90    virtual bool hasSignature() const { return true; }
91
92    bool isCachable() const override { return false; }
93
94    void bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs);
95
96    llvm::Module * setModule(llvm::Module * const module);
97
98    llvm::Module * makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb);
99
100    llvm::Module * getModule() const {
101        return mModule;
102    }
103
104    void generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb);
105   
106    llvm::Value * createInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
107
108    void initializeInstance(const std::unique_ptr<KernelBuilder> & idb) final;
109
110    void finalizeInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
111
112    StreamPort getStreamPort(const std::string & name) const;
113
114    const Binding & getBinding(const std::string & name) const;
115
116    ProcessingRate::RateValue getLowerBound(const ProcessingRate &rate) const;
117
118    ProcessingRate::RateValue getUpperBound(const ProcessingRate & rate) const;
119
120    const StreamSetBuffers & getStreamSetInputBuffers() const {
121        return mStreamSetInputBuffers;
122    }
123
124    const parabix::StreamSetBuffer * getStreamSetInputBuffer(const unsigned i) const {
125        assert (i < mStreamSetInputBuffers.size());
126        assert (mStreamSetInputBuffers[i]);
127        return mStreamSetInputBuffers[i];
128    }
129
130    const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
131        const auto port = getStreamPort(name);
132        assert (port.first == Port::Input);
133        return getStreamSetInputBuffer(port.second);
134    }
135
136    const StreamSetBuffers & getStreamSetOutputBuffers() const {
137        return mStreamSetOutputBuffers;
138    }
139
140    const Binding & getStreamInput(const unsigned i) const {
141        return KernelInterface::getStreamInput(i);
142    }
143
144    const Binding & getStreamInput(const std::string & name) const {
145        const auto port = getStreamPort(name);
146        assert (port.first == Port::Input);
147        return KernelInterface::getStreamInput(port.second);
148    }
149
150    const Binding & getStreamInput(const parabix::StreamSetBuffer * const buffer) const {
151        for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
152            if (mStreamSetInputBuffers[i] == buffer) {
153                return getStreamInput(i);
154            }
155        }
156        throw std::runtime_error("no output binding found given buffer");
157    }
158
159    const parabix::StreamSetBuffer * getStreamSetOutputBuffer(const unsigned i) const {
160        assert (i < mStreamSetOutputBuffers.size());
161        assert (mStreamSetOutputBuffers[i]);
162        return mStreamSetOutputBuffers[i];
163    }
164
165    const parabix::StreamSetBuffer * getOutputStreamSetBuffer(const std::string & name) const {
166        const auto port = getStreamPort(name);
167        assert (port.first == Port::Output);
168        return getStreamSetOutputBuffer(port.second);
169    }
170
171    const Binding & getStreamOutput(const unsigned i) const {
172        return KernelInterface::getStreamOutput(i);
173    }
174
175    const Binding & getStreamOutput(const parabix::StreamSetBuffer * const buffer) const {
176        for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
177            if (mStreamSetOutputBuffers[i] == buffer) {
178                return getStreamOutput(i);
179            }
180        }
181        throw std::runtime_error("no output binding found given buffer");
182    }
183
184    const Binding & getStreamOutput(const std::string & name) const {
185        const auto port = getStreamPort(name);
186        assert (port.first == Port::Output);
187        return KernelInterface::getStreamOutput(port.second);
188    }
189   
190    // Kernels typically perform block-at-a-time processing, but some kernels may require
191    // a different stride.   In the case of multiblock kernels, the stride attribute
192    // determines the number of minimum number of items that will be provided to the kernel
193    // on each doMultiBlock call.
194    //
195   
196    unsigned getStride() const { return mStride; }
197   
198    virtual ~Kernel() = 0;
199
200    void prepareKernel(const std::unique_ptr<KernelBuilder> & idb);
201
202    void prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb);
203
204    std::string getCacheName(const std::unique_ptr<KernelBuilder> & idb) const;
205
206protected:
207
208    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { }
209
210    void getDoSegmentFunctionArguments(const std::vector<llvm::Value *> & availItems) const;
211
212    // Constructor
213    Kernel(std::string && kernelName, Bindings && stream_inputs,
214          Bindings && stream_outputs,
215          Bindings && scalar_parameters,
216          Bindings && scalar_outputs,
217          Bindings && internal_scalars);
218
219    llvm::Value * getPrincipalItemCount() const {
220        return mAvailablePrincipalItemCount;
221    }
222
223    unsigned getScalarIndex(const std::string & name) const;
224
225    void prepareStreamSetNameMap();
226   
227    void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> &) override { }
228
229    virtual void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) { }
230   
231    virtual void generateKernelMethod(const std::unique_ptr<KernelBuilder> & iBuilder) = 0;
232
233    virtual void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) { }
234
235    // Add an additional scalar field to the KernelState struct.
236    // Must occur before any call to addKernelDeclarations or createKernelModule.
237    unsigned addScalar(llvm::Type * type, const std::string & name);
238
239    unsigned addUnnamedScalar(llvm::Type * type);
240
241    void callGenerateInitializeMethod(const std::unique_ptr<KernelBuilder> & idb);
242
243    void callGenerateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb);
244
245    void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb);
246
247    const parabix::StreamSetBuffer * getAnyStreamSetBuffer(const std::string & name) const {
248        unsigned index; Port port;
249        std::tie(port, index) = getStreamPort(name);
250        if (port == Port::Input) {
251            assert (index < mStreamSetInputBuffers.size());
252            assert (mStreamSetInputBuffers[index]);
253            return mStreamSetInputBuffers[index];
254        } else {
255            assert (index < mStreamSetOutputBuffers.size());
256            assert (mStreamSetOutputBuffers[index]);
257            return mStreamSetOutputBuffers[index];
258        }
259    }
260
261    void setStride(unsigned stride) { mStride = stride; }
262
263private:
264
265    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb);
266
267    llvm::Value * getStreamSetInputAddress(const std::string & name) const {
268        const Kernel::StreamPort p = getStreamPort(name);
269        assert (p.first == Port::Input);
270        return mStreamSetInputBaseAddress[p.second];
271    }
272
273    llvm::Value * getStreamSetOutputAddress(const std::string & name) const {
274        const Kernel::StreamPort p = getStreamPort(name);
275        assert (p.first == Port::Output);
276        return mStreamSetOutputBaseAddress[p.second];
277    }
278
279    llvm::Value * getAvailableItemCount(const unsigned i) const {
280        return mAvailableItemCount[i];
281    }
282
283    void normalizeStreamProcessingRates();
284
285    bool normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate);
286
287protected:
288
289    llvm::Function *                    mCurrentMethod;
290    llvm::Value *                       mAvailablePrincipalItemCount;
291    bool                                mIsGenerated;
292    unsigned                            mStride;
293    llvm::Value *                       mIsFinal;
294    llvm::Value *                       mOutputScalarResult;
295    std::vector<llvm::Value *>          mAvailableItemCount;
296
297    KernelFieldMap                      mKernelFieldMap;
298    std::vector<llvm::Type *>           mKernelFields;
299
300    StreamMap                           mStreamMap;
301
302    StreamSetBuffers                    mStreamSetInputBuffers;
303    std::vector<llvm::Value *>          mStreamSetInputBaseAddress;
304    StreamSetBuffers                    mStreamSetOutputBuffers;
305    std::vector<llvm::Value *>          mStreamSetOutputBaseAddress;
306};
307
308using Kernels = std::vector<Kernel *>;
309
310class SegmentOrientedKernel : public Kernel {
311protected:
312
313    SegmentOrientedKernel(std::string && kernelName,
314                          Bindings && stream_inputs,
315                          Bindings && stream_outputs,
316                          Bindings && scalar_parameters,
317                          Bindings && scalar_outputs,
318                          Bindings && internal_scalars);
319protected:
320
321    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
322
323    virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
324
325};
326
327/*
328The Multi-Block Kernel Builder
329------------------------------
330
331The Multi-Block Kernel Builder is designed to simplify the programming of
332efficient kernels with possibly variable and/or nonaligned output, subject to
333exact or MaxRatio processing constraints.   The following restrictions apply.
334
335#.  The input consists of one or more stream sets, the first of which is
336    known as the principal input stream set.
337
338#.  If there is more than one input stream set, the additional stream sets
339    are first classified as having either a derived processing rate or
340    a variable processing rate.   Stream sets with a derived processing rate
341    have a processing rate defined with respect to the input stream set of one
342    of the following types:  FixedRate, Add1 or RoundUp.    Note that stream sets
343    declared without a processing rate attribute have the FixedRate(1) attribute
344    by default and therefore satisfy this constraint.  All other processing rate
345    types are classified as variable rate.
346
347#.  All output stream sets must be declared with processing rate attributes
348    of one of the following types:
349    *  FixedRate, Add1, Roundup, or MaxRatio with respect to the principal input stream set.
350    *  FixedRate with respect to some other output stream set.
351
352    When using the Multi-Block Kernel Builder to program a new type of kernel,
353    the programmer must implement the generateDoMultiBlockMethod for normal
354    multi-block processing according to the requirements below, as well as
355    providing for special final block processing, if necessary.
356
357#.  The doMultiBlockMethod will be called with the following parameters:
358    * the number of items of the principal input stream to process (itemsToDo),
359    * additional items available parameters for each additional input stream set
360      that is classified as a variable rate stream set
361    * pointers to linear contiguous buffer areas for each of the input stream sets, and
362    * pointers to linear contiguous output buffer areas for each of the output stream sets.
363
364    Notes:
365    * if the kernel has a Lookahead dependency declared on any input stream set, then
366      there will be two buffer pointers for that stream set, one for accessing stream set
367      items without lookahead and one for accessing the items with lookahead.   
368    * pointers are to the beginning of the block corresponding to the
369      processedItemCount or producedItemCount of the given stream set.
370    * the base type of each pointer is the StreamSetBlockType of that streamset
371
372#.  The Multi-Block Kernel Builder will arrange that these input parameters may be
373    processed under the following simplifying assumptions.
374    * the number of itemsToDo will either be an exact multiple of the kernel stride,
375      or, for processing the final block, a value less than the kernel stride
376    * the input buffer of the principal stream set and all input buffers of stream sets
377      with derived processing rates will be safe to access and have data available in
378      accord with their processing rates based on the given number of itemsToDo
379      of the principal input stream set; no further bounds checking is needed. 
380    * input buffers of stream sets with MaxRatio attributes will be safe to access,
381      but will only have valid data as specified by the available items parameter for
382      that stream set.
383    * the kernel programmer is responsible for safe access and bounds checking for any
384      input stream set classified as Unknown rate.   No temporary buffers are used
385      for such stream sets.
386    * all output buffers will be safe to access and have space available
387      for the given maximum output generation rates based on the given number
388      of itemsToDo of the principal input stream set; no further bounds checking
389      is needed.
390    * for final block processing, all input buffers will be extended to be safely
391      treated as containing data corresponding to a full block of the principal
392      input stream set, with the actual data in each buffer padded with null values
393      beyond the end of input.  Similarly, all output buffers will contain space
394      sufficient for the maximum output that can be generated for a full block of
395      input processing.
396    * input and output pointers will be typed to allow convenient and logical access
397      to corresponding streams based on their declared stream set type and processing rate.
398    * for any input pointer p, a GEP instruction with a single int32 index i
399      will produce a pointer to the buffer position corresponding to the ith block of the
400      input stream set, relative to the initial block based on the processedItemCount.
401    * for any output stream set declared with a Fixed or Add1 processing rate with respect
402      to the principal input stream set, a GEP instruction with a single int32 index i
403      will produce a pointer to the buffer position corresponding to the ith block of the
404      stream set, relative to the initial block based on the producedItemCount.
405
406#.  Upon completion of multi-block processing, the Multi-Block Kernel Builder will arrange that
407    processed and produced item counts are updated for all stream sets that have exact
408    processing rate attributes.   Programmers are responsible for updating the counts
409    of any stream set declared with a variable attribute (MaxRatio or Unknown).
410
411#.  An important caveat is that buffer areas may change arbitrarily between
412    calls to the doMultiBlockMethod.   In no case should a kernel store a
413    buffer pointer in its internal state.   Furthermore a kernel must not make
414    any assumptions about the accessibility of stream set data outside of the
415    processing range outside of the block boundaries associated with the given itemsToDo.
416*/
417
418class MultiBlockKernel : public Kernel {
419protected:
420
421    MultiBlockKernel(std::string && kernelName,
422                     Bindings && stream_inputs,
423                     Bindings && stream_outputs,
424                     Bindings && scalar_parameters,
425                     Bindings && scalar_outputs,
426                     Bindings && internal_scalars);
427
428    // Each multi-block kernel subtype must provide its own logic for handling
429    // doMultiBlock calls, subject to the requirements laid out above.
430    // The generateMultiBlockLogic must be written to generate this logic, given
431    // a created but empty function.  Upon entry to generateMultiBlockLogic,
432    // the builder insertion point will be set to the entry block; upone
433    // exit the RetVoid instruction will be added to complete the method.
434    //
435    virtual llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
436
437private:
438
439    // Given a kernel subtype with an appropriate interface, the generateDoSegment
440    // method of the multi-block kernel builder makes all the necessary arrangements
441    // to translate doSegment calls into a minimal sequence of doMultiBlock calls.
442    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
443
444    unsigned getItemAlignment(const Binding & binding) const;
445
446    bool isTransitivelyUnknownRate(const ProcessingRate & rate) const;
447
448    llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate);
449
450    bool requiresCopyBack(const ProcessingRate & rate) const;
451
452    void reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b);
453
454protected:
455
456    std::vector<llvm::Value *>      mInitialAvailableItemCount;
457    std::vector<llvm::Value *>      mInitialProcessedItemCount;
458    std::vector<llvm::Value *>      mInitialProducedItemCount;
459
460};
461
462
463class BlockOrientedKernel : public MultiBlockKernel {
464protected:
465
466    void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b);
467
468    // Each kernel builder subtype must provide its own logic for generating
469    // doBlock calls.
470    virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
471
472    // Each kernel builder subtypre must also specify the logic for processing the
473    // final block of stream data, if there is any special processing required
474    // beyond simply calling the doBlock function.   In the case that the final block
475    // processing may be trivially implemented by dispatching to the doBlock method
476    // without additional preparation, the default generateFinalBlockMethod need
477    // not be overridden.
478
479    virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
480
481    BlockOrientedKernel(std::string && kernelName,
482                        Bindings && stream_inputs,
483                        Bindings && stream_outputs,
484                        Bindings && scalar_parameters,
485                        Bindings && scalar_outputs,
486                        Bindings && internal_scalars);
487
488private:
489
490    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
491
492    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b);
493
494    void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
495
496    llvm::Value * getRemainingItems(const std::unique_ptr<KernelBuilder> & b);
497
498private:
499
500    llvm::Function *            mDoBlockMethod;
501    llvm::BasicBlock *          mStrideLoopBody;
502    llvm::IndirectBrInst *      mStrideLoopBranch;
503    llvm::PHINode *             mStrideLoopTarget;
504    llvm::PHINode *             mStrideBlockIndex;
505};
506
507}
508
509#endif
Note: See TracBrowser for help on using the repository browser.