source: icGREP/icgrep-devel/icgrep/kernels/attributes.h @ 6034

Last change on this file since 6034 was 5985, checked in by nmedfort, 17 months ago

Restructured MultiBlock? kernel. Removal of Swizzled buffers. Inclusion of PopCount? rates / non-linear access. Modifications to several kernels to better align them with the kernel and pipeline changes.

File size: 18.5 KB
Line 
1#ifndef ATTRIBUTES_H
2#define ATTRIBUTES_H
3
4#include <vector>
5#include <llvm/Support/Compiler.h>
6#include <assert.h>
7
8namespace kernel {
9
10struct Attribute {
11
12    enum class KindId {
13
14        /** INPUT STREAM ATTRIBUTES **/
15
16        LookAhead, /// NOT DONE
17
18        // A LookAhead(n) attribute on an input stream set S declares that the kernel
19        // looks ahead n positions in the input stream.  That is, processing of item
20        // S[i, j] may be defined in terms of S[i, j+n].
21
22        // Guarantee required: the pipeline compiler must ensure that, when
23        // the kernel is called with an available item count of N for the
24        // input stream, that the corresponding input data buffer has
25        // valid items in the buffer up to and including position N+n,
26        // and that the blocks containing the items from N-1 through N+n are
27        // linearly contiguous.  When the final doMultiBlock call is made,
28        // the pipeline compiler must ensure that the n items at lookahead
29        // positions are zero-initialized.
30
31        // (Note: this guarantee anticipates using a single buffer pointer
32        // for both normal stream access and lookahead access.   This avoids
33        // the cost of extra parameters per doMultiBlock call, but requires
34        // that the corresponding circular buffer have a "lookahead extension area"
35        // that holds a copy of the data at the physical start of buffer).
36
37        LookBehind, /// NOT DONE
38
39        // A LookBehind(n) attribute on an input stream S declares that the kernel
40        // requires access to input items up to n positions prior to the current
41        // processed item position.
42
43        // (Notes: this may allow more efficient advances within n (without saving state).
44        // However, a lookbehind extension area prior to the normal buffer base address
45        // is necessary, which must be initially zero-filled.)
46
47        // A LookBehind(n) attribute on an output stream S declares that the kernel
48        // requires access to up to n previously generated output items.
49        // (Example: lz4d lookbehind(65536)).
50
51        Principal,
52
53        // One input stream can be declared as the principal input buffer for a kernel.
54        // If a kernel has a principal input stream, when processing the final stride,
55        // a MultiBlockKernel assumes the item count of the principle is the correct
56        // one and zero extends / truncates all other input streams to match it.
57
58        Deferred,
59
60        // Normally, the processed item count of fixed rate streams is automatically
61        // updated by the MultiBlock kernel. However, some streams behave like Fixed
62        // rate streams (in that they will always eventually process a Fixed amount of
63        // data) but the kernel processes the data in unpredictable chunks. Rather than
64        // declaring those as Unknown or Bounded rates, marking their rate calculation
65        // as Deferred provides the pipeline with a stronger guarantee when it comes to
66        // buffer size calculations.
67
68        ZeroExtend, /// NOT DONE
69
70        // If the available item count of an input stream it less than some other input
71        // stream(s), it will be zero-extended to the length of the larger stream. If
72        // this option is not set and the kernel does not have a MustExplicitlyTerminate
73        // attribute, it will end once any input has been exhausted.
74
75        IndependentRegionBegin, IndependentRegionEnd, /// NOT DONE
76
77        // Some kernels can divide their processing into concrete non-overlapping regions
78        // between a beginning and ending position. This is a hard guarantee that regardless
79        // of the computations between the start of the stream and the beginning of the first
80        // independent region or between the *beginning* of any two independent regions, A,
81        // B, the calculations that occur prior to the beginning of B do not affect the
82        // calculations after it --- even if A is started at an arbitrary position with a
83        // zeroed-out kernel state.
84
85        // If a kernel K is processed simultaneously by two threads, K_0 and K_1, and K_1 is
86        // waiting K_0 to finish and update it's kernel state for K_1 to resume at, K_1 can
87        // compute what its state will be and begin processing before K_0 is finished. This
88        // requires a the pipeline to intervene and call an optimized "output-less" instance
89        // of the kernel prior to calling B.
90
91        ConditionalRegionBegin, ConditionalRegionEnd, /// NOT DONE
92
93        // Some kernels have clearly demarcated regions in which a MultiBlock kernel will
94        // produce useful outputs for only the inputs within those regions. This attribute
95        // instructs the kernel to "zero-fill" the output of any non-selected regions,
96        // skipping strides entirely whenever possible.
97
98        // If the same regions are also independent, we can avoid the overhead of "masking
99        // out" the input streams. Otherwise a MultiBlock will use temporary buffers for all
100        // uses of the streams and zero out any non-regions from the data.
101
102        AlwaysConsume,
103
104        // Always consume the input (i.e., use the lowerbound to determine whether to there
105        // is enough data to execute a stride rather than the upper bound.)
106
107        /** OUTPUT STREAM ATTRIBUTES **/
108
109        Add,
110
111        // An Add(K) attribute states that K items will be added to this stream after
112        // processing the final block.
113
114        RoundUpTo,
115
116        // A RoundUpTo(k) attribute indicates the final item count of this stream will
117        // be rounded up to the nearest multiple of k
118
119        /** INPUT/OUTPUT STREAM ATTRIBUTES **/
120
121        Misaligned,
122
123        // Assume that we cannot statically compute the alignment of this stream set and
124        // perform any operations accordingly
125
126        BlockSize,
127
128        // Typically a kernel assumes that each stream of a stream set is a linear sequence
129        // of items. The BlockSize(K) attribute informs the kernel is actually divided into
130        // (BlockWidth / K) elements and each "stream" actually contains K items of the
131        // first stream followed by K elements of the second stream and so on. The notion
132        // of produced/processed item count changes to suite. I.e., when typical kernels
133        // report that they've processed/produced up to the i-th position, it means:
134
135        //                                         v
136        //                 ...|AAAAAAAA AAAAAAAA AA*              |...
137        //                 ...|BBBBBBBB BBBBBBBB BB*              |...
138        //                 ...|CCCCCCCC CCCCCCCC CC*              |...
139        //                 ...|DDDDDDDD DDDDDDDD DD*              |...
140
141        // However, if (BlockWidth / K) is 4, the same i-th position above is actually:
142
143        //                 ...|AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|...
144        //                 ...|AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|...
145        //                 ...|AA*     |BB*     |CC*     |DD*     |...
146        //                 ...|        |        |        |        |...
147
148        // (Note: this replaces the concept of swizzling and anticipates that the pipeline
149        // will take on the role of automatically inserting the swizzling code necessary).
150
151        ReverseRegionBegin, ReverseRegionEnd, /// NOT DONE
152
153        // Conceptually, reversing a stream S is simple: {S_1,...,S_n} -> {S_n,...,S_1}.
154        // However, this means all of the input data must be computed and stored prior to
155        // executing this kernel. In practice, this is unnecessary as in the context of
156        // text parsing, we're almost always searching for the true starting position of
157        // something ambigious after we've found its end position in some prior kernel.
158
159
160//        Here is a revised definition of SegmentedReverse:
161
162//        Given a stream of data bits S that is considered to be divided into
163//        segments, and a marker stream S having a one bit at the final position
164//        of each segment, the R = SegmentedReverse(S, M) when
165
166//        R_{i} = S_{l + (h - i)}
167//              where l = the maximum j such that j <= i and either j = 0 or M_{j-1} = 1
168//          and where h = the minimum j such that j >= i and either j = length(S) -  or M_j = 1
169//          (l and h are the low and high positions of the segment containing i)
170
171//        This is an invertible operation, so we can apply R to a kernel's input
172//        and then to its output to get a SegmentedReverse version of a kernel
173
174//        A kernel which computes segmented reverse is feasible, but seems complex
175//        to implement, and probably too slow.  I have played around with several
176//        ways of tackling it, no good method yet.
177
178//        If there are multiple segments within a block, we could instead use
179//        the following:
180
181//        BlockSegmentedReverse
182
183//        B_{i} = S_{L + (H - i)}
184//             where l = the maximum j such that j <= i and either j = 0 or M_{j-1} = 1
185//                   h = the minimum j such that j >= i and either j = length(S) -  or M_j = 1
186//                   L = l if l div BlockSize < h divBlockSize, otherwise (i div BlockSize) * BlockSize
187//                   H = h if l div BlockSize < h divBlockSize, otherwise L + BlockSize - 1
188
189//        An alternative way of looking at this is to eliminate all but the first
190//        and last marker positions within a block.
191
192//        The main point is that, if we apply B to inputs, perform the kernel
193//        and the apply B to outputs, we get the same result if we applied R
194//        (assuming that the kernel computations do not cross boundaries in M).
195
196//        This will be more efficient to compute, but still involves overhead
197//        for shifting and combining streams.
198
199//        I think it may be better to focus on the ReverseKernel adapter, that
200//        handles the reverse operations for both input and output.   This actually
201//        gives more flexibility, because, in a multiblock scenario, we can process
202//        the longest sequence of blocks such that both the beginning and end blocks
203//        have a one bit.   If there are any interior blocks with one bits, then
204//        they will be handled automatically without special shifting and masking.
205
206//        By the way, in my designs, I am wanting to have a callable Multiblock
207//        function, so that the Multiblock function for a Reversed Kernel just
208//        does a little work before calling the Multiblock function of the base kernel.
209//        That seems to have disappeared in the current system.
210
211
212        RequiresLinearAccess, PermitsNonLinearAccess,
213
214        // Indicates whether all unprocessed / consumed space is safely accessible by the
215        // MultiBlockKernel code. By default, input streams and any output stream in which
216        // we know a priori exactly how much data will be written into the overflow buffer
217        // are opt-out and all others are opt-in. The reason is that writing non-linear
218        // output at a non-Fixed rate be costly to manage. E.g.,
219
220        //                             BUFFER          v   OVERFLOW
221        //                |?????############...........###|#####???|
222        //                                 n           p  k    m
223
224        // Suppose from a given offset p, we write n items but only have space for k items
225        // in the stream set buffer. Assuming we wrote more than one stride, we know that
226        // there are (m - k) items in the overflow but may not know what our value of m is
227        // unless we can derive the relationship between m and n a priori. The problem is
228        // that the kernel will write the second stride's output at the (m - k)-th position
229        // of the 0-th block and but final reported count will be n. We can safely mitigate
230        // this in many ways:
231
232        // (1) when we detect that we could write into the overflow region of the buffer,
233        // we can zero out the memory of both the overflow *and* the 0-th block of the
234        // buffer then combine both by OR-ing the streams and writing them to the 0-th
235        // block. The advantage is we require no extra memory but the disadvantage is that
236        // the kernel is now relies on the pipeline to ensure that whenever we may write
237        // into the overflow that the 0-th block is fully consumed.
238
239        // (2) the overflow region is equal to the size of the buffer (i.e., employ double
240        // buffering.) The advantage of this is the kernel makes no assumptions about the
241        // pipeline itself. The disadvantage is we could have to copy a lot of data if k
242        // is very small and the amount we will copy is variable.
243
244        // (3) use stack allocated temporary buffers. This method has similar advantages /
245        // disadvantages to 2 but trades heap space allocations for stack based ones.
246
247        // (4) force people writing kernels to record the number of items written each
248        // stride. The advantage of this is it would be as cheap as (1) but requires the
249        // kernel writer maintain the current stride index and that the kernel logic has
250        // a natural breakpoint in the algorithm in which to record the number.
251
252        /** KERNEL ATTRIBUTES **/
253
254        SelectMinimumInputLength, /// NOT DONE
255
256        // If a kernel has multiple input streams and their final item count differs,
257        // a MultiBlock kernel will select the *minimum* input item count as it's
258        // principle item length and truncate the streams to fit
259
260        // NOTE: this is the default if a kernel does not have SelectMaximumInputLength
261        // set and no PrincipalInputStream was declared.
262
263        SelectMaximumInputLength, /// NOT DONE
264
265        // If a kernel has multiple input streams and their final item count differs,
266        // a MultiBlock kernel will select the *maximum* input item count as it's
267        // principle item length and zero-extend the streams accordingly.
268
269        CanTerminateEarly,
270
271        // Indicates that this kernel can call setTerminationSignal() to terminate the
272        // kernel prior to processing all of its input streams.
273
274        MustExplicitlyTerminate,
275
276        // This kernel terminates *only* after the programmer calls setTerminationSignal()
277        // and will be called even when there is no new input data when the prior kernels
278        // in the pipeline have also terminated.
279
280        MustProcessAll,
281
282        //Workaround, the kernel will finish only when all of the inputs are consumed
283
284    };
285
286    KindId getKind() const {
287        return mKind;
288    }
289
290    bool isAdd() const {
291        return mKind == KindId::Add;
292    }
293
294    bool isPrincipal() const {
295        return mKind == KindId::Principal;
296    }
297
298    bool isRoundUpTo() const {
299        return mKind == KindId::RoundUpTo;
300    }
301
302    bool isBlockSize() const {
303        return mKind == KindId::BlockSize;
304    }
305
306    unsigned amount() const {
307        return mAmount;
308    }
309
310    void setAmount(const unsigned amount) {
311        mAmount = amount;
312    }
313
314    bool operator == (const Attribute & other) const {
315        return mKind == other.mKind && mAmount == other.mAmount;
316    }
317
318    bool operator != (const Attribute & other) const {
319        return !(*this == other);
320    }
321
322protected:
323
324    friend struct AttributeSet;
325    friend struct Binding;
326    friend Attribute Add1();
327    friend Attribute BlockSize(const unsigned k);
328    friend Attribute Principal();
329    friend Attribute AlwaysConsume();
330    friend Attribute RoundUpTo(const unsigned);
331    friend Attribute LookAhead(const unsigned);
332    friend Attribute LookBehind(const unsigned);
333    friend Attribute Deferred();
334    friend Attribute Misaligned();
335    friend Attribute ConditionalRegionBegin();
336    friend Attribute ConditionalRegionEnd();
337    friend Attribute CanTerminateEarly();
338    friend Attribute MustExplicitlyTerminate();
339    friend Attribute RequiresLinearAccess();
340    friend Attribute PermitsNonLinearAccess();
341
342    Attribute(const KindId kind, const unsigned k) : mKind(kind), mAmount(k) { }
343
344private:
345
346    const KindId    mKind;
347    unsigned        mAmount;
348};
349
350struct AttributeSet : public std::vector<Attribute> {
351
352    using AttributeId = Attribute::KindId;
353
354    const AttributeSet & getAttributes() const {
355        return *this;
356    }
357
358    Attribute & findOrAddAttribute(const AttributeId id) {
359        if (Attribute * const attr = __findAttribute(id)) {
360            return *attr;
361        } else {
362            return addAttribute(Attribute(id, 0));
363        }
364    }
365
366    Attribute & findAttribute(const AttributeId id) const {
367        return *__findAttribute(id);
368    }
369
370    Attribute & addAttribute(Attribute attribute);
371
372    bool LLVM_READNONE hasAttributes() const {
373        return !empty();
374    }
375
376    bool LLVM_READNONE hasAttribute(const AttributeId id) const {
377        return __findAttribute(id) != nullptr;
378    }
379
380    AttributeSet() = default;
381
382    AttributeSet(Attribute && attr) { emplace_back(std::move(attr)); }
383
384    AttributeSet(std::initializer_list<Attribute> attrs) : std::vector<Attribute>(attrs) { }
385
386private:
387
388    Attribute * __findAttribute(const AttributeId id) const;
389
390};
391
392inline Attribute Add1() {
393    return Attribute(Attribute::KindId::Add, 1);
394}
395
396inline Attribute RoundUpTo(const unsigned k) {
397    return Attribute(Attribute::KindId::RoundUpTo, k);
398}
399
400inline Attribute AlwaysConsume() {
401    return Attribute(Attribute::KindId::AlwaysConsume, 0);
402}
403
404inline Attribute Principal() {
405    return Attribute(Attribute::KindId::Principal, 0);
406}
407
408inline Attribute LookAhead(const unsigned k) {
409    return Attribute(Attribute::KindId::LookAhead, k);
410}
411
412inline Attribute LookBehind(const unsigned k) {
413    return Attribute(Attribute::KindId::LookBehind, k);
414}
415
416inline Attribute Deferred() {
417    return Attribute(Attribute::KindId::Deferred, 0);
418}
419
420inline Attribute BlockSize(const unsigned k) {
421    assert (k && ((k & (k - 1)) == 0));
422    return Attribute(Attribute::KindId::BlockSize, k);
423}
424
425inline Attribute Swizzled() {
426    return BlockSize(64);
427}
428
429inline Attribute RequiresLinearAccess() {
430    return Attribute(Attribute::KindId::RequiresLinearAccess, 0);
431}
432
433inline Attribute PermitsNonLinearAccess() {
434    return Attribute(Attribute::KindId::PermitsNonLinearAccess, 0);
435}
436
437inline Attribute Misaligned() {
438    return Attribute(Attribute::KindId::Misaligned, 0);
439}
440
441inline Attribute ConditionalRegionBegin() {
442    return Attribute(Attribute::KindId::ConditionalRegionBegin, 0);
443}
444
445inline Attribute ConditionalRegionEnd() {
446    return Attribute(Attribute::KindId::ConditionalRegionEnd, 0);
447}
448
449inline Attribute CanTerminateEarly() {
450    return Attribute(Attribute::KindId::CanTerminateEarly, 0);
451}
452
453inline Attribute MustExplicitlyTerminate() {
454    return Attribute(Attribute::KindId::MustExplicitlyTerminate, 0);
455}
456
457}
458#endif // ATTRIBUTES_H
Note: See TracBrowser for help on using the repository browser.