source: icGREP/icgrep-devel/icgrep/kernels/attributes.h @ 6161

Last change on this file since 6161 was 6047, checked in by nmedfort, 17 months ago

Major refactoring of buffer types. Static buffers replace Circular and CircularCopyback?. External buffers unify Source/External?.

File size: 18.6 KB
Line 
1#ifndef ATTRIBUTES_H
2#define ATTRIBUTES_H
3
4#include <vector>
5#include <llvm/Support/Compiler.h>
6#include <assert.h>
7
8namespace kernel {
9
10struct Attribute {
11
12    enum class KindId {
13
14        /** INPUT STREAM ATTRIBUTES **/
15
16        LookAhead, /// NOT DONE
17
18        // A LookAhead(n) attribute on an input stream set S declares that the kernel
19        // looks ahead n positions in the input stream.  That is, processing of item
20        // S[i, j] may be defined in terms of S[i, j+n].
21
22        // Guarantee required: the pipeline compiler must ensure that, when
23        // the kernel is called with an available item count of N for the
24        // input stream, that the corresponding input data buffer has
25        // valid items in the buffer up to and including position N+n,
26        // and that the blocks containing the items from N-1 through N+n are
27        // linearly contiguous.  When the final doMultiBlock call is made,
28        // the pipeline compiler must ensure that the n items at lookahead
29        // positions are zero-initialized.
30
31        // (Note: this guarantee anticipates using a single buffer pointer
32        // for both normal stream access and lookahead access.   This avoids
33        // the cost of extra parameters per doMultiBlock call, but requires
34        // that the corresponding circular buffer have a "lookahead extension area"
35        // that holds a copy of the data at the physical start of buffer).
36
37        LookBehind, /// NOT DONE
38
39        // A LookBehind(n) attribute on an input stream S declares that the kernel
40        // requires access to input items up to n positions prior to the current
41        // processed item position.
42
43        // (Notes: this may allow more efficient advances within n (without saving state).
44        // However, a lookbehind extension area prior to the normal buffer base address
45        // is necessary, which must be initially zero-filled.)
46
47        // A LookBehind(n) attribute on an output stream S declares that the kernel
48        // requires access to up to n previously generated output items.
49        // (Example: lz4d lookbehind(65536)).
50
51        Principal,
52
53        // One input stream can be declared as the principal input buffer for a kernel.
54        // If a kernel has a principal input stream, when processing the final stride,
55        // a MultiBlockKernel assumes the item count of the principle is the correct
56        // one and zero extends / truncates all other input streams to match it.
57
58        Deferred,
59
60        // Normally, the processed item count of fixed rate streams is automatically
61        // updated by the MultiBlock kernel. However, some streams behave like Fixed
62        // rate streams (in that they will always eventually process a Fixed amount of
63        // data) but the kernel processes the data in unpredictable chunks. Rather than
64        // declaring those as Unknown or Bounded rates, marking their rate calculation
65        // as Deferred provides the pipeline with a stronger guarantee when it comes to
66        // buffer size calculations.
67
68        ZeroExtend, /// NOT DONE
69
70        // If the available item count of an input stream it less than some other input
71        // stream(s), it will be zero-extended to the length of the larger stream. If
72        // this option is not set and the kernel does not have a MustExplicitlyTerminate
73        // attribute, it will end once any input has been exhausted.
74
75        IndependentRegionBegin, IndependentRegionEnd, /// NOT DONE
76
77        // Some kernels can divide their processing into concrete non-overlapping regions
78        // between a beginning and ending position. This is a hard guarantee that regardless
79        // of the computations between the start of the stream and the beginning of the first
80        // independent region or between the *beginning* of any two independent regions, A,
81        // B, the calculations that occur prior to the beginning of B do not affect the
82        // calculations after it --- even if A is started at an arbitrary position with a
83        // zeroed-out kernel state.
84
85        // If a kernel K is processed simultaneously by two threads, K_0 and K_1, and K_1 is
86        // waiting K_0 to finish and update it's kernel state for K_1 to resume at, K_1 can
87        // compute what its state will be and begin processing before K_0 is finished. This
88        // requires a the pipeline to intervene and call an optimized "output-less" instance
89        // of the kernel prior to calling B.
90
91        ConditionalRegionBegin, ConditionalRegionEnd, /// NOT DONE
92
93        // Some kernels have clearly demarcated regions in which a MultiBlock kernel will
94        // produce useful outputs for only the inputs within those regions. This attribute
95        // instructs the kernel to "zero-fill" the output of any non-selected regions,
96        // skipping strides entirely whenever possible.
97
98        // If the same regions are also independent, we can avoid the overhead of "masking
99        // out" the input streams. Otherwise a MultiBlock will use temporary buffers for all
100        // uses of the streams and zero out any non-regions from the data.
101
102        AlwaysConsume,
103
104        // Always consume the input (i.e., use the lowerbound to determine whether to there
105        // is enough data to execute a stride rather than the upper bound.)
106
107        /** OUTPUT STREAM ATTRIBUTES **/
108
109        Add,
110
111        // An Add(K) attribute states that K items will be added to this stream after
112        // processing the final block.
113
114        RoundUpTo,
115
116        // A RoundUpTo(k) attribute indicates the final item count of this stream will
117        // be rounded up to the nearest multiple of k
118
119        /** INPUT/OUTPUT STREAM ATTRIBUTES **/
120
121        Misaligned,
122
123        // Assume that we cannot statically compute the alignment of this stream set and
124        // perform any operations accordingly
125
126        BlockSize,
127
128        // Typically a kernel assumes that each stream of a stream set is a linear sequence
129        // of items. The BlockSize(K) attribute informs the kernel is actually divided into
130        // (BlockWidth / K) elements and each "stream" actually contains K items of the
131        // first stream followed by K elements of the second stream and so on. The notion
132        // of produced/processed item count changes to suite. I.e., when typical kernels
133        // report that they've processed/produced up to the i-th position, it means:
134
135        //                                         v
136        //                 ...|AAAAAAAA AAAAAAAA AA*              |...
137        //                 ...|BBBBBBBB BBBBBBBB BB*              |...
138        //                 ...|CCCCCCCC CCCCCCCC CC*              |...
139        //                 ...|DDDDDDDD DDDDDDDD DD*              |...
140
141        // However, if (BlockWidth / K) is 4, the same i-th position above is actually:
142
143        //                 ...|AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|...
144        //                 ...|AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|...
145        //                 ...|AA*     |BB*     |CC*     |DD*     |...
146        //                 ...|        |        |        |        |...
147
148        // (Note: this replaces the concept of swizzling and anticipates that the pipeline
149        // will take on the role of automatically inserting the swizzling code necessary).
150
151        ReverseRegionBegin, ReverseRegionEnd, /// NOT DONE
152
153        // Conceptually, reversing a stream S is simple: {S_1,...,S_n} -> {S_n,...,S_1}.
154        // However, this means all of the input data must be computed and stored prior to
155        // executing this kernel. In practice, this is unnecessary as in the context of
156        // text parsing, we're almost always searching for the true starting position of
157        // something ambigious after we've found its end position in some prior kernel.
158
159
160//        Here is a revised definition of SegmentedReverse:
161
162//        Given a stream of data bits S that is considered to be divided into
163//        segments, and a marker stream S having a one bit at the final position
164//        of each segment, the R = SegmentedReverse(S, M) when
165
166//        R_{i} = S_{l + (h - i)}
167//              where l = the maximum j such that j <= i and either j = 0 or M_{j-1} = 1
168//          and where h = the minimum j such that j >= i and either j = length(S) -  or M_j = 1
169//          (l and h are the low and high positions of the segment containing i)
170
171//        This is an invertible operation, so we can apply R to a kernel's input
172//        and then to its output to get a SegmentedReverse version of a kernel
173
174//        A kernel which computes segmented reverse is feasible, but seems complex
175//        to implement, and probably too slow.  I have played around with several
176//        ways of tackling it, no good method yet.
177
178//        If there are multiple segments within a block, we could instead use
179//        the following:
180
181//        BlockSegmentedReverse
182
183//        B_{i} = S_{L + (H - i)}
184//             where l = the maximum j such that j <= i and either j = 0 or M_{j-1} = 1
185//                   h = the minimum j such that j >= i and either j = length(S) -  or M_j = 1
186//                   L = l if l div BlockSize < h divBlockSize, otherwise (i div BlockSize) * BlockSize
187//                   H = h if l div BlockSize < h divBlockSize, otherwise L + BlockSize - 1
188
189//        An alternative way of looking at this is to eliminate all but the first
190//        and last marker positions within a block.
191
192//        The main point is that, if we apply B to inputs, perform the kernel
193//        and the apply B to outputs, we get the same result if we applied R
194//        (assuming that the kernel computations do not cross boundaries in M).
195
196//        This will be more efficient to compute, but still involves overhead
197//        for shifting and combining streams.
198
199//        I think it may be better to focus on the ReverseKernel adapter, that
200//        handles the reverse operations for both input and output.   This actually
201//        gives more flexibility, because, in a multiblock scenario, we can process
202//        the longest sequence of blocks such that both the beginning and end blocks
203//        have a one bit.   If there are any interior blocks with one bits, then
204//        they will be handled automatically without special shifting and masking.
205
206//        By the way, in my designs, I am wanting to have a callable Multiblock
207//        function, so that the Multiblock function for a Reversed Kernel just
208//        does a little work before calling the Multiblock function of the base kernel.
209//        That seems to have disappeared in the current system.
210
211
212        RequiresLinearAccess, PermitsNonLinearAccess,
213
214        // Indicates whether all unprocessed / consumed space is safely accessible by the
215        // MultiBlockKernel code. By default, input streams and any output stream in which
216        // we know a priori exactly how much data will be written into the overflow buffer
217        // are opt-out and all others are opt-in. The reason is that writing non-linear
218        // output at a non-Fixed rate be costly to manage. E.g.,
219
220        //                             BUFFER          v   OVERFLOW
221        //                |?????############...........###|#####???|
222        //                                 n           p  k    m
223
224        // Suppose from a given offset p, we write n items but only have space for k items
225        // in the stream set buffer. Assuming we wrote more than one stride, we know that
226        // there are (m - k) items in the overflow but may not know what our value of m is
227        // unless we can derive the relationship between m and n a priori. The problem is
228        // that the kernel will write the second stride's output at the (m - k)-th position
229        // of the 0-th block and but final reported count will be n. We can safely mitigate
230        // this in many ways:
231
232        // (1) when we detect that we could write into the overflow region of the buffer,
233        // we can zero out the memory of both the overflow *and* the 0-th block of the
234        // buffer then combine both by OR-ing the streams and writing them to the 0-th
235        // block. The advantage is we require no extra memory but the disadvantage is that
236        // the kernel is now relies on the pipeline to ensure that whenever we may write
237        // into the overflow that the 0-th block is fully consumed.
238
239        // (2) the overflow region is equal to the size of the buffer (i.e., employ double
240        // buffering.) The advantage of this is the kernel makes no assumptions about the
241        // pipeline itself. The disadvantage is we could have to copy a lot of data if k
242        // is very small and the amount we will copy is variable.
243
244        // (3) use stack allocated temporary buffers. This method has similar advantages /
245        // disadvantages to 2 but trades heap space allocations for stack based ones.
246
247        // (4) force people writing kernels to record the number of items written each
248        // stride. The advantage of this is it would be as cheap as (1) but requires the
249        // kernel writer maintain the current stride index and that the kernel logic has
250        // a natural breakpoint in the algorithm in which to record the number.
251
252        Expandable, /// NOT DONE
253
254        // Indicates that the number of stream sets in this buffer can increase.
255
256        /** KERNEL ATTRIBUTES **/
257
258        SelectMinimumInputLength, /// NOT DONE
259
260        // If a kernel has multiple input streams and their final item count differs,
261        // a MultiBlock kernel will select the *minimum* input item count as it's
262        // principle item length and truncate the streams to fit
263
264        // NOTE: this is the default if a kernel does not have SelectMaximumInputLength
265        // set and no PrincipalInputStream was declared.
266
267        SelectMaximumInputLength, /// NOT DONE
268
269        // If a kernel has multiple input streams and their final item count differs,
270        // a MultiBlock kernel will select the *maximum* input item count as it's
271        // principle item length and zero-extend the streams accordingly.
272
273        CanTerminateEarly,
274
275        // Indicates that this kernel can call setTerminationSignal() to terminate the
276        // kernel prior to processing all of its input streams.
277
278        MustExplicitlyTerminate,
279
280        // This kernel terminates *only* after the programmer calls setTerminationSignal()
281        // and will be called even when there is no new input data when the prior kernels
282        // in the pipeline have also terminated.
283
284        MustProcessAll,
285
286        //Workaround, the kernel will finish only when all of the inputs are consumed
287
288    };
289
290    KindId getKind() const {
291        return mKind;
292    }
293
294    bool isAdd() const {
295        return mKind == KindId::Add;
296    }
297
298    bool isPrincipal() const {
299        return mKind == KindId::Principal;
300    }
301
302    bool isRoundUpTo() const {
303        return mKind == KindId::RoundUpTo;
304    }
305
306    bool isBlockSize() const {
307        return mKind == KindId::BlockSize;
308    }
309
310    unsigned amount() const {
311        return mAmount;
312    }
313
314    void setAmount(const unsigned amount) {
315        mAmount = amount;
316    }
317
318    bool operator == (const Attribute & other) const {
319        return mKind == other.mKind && mAmount == other.mAmount;
320    }
321
322    bool operator != (const Attribute & other) const {
323        return !(*this == other);
324    }
325
326protected:
327
328    friend struct AttributeSet;
329    friend struct Binding;
330    friend Attribute Add1();
331    friend Attribute BlockSize(const unsigned k);
332    friend Attribute Principal();
333    friend Attribute AlwaysConsume();
334    friend Attribute RoundUpTo(const unsigned);
335    friend Attribute LookAhead(const unsigned);
336    friend Attribute LookBehind(const unsigned);
337    friend Attribute Deferred();
338    friend Attribute Misaligned();
339    friend Attribute ConditionalRegionBegin();
340    friend Attribute ConditionalRegionEnd();
341    friend Attribute CanTerminateEarly();
342    friend Attribute MustExplicitlyTerminate();
343    friend Attribute RequiresLinearAccess();
344    friend Attribute PermitsNonLinearAccess();
345
346    Attribute(const KindId kind, const unsigned k) : mKind(kind), mAmount(k) { }
347
348private:
349
350    const KindId    mKind;
351    unsigned        mAmount;
352};
353
354struct AttributeSet : public std::vector<Attribute> {
355
356    using AttributeId = Attribute::KindId;
357
358    const AttributeSet & getAttributes() const {
359        return *this;
360    }
361
362    Attribute & findOrAddAttribute(const AttributeId id) {
363        if (Attribute * const attr = __findAttribute(id)) {
364            return *attr;
365        } else {
366            return addAttribute(Attribute(id, 0));
367        }
368    }
369
370    Attribute & findAttribute(const AttributeId id) const {
371        return *__findAttribute(id);
372    }
373
374    Attribute & addAttribute(Attribute attribute);
375
376    bool LLVM_READNONE hasAttributes() const {
377        return !empty();
378    }
379
380    bool LLVM_READNONE hasAttribute(const AttributeId id) const {
381        return __findAttribute(id) != nullptr;
382    }
383
384    AttributeSet() = default;
385
386    AttributeSet(Attribute && attr) { emplace_back(std::move(attr)); }
387
388    AttributeSet(std::initializer_list<Attribute> attrs) : std::vector<Attribute>(attrs) { }
389
390private:
391
392    Attribute * __findAttribute(const AttributeId id) const;
393
394};
395
396inline Attribute Add1() {
397    return Attribute(Attribute::KindId::Add, 1);
398}
399
400inline Attribute RoundUpTo(const unsigned k) {
401    return Attribute(Attribute::KindId::RoundUpTo, k);
402}
403
404inline Attribute AlwaysConsume() {
405    return Attribute(Attribute::KindId::AlwaysConsume, 0);
406}
407
408inline Attribute Principal() {
409    return Attribute(Attribute::KindId::Principal, 0);
410}
411
412inline Attribute LookAhead(const unsigned k) {
413    return Attribute(Attribute::KindId::LookAhead, k);
414}
415
416inline Attribute LookBehind(const unsigned k) {
417    return Attribute(Attribute::KindId::LookBehind, k);
418}
419
420inline Attribute Deferred() {
421    return Attribute(Attribute::KindId::Deferred, 0);
422}
423
424inline Attribute BlockSize(const unsigned k) {
425    assert (k && ((k & (k - 1)) == 0));
426    return Attribute(Attribute::KindId::BlockSize, k);
427}
428
429inline Attribute Swizzled() {
430    return BlockSize(64);
431}
432
433inline Attribute RequiresLinearAccess() {
434    return Attribute(Attribute::KindId::RequiresLinearAccess, 0);
435}
436
437inline Attribute PermitsNonLinearAccess() {
438    return Attribute(Attribute::KindId::PermitsNonLinearAccess, 0);
439}
440
441inline Attribute Misaligned() {
442    return Attribute(Attribute::KindId::Misaligned, 0);
443}
444
445inline Attribute ConditionalRegionBegin() {
446    return Attribute(Attribute::KindId::ConditionalRegionBegin, 0);
447}
448
449inline Attribute ConditionalRegionEnd() {
450    return Attribute(Attribute::KindId::ConditionalRegionEnd, 0);
451}
452
453inline Attribute CanTerminateEarly() {
454    return Attribute(Attribute::KindId::CanTerminateEarly, 0);
455}
456
457inline Attribute MustExplicitlyTerminate() {
458    return Attribute(Attribute::KindId::MustExplicitlyTerminate, 0);
459}
460
461}
462#endif // ATTRIBUTES_H
Note: See TracBrowser for help on using the repository browser.