source: icGREP/icgrep-devel/performance/Regxp.cl.template

Last change on this file was 4126, checked in by linmengl, 5 years ago

initial checkin of performance script; based on Ben's script, will do correctness check first and record performance data in json format. A python script will be developed to visualize performance difference.

File size: 7.9 KB
Line 
1#pragma OPENCL EXTENSION cl_amd_printf:enable
2
3#define WORK_GROUP_SIZE 64
4#define NUMBER_OF_GROUP 64
5#define OVERLAP 0
6#define BUFFER_ELEMENTS 10000
7#define BitBlock ulong
8#define CARRY_BIT_MASK 0x8000000000000000
9
10/******************Library Functions 64-bit********************/
11#define simd_or(b1, b2) (b1 | b2)
12#define simd_and(b1, b2) (b1 & b2)
13#define simd_xor(b1, b2) (b1 ^ b2)
14#define simd_andc(b1, b2) (b1 & (~b2))
15#define simd_not(b) (~b)
16#define simd_nor(a,b) (simd_not(simd_or(a,b)))
17
18#define simd_himask_2 0xAAAAAAAAAAAAAAAA
19#define simd_himask_4 0xCCCCCCCCCCCCCCCC
20#define simd_himask_8 0xF0F0F0F0F0F0F0F0
21/* Little-endian */
22#define simd_himask_16 0xFF00FF00FF00FF00
23#define simd_himask_32 0xFFFF0000FFFF0000
24
25#define bitblock_has_bit(v) (v!=0)
26
27#define sisd_slli(r, shft) (r<<shft)
28#define sisd_srli(r, shft) (r>>shft)
29
30#define sisd_from_int(n) n
31
32#define simd_const_1(x)\
33  ((x)==0 ? 0 : 0xFFFFFFFFFFFFFFFF)
34
35#define simd_if(cond, then_val, else_val) \
36  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
37
38#define simd_pack_16_hh(b, a)\
39(a & 0xFF00000000000000)|((a & 0x0000FF0000000000)<<8)|((a & 0X00000000FF000000)<<16)|((a & 0X000000000000FF00)<<24)|\
40((b & 0xFF00000000000000)>>32)|((b & 0x0000FF0000000000)>>24)|((b & 0X00000000FF000000)>>16)|((b & 0X000000000000FF00)>>8)
41
42#define simd_pack_16_ll(b, a)\
43((a & 0x00FF000000000000)<<8)|((a & 0x000000FF00000000)<<16)|((a & 0X0000000000FF0000)<<24)|((a & 0X00000000000000FF)<<32)|\
44((b & 0x00FF000000000000)>>24)|((b & 0x000000FF00000000)>>16)|((b & 0X0000000000FF0000)>>8)|(b & 0X00000000000000FF)
45
46#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
47  do {\
48        BitBlock t0,t1;\
49        t0 = simd_pack_16_hh(s0, s1);\
50        t1 = simd_pack_16_ll(s0, s1);\
51        p0 = simd_if(hi_mask, t0, (t1 >> shift));\
52        p1 = simd_if(hi_mask, (t0 << shift), t1);\
53  } while(0)
54
55#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
56  do {\
57        BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3;\
58        BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3;\
59        BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1;\
60        BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1;\
61        s2p_step(s0,s1,simd_himask_2,1,bit00224466_0,bit11335577_0);\
62        s2p_step(s2,s3,simd_himask_2,1,bit00224466_1,bit11335577_1);\
63        s2p_step(s4,s5,simd_himask_2,1,bit00224466_2,bit11335577_2);\
64        s2p_step(s6,s7,simd_himask_2,1,bit00224466_3,bit11335577_3);\
65        s2p_step(bit00224466_0,bit00224466_1,simd_himask_4,2,bit00004444_0,bit22226666_0);\
66        s2p_step(bit00224466_2,bit00224466_3,simd_himask_4,2,bit00004444_1,bit22226666_1);\
67        s2p_step(bit11335577_0,bit11335577_1,simd_himask_4,2,bit11115555_0,bit33337777_0);\
68        s2p_step(bit11335577_2,bit11335577_3,simd_himask_4,2,bit11115555_1,bit33337777_1);\
69        s2p_step(bit00004444_0,bit00004444_1,simd_himask_8,4,p0,p4);\
70        s2p_step(bit11115555_0,bit11115555_1,simd_himask_8,4,p1,p5);\
71        s2p_step(bit22226666_0,bit22226666_1,simd_himask_8,4,p2,p6);\
72        s2p_step(bit33337777_0,bit33337777_1,simd_himask_8,4,p3,p7);\
73  } while(0)
74
75inline BitBlock pablo_advance(int idx, BitBlock a, __local BitBlock *carry, BitBlock *group_carry, const int carryno){
76        carry[0] = group_carry[carryno];
77        BitBlock c = a<<1;
78        carry[idx+1] = (a & CARRY_BIT_MASK)>>63;
79        barrier(CLK_LOCAL_MEM_FENCE);
80        group_carry[carryno] = carry[WORK_GROUP_SIZE];
81        c = c | carry[idx];
82        return c;
83}
84
85inline BitBlock pablo_or(int idx, BitBlock a, __local BitBlock *local_carry){
86        local_carry[idx] = a;
87        barrier(CLK_LOCAL_MEM_FENCE);
88        for(int offset=WORK_GROUP_SIZE/2; offset>0; offset=offset>>1){
89                local_carry[idx] = local_carry[idx]|local_carry[idx^offset];
90                barrier(CLK_LOCAL_MEM_FENCE);
91        }
92        return local_carry[0];
93}
94
95inline BitBlock adc(int idx, BitBlock a, BitBlock b, __local BitBlock *carry, __local BitBlock *bubble, BitBlock *group_carry, const int carryno){
96        BitBlock carry_mask;
97        BitBlock bubble_mask;
98
99        BitBlock partial_sum = a+b;
100        BitBlock gen = a&b;
101        BitBlock prop = a^b;
102        carry[idx] = ((gen | (prop & ~partial_sum))&CARRY_BIT_MASK)>>(WORK_GROUP_SIZE-1-idx);
103        bubble[idx] = (partial_sum + 1)? 0:(((BitBlock)1)<<idx);
104       
105        barrier(CLK_LOCAL_MEM_FENCE);
106        for(int offset=WORK_GROUP_SIZE/2; offset>0; offset=offset>>1){
107                carry[idx] = carry[idx]|carry[idx^offset];
108                bubble[idx] = bubble[idx]|bubble[idx^offset];
109                barrier(CLK_LOCAL_MEM_FENCE);
110        }
111       
112        carry_mask = (carry[0]<<1)|group_carry[carryno];
113        bubble_mask = bubble[0];
114       
115        BitBlock s = (carry_mask + bubble_mask) & ~bubble_mask;
116        BitBlock inc = s | (s-carry_mask);
117        BitBlock rslt = partial_sum + ((inc>>idx)&0x1);
118        group_carry[carryno] = (carry[0]|(bubble_mask & inc))>>63;
119        return rslt;
120}
121
122inline BitBlock scanthru(int idx, BitBlock markers, BitBlock charclass, __local BitBlock *carry, __local BitBlock *bubble, BitBlock *group_carry, const int carryno) {
123        return simd_andc(adc(idx, markers, charclass, carry, bubble, group_carry, carryno), charclass);
124}
125
126inline BitBlock scanto(int idx, BitBlock markers, BitBlock charclass, __local BitBlock *carry, __local BitBlock *bubble, BitBlock *group_carry, const int carryno) {
127        return simd_and(adc(idx, markers, simd_not(charclass), carry, bubble, group_carry, carryno), charclass);
128}
129
130inline BitBlock pablo_blk_MatchStar(int idx, BitBlock marker, BitBlock charclass, __local BitBlock *carry, __local BitBlock *bubble, BitBlock *group_carry, const int carryno) {
131        BitBlock sum = adc(idx, simd_and(charclass, marker), charclass, carry, bubble, group_carry, carryno);
132        return simd_or(simd_xor(sum, charclass), marker);
133}
134
135inline void CarryInit(BitBlock * carry, int count){
136    for (int j=0; j < count; j++)
137         carry[j] = 0;
138}
139
140inline int CarryTest(BitBlock * cq, const int carryno, const int carry_count) {
141  BitBlock c1 = cq[carryno];
142  for (int i = carryno + 1; i < carryno + carry_count; i++) {
143    c1 = c1 | cq[i];
144  }
145  return bitblock_has_bit(c1);
146}
147
148static inline int popcount32(unsigned int x)
149{
150     x -= (x >> 1) & 0x55555555;
151     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
152     x = (x + (x >> 4)) & 0x0F0F0F0F;
153     x += x >> 8;
154     return (x + (x >> 16)) & 0x3F;
155}
156
157static inline int popcount64(BitBlock x)
158{
159         x -= (x >> 1) & 0x5555555555555555;
160     x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
161     x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0F;
162     x += x >> 8;
163         x += x >> 16;
164     return (x + (x >> 32)) & 0x7F;   
165}
166
167/**************App Structs***************/
168  struct Basis_bits {
169  BitBlock bit_0;
170  BitBlock bit_1;
171  BitBlock bit_2;
172  BitBlock bit_3;
173  BitBlock bit_4;
174  BitBlock bit_5;
175  BitBlock bit_6;
176  BitBlock bit_7;
177};
178
179  struct Lex {
180{{{@lex_definition}}}
181};
182  struct Output {
183  BitBlock matches;
184};
185
186
187#define classify_bytes_do_block(basis_bits, lex)\
188do {\
189{{{@classify_bytes_do_block}}}
190} while (0)
191 
192#define demo_do_block(idx, lex, output, carry, bubble, group_carry)\
193do {\
194{{{@demo_do_block}}}
195} while (0)
196
197__kernel
198void RegxpMatch(__global BitBlock *s,
199                   __global BitBlock *match_pos,
200                        __local BitBlock *carry,
201                        __local BitBlock *bubble) {
202
203        int lid = get_local_id(0);
204        int gid = get_global_id(0);
205        int group = gid/WORK_GROUP_SIZE;
206        struct Basis_bits basis_bits;
207        struct Lex lex;
208        struct Output output;
209        int match_count = 0;
210        BitBlock regxp_match_carryQ[50];
211        CarryInit(regxp_match_carryQ, 50);
212
213        int buffer_start = (BUFFER_ELEMENTS-OVERLAP)*group+lid;
214        for(int idx = buffer_start; idx < buffer_start + BUFFER_ELEMENTS; idx=idx+WORK_GROUP_SIZE){
215                s2p_bytepack(s[idx*8], s[idx*8+1], s[idx*8+2], s[idx*8+3], s[idx*8+4], s[idx*8+5], s[idx*8+6], s[idx*8+7],
216                basis_bits.bit_0, basis_bits.bit_1, basis_bits.bit_2, basis_bits.bit_3,
217                basis_bits.bit_4, basis_bits.bit_5, basis_bits.bit_6, basis_bits.bit_7);
218               
219                classify_bytes_do_block(basis_bits, lex);
220                demo_do_block(lid, lex, output, carry, bubble, regxp_match_carryQ);
221
222                match_count += popcount64(output.matches);
223        }
224        match_pos[gid] = match_count;
225}
226
227       
Note: See TracBrowser for help on using the repository browser.