1 | /* |
---|
2 | * Created on: 18-December-2011 |
---|
3 | * Author: Ken Herdy |
---|
4 | * |
---|
5 | * Length partitioned parallel bit stream hashing with chaining. |
---|
6 | * |
---|
7 | */ |
---|
8 | #ifndef HASH_TABLE_HPP |
---|
9 | #define HASH_TABLE_HPP |
---|
10 | |
---|
11 | /*============================================================================= |
---|
12 | allocator.hpp - Coterminal memory pool allocators. |
---|
13 | Created on: 18-December-2011 |
---|
14 | Author: Ken Herdy |
---|
15 | =============================================================================*/ |
---|
16 | |
---|
17 | //#define HASH_TABLE_HPP_DEBUG |
---|
18 | |
---|
19 | #define MIN(a, b) ((a < b) ? a : b) |
---|
20 | #define MAX(a, b) ((a > b) ? a : b) |
---|
21 | #define MAX_TABLE_SIZE 65536 |
---|
22 | |
---|
23 | #include "../lib/bitblock.hpp" |
---|
24 | #include "../lib/byte_compare.hpp" |
---|
25 | #include "../lib/allocator.hpp" |
---|
26 | #include "../lib/s2p.hpp" |
---|
27 | #include "../lib/byte_pool.hpp" |
---|
28 | #include "../lib/hash.hpp" |
---|
29 | #include <cmath> |
---|
30 | #include <stdint.h> |
---|
31 | #include <string> |
---|
32 | #include <sstream> |
---|
33 | #include <iostream> |
---|
34 | using namespace std; |
---|
35 | |
---|
36 | typedef uint32_t gid_type; |
---|
37 | |
---|
38 | typedef struct node { |
---|
39 | uint8_t * raw_bytes; |
---|
40 | uint32_t raw_bytes_lgth; |
---|
41 | uint8_t * h0; |
---|
42 | uint8_t * h1; |
---|
43 | uint32_t hash_bit_lgth; |
---|
44 | gid_type gid; |
---|
45 | node * next; |
---|
46 | } node; |
---|
47 | |
---|
48 | class gid { |
---|
49 | public: |
---|
50 | static uint64_t next() { return value++; } |
---|
51 | private: |
---|
52 | static uint64_t value; |
---|
53 | }; |
---|
54 | |
---|
55 | /* Global GID for all hash_tables */ |
---|
56 | uint64_t gid::value = 1; |
---|
57 | |
---|
58 | /* |
---|
59 | TODO - Move raw_data_pool up into a base class |
---|
60 | that so all hash table implementation share the same memory pool. |
---|
61 | */ |
---|
62 | |
---|
63 | template<class COMPARE_STRATEGY, class HASH_STRATEGY, class ALLOCATOR> |
---|
64 | class hash_table { |
---|
65 | public: |
---|
66 | |
---|
67 | hash_table(const uint32_t table_size=4096, const uint32_t load_factor=2) { |
---|
68 | init(table_size,load_factor); |
---|
69 | } |
---|
70 | |
---|
71 | virtual ~hash_table() { |
---|
72 | destroy_table(this->table, this->table_size); |
---|
73 | } |
---|
74 | |
---|
75 | IDISA_ALWAYS_INLINE uint64_t get_bucket(const uint8_t * h0, const uint8_t * h1, const uint32_t idx, const uint32_t slice_bits) { |
---|
76 | return hash_strategy.hash(h0,h1,idx,slice_bits,hash_size); |
---|
77 | } |
---|
78 | |
---|
79 | IDISA_ALWAYS_INLINE void insert(const uint64_t bucket, uint8_t * raw_bytes, uint32_t raw_byte_lgth, uint8_t * h0, uint8_t * h1, uint32_t hash_bit_lgth, gid_type gid) { |
---|
80 | |
---|
81 | node * next = (node *) malloc(sizeof(node)); |
---|
82 | if (next == NULL) { |
---|
83 | cerr << "Out of Memory" << endl; |
---|
84 | abort(); |
---|
85 | } |
---|
86 | |
---|
87 | |
---|
88 | next->raw_bytes = raw_bytes; |
---|
89 | next->raw_bytes_lgth = raw_byte_lgth; |
---|
90 | next->h0 = h0; |
---|
91 | next->h1 = h1; |
---|
92 | next->hash_bit_lgth = hash_bit_lgth; |
---|
93 | next->gid = gid; |
---|
94 | |
---|
95 | next->next = table[bucket]; |
---|
96 | table[bucket] = next; |
---|
97 | } |
---|
98 | |
---|
99 | IDISA_ALWAYS_INLINE gid_type lookup_or_insert(uint8_t * raw_bytes, const uint32_t idx, const uint32_t raw_byte_lgth, |
---|
100 | uint8_t * h0, uint8_t * h1, const uint32_t hash_bit_lgth) { |
---|
101 | |
---|
102 | uint64_t bucket = get_bucket(h0,h1,idx,hash_bit_lgth); |
---|
103 | |
---|
104 | node * crt = table[bucket]; |
---|
105 | node * prev = crt; |
---|
106 | |
---|
107 | /* lookup */ |
---|
108 | while(NULL != crt) { |
---|
109 | if(compare_strategy.compare(&raw_bytes[idx], crt->raw_bytes, raw_byte_lgth)) { |
---|
110 | return crt->gid; |
---|
111 | } |
---|
112 | prev = crt; |
---|
113 | crt = crt->next; |
---|
114 | } |
---|
115 | |
---|
116 | /* insert */ |
---|
117 | gid_type gid = gid::next(); |
---|
118 | |
---|
119 | uint64_t x0 = bit_slice(h0, idx, hash_bit_lgth); |
---|
120 | uint64_t x1 = bit_slice(h1, idx, hash_bit_lgth); |
---|
121 | |
---|
122 | #ifdef HASH_TABLE_HPP_DEBUG |
---|
123 | cout << "Insert: "; |
---|
124 | cout << "Idx: " << idx << ", "; |
---|
125 | cout << "GID: " << gid << ", "; |
---|
126 | cout << "Val: '" << string((char *) &raw_bytes[idx],0,raw_byte_lgth) << "', "; |
---|
127 | cout << "Bkt: " << bucket << endl; |
---|
128 | #endif |
---|
129 | |
---|
130 | insert( bucket, |
---|
131 | raw_data_pool.insert(&raw_bytes[idx],raw_byte_lgth), |
---|
132 | raw_byte_lgth, |
---|
133 | raw_data_pool.insert((uint8_t *)&x0, bits2bytes(hash_bit_lgth)), |
---|
134 | raw_data_pool.insert((uint8_t *)&x1, bits2bytes(hash_bit_lgth)), |
---|
135 | hash_bit_lgth, |
---|
136 | gid); |
---|
137 | |
---|
138 | if(expansion_test(table[bucket])) { |
---|
139 | expand(); |
---|
140 | } |
---|
141 | // elements++; |
---|
142 | return gid; |
---|
143 | } |
---|
144 | |
---|
145 | void print_table() const { |
---|
146 | |
---|
147 | for(int i=0;i<table_size;i++) { |
---|
148 | |
---|
149 | node * head = table[i]; |
---|
150 | node * crt = head; |
---|
151 | |
---|
152 | if(head != NULL) { cout << "table[" << i << "]"; } |
---|
153 | |
---|
154 | while(crt != NULL) { |
---|
155 | cout << "->" << "(GID:" << crt->gid |
---|
156 | << ", Val:'" << string((char *)&crt->raw_bytes[0], crt->raw_bytes_lgth) |
---|
157 | << "', Len:" << crt->raw_bytes_lgth << ")"; |
---|
158 | crt = crt->next; |
---|
159 | } |
---|
160 | |
---|
161 | if(head != NULL) { cout << endl; } |
---|
162 | } |
---|
163 | } |
---|
164 | |
---|
165 | private: |
---|
166 | // ----- Members ----- |
---|
167 | uint32_t table_size; |
---|
168 | uint32_t hash_size; |
---|
169 | uint32_t load_factor; |
---|
170 | |
---|
171 | node ** table; |
---|
172 | // uint32_t elements; |
---|
173 | byte_pool<ALLOCATOR> raw_data_pool; |
---|
174 | COMPARE_STRATEGY compare_strategy; |
---|
175 | HASH_STRATEGY hash_strategy; |
---|
176 | |
---|
177 | // ----- Helpers ----- |
---|
178 | uint32_t log2msb(uint32_t v) { |
---|
179 | uint32_t r = 0; |
---|
180 | while (v >>= 1) {r++;} |
---|
181 | return r; |
---|
182 | } |
---|
183 | |
---|
184 | void init(const uint32_t size, const uint32_t factor) { |
---|
185 | |
---|
186 | hash_size = MIN((log2msb(size)),hash_strategy.max_hashsize()); |
---|
187 | table_size = pow(2.0, (double)hash_size); |
---|
188 | table = (node**) calloc(table_size, sizeof(node*)); |
---|
189 | |
---|
190 | if ((table) == NULL) { |
---|
191 | cerr << "Out of Memory" << endl; |
---|
192 | abort(); |
---|
193 | } |
---|
194 | // elements = 0; |
---|
195 | load_factor = factor; |
---|
196 | } |
---|
197 | |
---|
198 | void destroy_table(node ** table, const uint32_t table_size) { |
---|
199 | node * crt = NULL; |
---|
200 | node * next = NULL; |
---|
201 | |
---|
202 | for(int i=0;i<table_size;i++) { |
---|
203 | crt = table[i]; |
---|
204 | while(crt != NULL) { |
---|
205 | next = crt->next; |
---|
206 | free((node *)crt); |
---|
207 | crt = next; |
---|
208 | } |
---|
209 | } |
---|
210 | |
---|
211 | free((node **) table); |
---|
212 | table = NULL; |
---|
213 | } |
---|
214 | |
---|
215 | uint32_t chain_lgth(const node * chain) { |
---|
216 | uint32_t lgth = 0; |
---|
217 | node * crt = (node *)chain; |
---|
218 | while(NULL != crt) { |
---|
219 | lgth++; |
---|
220 | crt = crt->next; //? |
---|
221 | } |
---|
222 | return lgth; |
---|
223 | } |
---|
224 | |
---|
225 | bool expansion_test(const node * chain) { |
---|
226 | |
---|
227 | if ((chain_lgth(chain) > load_factor) |
---|
228 | && (hash_size < hash_strategy.max_hashsize()) |
---|
229 | && table_size <= MAX_TABLE_SIZE) { |
---|
230 | return true; |
---|
231 | } |
---|
232 | return false; |
---|
233 | } |
---|
234 | |
---|
235 | void expand() { // naive expansion |
---|
236 | |
---|
237 | node ** prev_table = this->table; |
---|
238 | uint32_t prev_size = this->table_size; |
---|
239 | |
---|
240 | init(table_size*2, load_factor); // TODO - WARNING - CONSIDER MAX MEMORY |
---|
241 | |
---|
242 | uint64_t bucket; |
---|
243 | node * crt; |
---|
244 | |
---|
245 | for(int i=0;i<prev_size;i++) { |
---|
246 | crt = prev_table[i]; |
---|
247 | |
---|
248 | while(crt != NULL) { |
---|
249 | bucket = this->get_bucket(crt->h0, |
---|
250 | crt->h1, |
---|
251 | 0, |
---|
252 | crt->hash_bit_lgth); |
---|
253 | this->insert(bucket, |
---|
254 | crt->raw_bytes, |
---|
255 | crt->raw_bytes_lgth, |
---|
256 | crt->h0, |
---|
257 | crt->h1, |
---|
258 | crt->hash_bit_lgth, |
---|
259 | crt->gid); |
---|
260 | |
---|
261 | crt = crt->next; |
---|
262 | } |
---|
263 | } |
---|
264 | |
---|
265 | destroy_table(prev_table,prev_size); |
---|
266 | crt = NULL; |
---|
267 | |
---|
268 | for(int i=0;i<table_size;i++) { |
---|
269 | if(expansion_test(table[i])) { |
---|
270 | expand(); |
---|
271 | } |
---|
272 | } |
---|
273 | } |
---|
274 | |
---|
275 | }; |
---|
276 | |
---|
277 | /* Length specialized strategy classes. Hash, Compare */ |
---|
278 | |
---|
279 | /* Templated approach: |
---|
280 | (1) Avoids duplicate code, |
---|
281 | (2) Avoid vtable overhead, |
---|
282 | (3) Introduces additional coding complexity. |
---|
283 | */ |
---|
284 | |
---|
285 | /* Hash Strategy */ |
---|
286 | class hash_strategy { |
---|
287 | public: |
---|
288 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits); |
---|
289 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize(); |
---|
290 | protected: |
---|
291 | hash_strategy() {} |
---|
292 | }; |
---|
293 | |
---|
294 | template<uint32_t LGTH> |
---|
295 | class hash_strategy_t: public hash_strategy { |
---|
296 | public: |
---|
297 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits); |
---|
298 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize(); |
---|
299 | }; |
---|
300 | |
---|
301 | /* Specialized 'Byte' hash functions. */ |
---|
302 | template<> class hash_strategy_t<1> { |
---|
303 | public: |
---|
304 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
305 | return h0[idx]; |
---|
306 | } |
---|
307 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return bytes2bits(1); } |
---|
308 | }; |
---|
309 | |
---|
310 | template<> class hash_strategy_t<2> { |
---|
311 | public: |
---|
312 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
313 | return bit_compress_hash(h0, h1, bytes2bits(idx), slice_bits, hash_bits); // TODO byte_compress_hash |
---|
314 | } |
---|
315 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return bytes2bits(2); } |
---|
316 | }; |
---|
317 | |
---|
318 | template<> class hash_strategy_t<3> { |
---|
319 | public: |
---|
320 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
321 | return bit_compress_hash(h0, h1, bytes2bits(idx), slice_bits, hash_bits); |
---|
322 | } |
---|
323 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return bytes2bits(3); } |
---|
324 | }; |
---|
325 | |
---|
326 | template<> class hash_strategy_t<4>: public hash_strategy { |
---|
327 | public: |
---|
328 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
329 | return bit_compress_hash(h0, h1, bytes2bits(idx), slice_bits, hash_bits); |
---|
330 | }; |
---|
331 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return bytes2bits(4); } |
---|
332 | }; |
---|
333 | |
---|
334 | template<> class hash_strategy_t<5>: public hash_strategy { |
---|
335 | public: |
---|
336 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
337 | return bit_compress_hash(h0, h1, bytes2bits(idx), slice_bits, hash_bits); |
---|
338 | }; |
---|
339 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return bytes2bits(5); } |
---|
340 | }; |
---|
341 | |
---|
342 | template<> class hash_strategy_t<6>: public hash_strategy { |
---|
343 | public: |
---|
344 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
345 | return bit_compress_hash(h0, h1, bytes2bits(idx), slice_bits, hash_bits); |
---|
346 | }; |
---|
347 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return bytes2bits(6); } |
---|
348 | }; |
---|
349 | |
---|
350 | template<> class hash_strategy_t<7>: public hash_strategy { |
---|
351 | public: |
---|
352 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
353 | return bit_compress_hash(h0, h1, bytes2bits(idx), slice_bits, hash_bits); |
---|
354 | }; |
---|
355 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return bytes2bits(7); } |
---|
356 | }; |
---|
357 | |
---|
358 | template<> class hash_strategy_t<8>: public hash_strategy { |
---|
359 | public: |
---|
360 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
361 | return bit_compress_hash(h0, h1, idx, slice_bits, hash_bits); |
---|
362 | }; |
---|
363 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return 8; } |
---|
364 | }; |
---|
365 | |
---|
366 | /* Default */ |
---|
367 | class hash_strategy_d: public hash_strategy { |
---|
368 | public: |
---|
369 | static IDISA_ALWAYS_INLINE uint64_t hash(const uint8_t * h0, const uint8_t * h1, const int32_t idx, const uint32_t slice_bits, const uint32_t hash_bits) { |
---|
370 | return bit_compress_hash(h0, h1, idx, slice_bits, hash_bits); // expect h0 as hash bit stream |
---|
371 | } |
---|
372 | /* WARNING: Min bit count of any default (non-specialized) bit_compress_hash. */ |
---|
373 | static IDISA_ALWAYS_INLINE uint32_t max_hashsize() { return 9; } |
---|
374 | }; |
---|
375 | |
---|
376 | /* Compare Strategy */ |
---|
377 | class compare_strategy { |
---|
378 | public: |
---|
379 | static IDISA_ALWAYS_INLINE bool compare(uint8_t * x, uint8_t * y, const uint32_t lgth=0); |
---|
380 | protected: |
---|
381 | compare_strategy() {} |
---|
382 | }; |
---|
383 | |
---|
384 | /* Length specific comparison */ |
---|
385 | template<class T, uint32_t LGTH> |
---|
386 | class identity_strategy_t: public compare_strategy { |
---|
387 | public: |
---|
388 | static IDISA_ALWAYS_INLINE bool compare(uint8_t * x, uint8_t * y, const uint32_t lgth=0) { |
---|
389 | return overlap_compare<T,LGTH>(x,y); |
---|
390 | } |
---|
391 | }; |
---|
392 | |
---|
393 | /* Default */ |
---|
394 | class identity_strategy_d: public compare_strategy { |
---|
395 | public: |
---|
396 | static IDISA_ALWAYS_INLINE bool compare(uint8_t * x, uint8_t * y, const uint32_t lgth=0) { |
---|
397 | return mem_compare(x,y,lgth); |
---|
398 | } |
---|
399 | }; |
---|
400 | |
---|
401 | #endif // HASH_TABLE_HPP |
---|