source: proto/lz4d/lz4d_e_d.py @ 5520

Last change on this file since 5520 was 5520, checked in by xwa163, 22 months ago

Implement prototype of lz4 decoder using extract and deposit idea

File size: 14.7 KB
Line 
1
2import sys
3from Basis_Bits import Basis_bits
4import pablo
5
6'''
7Bitwise Operation
8'''
9def bit_test(bit, index):
10    return bit >> index & 1
11def bit_clear(bit, index):
12    mask = ~0 - (1 << index);
13    return bit & mask
14
15
16'''
17Methods that should be implemented in Pablo
18'''
19def position(bits) :
20    return len(bin(bits)) - 3
21    # can not use log2 since log2 will cause some inprecise results when the data is large
22    # return int(math.log(bits, 2))
23def singletonStream(bits):
24    return bits - ((bits - 1) & bits)
25def singleton(pos):
26    return 1 << pos
27
28# Advance by variable
29
30
31'''
32Methods that has already been supported
33'''
34def count_forward_zero(bits):
35    if bits == 0 :
36        return -1
37    return position(singletonStream(bits))
38
39def extend_match_length(fromIndex, extender_bits):
40    return count_forward_zero(~(extender_bits >> fromIndex))
41
42'''
43Helper Functions for extract_m0_and_d
44'''
45def min_position(a,b,c,d):
46    for i in [a,b,c,d]:
47        if i > 0:
48            temp = i
49            break
50    for i in [a,b,c,d]:
51        if i > 0 and i < temp:
52            temp = i
53    return temp
54
55'''
56Kernels
57'''
58# Extract the start and end marker bits streams of compressed and uncompressed block.
59# @param file_content
60# @return compressed_start_marker
61#         compressed_end_marker
62#         uncompressed_start_marker
63#         uncompressed_end_marker
64
65def extract_block_marker(file_content):
66    offset = 0
67    compressed_start_marker = 0
68    uncompressed_start_marker = 0
69    endmark_bit_pos = 0
70    compressed_end_marker = 0
71    uncompressed_end_marker = 0
72
73    offset += 4  #Magicc Number
74
75    flag = ord(file_content[offset])
76
77    has_content_checksum = bit_test(flag, 2)
78    has_content_size = bit_test(flag, 3)
79    has_block_checksum = bit_test(flag, 4)
80
81    frame_descriptor_size = 3
82    if has_content_size:
83        frame_descriptor_size = 11
84    offset += frame_descriptor_size
85
86    # Process Data Block
87    while True:
88        blockSize = 0
89        for i in range(4):
90            blockSize += (ord(file_content[offset + i]) << (i * 8))
91            pass
92
93        if blockSize == 0:
94            # End Mark
95            break
96
97        highestIndex = 4 * 8 - 1
98        isCompressed = not bit_test(blockSize, highestIndex)
99        realBlockSize = bit_clear(blockSize, highestIndex)
100
101        offset = offset + 4
102
103        if realBlockSize == 0: #End Mark
104            break
105        elif isCompressed:
106            compressed_start_marker = compressed_start_marker | (1 << offset)
107        else:
108            uncompressed_start_marker = uncompressed_start_marker | (1 << offset)
109
110        offset = offset + realBlockSize
111
112        endmark_bit_pos != singleton(offset)
113        if isCompressed:
114            compressed_end_marker |= singleton(offset)
115        else:
116            uncompressed_end_marker |= singleton(offset)
117
118        if has_block_checksum:
119            offset = offset + 4
120
121    # Ignore Content Check Sum
122    return compressed_start_marker, compressed_end_marker, uncompressed_start_marker, uncompressed_end_marker
123
124
125
126def extract_e1_marker(file_content, basis_bits, compressed_start_marker, compressed_end_marker, extender):
127    # CC_0x0X = (~basis_bits.bit_0) & (~basis_bits.bit_1) & (~basis_bits.bit_2) & (~basis_bits.bit_3)
128
129    CC_0xFX = basis_bits.bit_0 & basis_bits.bit_1 & basis_bits.bit_2 & basis_bits.bit_3
130    CC_0xXF = basis_bits.bit_4 & basis_bits.bit_5 & basis_bits.bit_6 & basis_bits.bit_7
131
132    token_markers = 0
133    offset_marker = 0
134    literal_marker = 0
135
136    e1_marker = 0
137
138    while compressed_start_marker:
139        token_marker = singletonStream(compressed_start_marker)  # May be handle in parallel if Position() support
140        compressed_start_marker = compressed_start_marker - token_marker
141
142        while token_marker:
143            token_markers |= token_marker
144            token_pos = position(token_marker)
145
146            # print(token_pos)
147            # print(ord(file_content[token_pos]))
148
149            extended_literal = token_marker & CC_0xFX
150            extended_match = token_marker & CC_0xXF
151
152            literal_length_end = pablo.ScanThru(token_marker, extended_literal | extender)
153
154            literal_length_end_pos = position(literal_length_end)
155
156            literal_extension_size = literal_length_end_pos - token_pos
157
158            length_base = ord(file_content[literal_length_end_pos])
159
160            # Will be a Select Instruction
161            if literal_extension_size == 0:
162                literal_length = length_base >> 4
163            else:
164                literal_length = (literal_extension_size - 1) * 255 + 15 + length_base
165
166            literal_marker_bit = literal_length_end
167
168            literal_marker_bit = pablo.Advance(literal_marker_bit)
169            literal_marker_end_bit = pablo.AdvancebyPos(literal_marker_bit, literal_length)
170            # If literal_marker_bit and literal_marker_end_bit are the same, there will be no literal part in this sequence
171            real_literal_marker_bit = literal_marker_bit & (~literal_marker_end_bit)  # handle no literal part
172
173            literal_marker |= real_literal_marker_bit  # literal start bit
174            literal_marker |= pablo.AdvancebyPos(real_literal_marker_bit, literal_length)  # literal end bit
175
176            # handle e1 bits
177            new_e1_bits = literal_marker_end_bit - literal_length_end
178            e1_marker |= new_e1_bits
179
180            offset_pos = literal_length_end_pos + literal_length + 1
181            offset_marker |= (singleton(offset_pos) & ~compressed_end_marker)
182
183            c = offset_pos - token_pos + 1
184            match_length_start = pablo.AdvancebyPos(token_marker, c)  # second bit of offset
185            match_length_start = match_length_start & ~(pablo.Advance(compressed_end_marker))  # clear bit before scanThru if reached block end
186
187            match_length_extend_start = pablo.AdvancebyPos(extended_match, offset_pos - token_pos + 1)
188            match_length_end = pablo.ScanThru(match_length_start, match_length_extend_start | extender)
189
190            match_length_end = match_length_end & ~compressed_end_marker
191            # print(compressed_end_marker)
192            token_marker = pablo.Advance(match_length_end)
193    return token_markers, offset_marker, literal_marker, e1_marker
194
195
196
197# TODO may need to use some other way
198def extract_m0_and_d(file_content, literal_marker, token_marker, offset_marker, uncompressed_marker, extender_bits):
199    m0_marker = 0
200    d_uncompressed_marker = 0
201
202    cur_output_pos = 0
203
204    while literal_marker or token_marker or offset_marker or uncompressed_marker:
205        literal_one_pos = count_forward_zero(literal_marker)
206        token_one_pos = count_forward_zero(token_marker)
207        offset_one_pos = count_forward_zero(offset_marker)
208        uncompressed_one_pos = count_forward_zero(uncompressed_marker)
209        min_pos = min_position(literal_one_pos, token_one_pos, offset_one_pos, uncompressed_one_pos)
210
211        if min_pos == literal_one_pos:
212            literal_marker = bit_clear(literal_marker, literal_one_pos)
213            secondLiteralOnePos = count_forward_zero(literal_marker)
214            literal_marker = bit_clear(literal_marker, secondLiteralOnePos)
215            cur_output_pos = cur_output_pos + secondLiteralOnePos - literal_one_pos
216
217        elif min_pos == uncompressed_one_pos:
218            uncompressed_marker = bit_clear(uncompressed_marker, uncompressed_one_pos)
219            uncompressed_end_pos = count_forward_zero(uncompressed_marker)
220            uncompressed_marker = bit_clear(uncompressed_marker, uncompressed_end_pos)
221            new_output_pos = cur_output_pos + uncompressed_end_pos - uncompressed_one_pos
222            d_uncompressed_marker |= (singleton(cur_output_pos) - singleton(new_output_pos))
223            cur_output_pos = new_output_pos
224
225        elif min_pos == token_one_pos:
226            token_marker = bit_clear(token_marker, token_one_pos)
227            tempTokenByte = ord(file_content[token_one_pos])
228        elif min_pos == offset_one_pos:
229            offset_marker = bit_clear(offset_marker, offset_one_pos)
230
231            match_length_base = tempTokenByte & 0xf
232
233            extended_length = extend_match_length(offset_one_pos + 2, extender_bits)
234
235            # Will Be a Select Instruction
236            if match_length_base < 0xf:
237                match_length = match_length_base
238            else:
239                match_length = match_length_base + extended_length * 255 + ord(file_content[offset_one_pos + 2 + extended_length])
240
241            match_length += 4
242
243            m0_cur_end = cur_output_pos + match_length - 1
244            m0_mask = singleton(m0_cur_end) - singleton(cur_output_pos)
245            m0_marker |= m0_mask
246
247            cur_output_pos += match_length
248
249    # return cur_output_pos only in prototype in order to produce enough output string buffer
250    #        it will be represent by the marker length of m0 or d_uncompressed markers
251    return (m0_marker, d_uncompressed_marker, cur_output_pos)
252
253
254def deposit_compressed(file_content, compressed_start_marker, compressed_end_marker, e1_marker, m0_marker, e_uncompressed_marker, outputStrBuffer):
255
256    d_marker = (~(m0_marker | pablo.Advance(m0_marker))) & (~e_uncompressed_marker)
257
258    output_pos = 0
259
260    # Outer loop will iterate throught all of the compressed block
261    while (compressed_start_marker):
262        output_pos_marker = pablo.ScanTo(singleton(output_pos), d_marker)
263        output_pos = position(output_pos_marker)
264
265        compressed_start_bit = singletonStream(compressed_start_marker)
266        token_mark = compressed_start_bit
267        compressed_start_marker = compressed_start_marker - compressed_start_bit
268
269        compressed_end_bit = singletonStream(compressed_end_marker)
270        compressed_end_marker = compressed_end_marker - compressed_end_bit
271        compressed_end_bit_pos = position(compressed_end_bit)
272
273        # token_pos = 1 << compressed_start_pos
274        # token_mark = singleton(token_pos)
275        last_match_len_marker = pablo.ScanTo(token_mark, e1_marker)  # first e1 bit
276        last_match_len_marker_pos = position(last_match_len_marker)
277        literal_end_mark = pablo.ScanThru(last_match_len_marker, e1_marker)
278        literal_end_pos = position(literal_end_mark)
279        literal_length = literal_end_pos - last_match_len_marker_pos - 1
280
281        # Output
282        outputStrBuffer[output_pos: output_pos + literal_length] = list(file_content[last_match_len_marker_pos + 1: last_match_len_marker_pos + 1 + literal_length])
283
284        output_pos += literal_length
285        while literal_end_pos < compressed_end_bit_pos:
286            offset_pos = literal_end_pos
287
288            match_offset = ord(file_content[offset_pos]) + (ord(file_content[offset_pos+ 1]) << 8)
289            match_copy_pos = output_pos - match_offset
290
291            output_mark = singleton(output_pos)
292            match_end_mark = pablo.ScanThru(output_mark, m0_marker)
293
294            match_length = position(match_end_mark) - output_pos + 1
295
296            # Match Copy
297            for i in range(match_length):
298                outputStrBuffer[output_pos + i] = outputStrBuffer[match_copy_pos + i]
299
300            output_pos += match_length
301
302            #  /* next token */
303            last_match_len_marker = pablo.ScanTo(literal_end_mark, e1_marker)
304            last_match_len_marker_pos = position(last_match_len_marker)
305            literal_end_mark = pablo.ScanThru(last_match_len_marker, e1_marker)
306            literal_end_pos = position(literal_end_mark)
307            literal_length = literal_end_pos - position(last_match_len_marker) - 1
308
309            # Literal Copy
310            outputStrBuffer[output_pos: output_pos + literal_length] = list(
311                file_content[last_match_len_marker_pos + 1: last_match_len_marker_pos + 1 + literal_length])
312
313            output_pos += literal_length
314        pass
315
316def deposit_uncompressed(file_content, e_uncompressed_marker, e_uncompressed_end_marker, d_uncompressed_marker, outputStrBuffer):
317    output_pos = 0
318    while e_uncompressed_marker:
319        uncompressed_start_bit = singletonStream(e_uncompressed_marker)
320        uncompressed_start_pos = position(uncompressed_start_bit)
321        e_uncompressed_marker = e_uncompressed_marker - uncompressed_start_bit
322
323        uncompressed_end_bit = singletonStream(e_uncompressed_end_marker)
324        uncompressed_end_pos = position(uncompressed_end_bit)
325        e_uncompressed_end_marker = e_uncompressed_end_marker - uncompressed_end_bit
326
327        output_pos_marker = pablo.ScanTo(singleton(output_pos), d_uncompressed_marker)
328        output_pos = position(output_pos_marker)
329
330        copy_length = uncompressed_end_pos - uncompressed_start_pos
331
332        outputStrBuffer[output_pos: output_pos + copy_length] = list(
333            file_content[uncompressed_start_pos: uncompressed_end_pos]
334        )
335        output_pos += copy_length
336
337'''
338Main
339'''
340
341if __name__ == '__main__':
342    if len(sys.argv) < 3:
343        print("""
344Usage:
345python lz4d_e_d.py [input_file_path] [output_file_path]
346        """)
347    else:
348        # Prepare Input
349        inputFile = sys.argv[1]
350        outputFile = sys.argv[2]
351        with open(inputFile, "rb") as f:
352            file_content = f.read()
353        basis_bits = Basis_bits()
354        pablo.EOF_mask = pablo.transpose_streams(file_content, basis_bits)
355
356
357        # Extract start and end marker of compressed and uncompressed blocks
358        compressed_start_marker, compressed_end_marker, uncompressed_start_marker, uncompressed_end_marker = extract_block_marker(file_content)
359        e_uncompressed_marker = uncompressed_end_marker - uncompressed_start_marker
360
361        # Imitate ParabixCharacterClassKernelBuilder to produce extenders stream
362        extender_bits = basis_bits.bit_0 & basis_bits.bit_1 & basis_bits.bit_2 & basis_bits.bit_3 & basis_bits.bit_4 & basis_bits.bit_5 & basis_bits.bit_6 & basis_bits.bit_7
363
364        # Extract e1 marker (also extract token, offset and literal marker for extracting m0)
365        token_marker, offset_marker, literal_marker, e1_marker = extract_e1_marker(file_content, basis_bits, compressed_start_marker, compressed_end_marker, extender_bits)
366
367        # Extract m0 marker for compressed block, deposit (d) marker for uncompressed block
368        uncompressed_marker = uncompressed_start_marker | uncompressed_end_marker
369        m0_marker, d_uncompressed_marker, output_length = extract_m0_and_d(file_content, literal_marker, token_marker, offset_marker, uncompressed_marker, extender_bits)
370
371        # Use Array of char to imitate output string buffer of fixed length
372        outputStrBuffer = ["" for i in range(output_length)]
373
374        # Deposit compressed and uncompressed blocks
375        deposit_compressed(file_content, compressed_start_marker, compressed_end_marker, e1_marker, m0_marker, e_uncompressed_marker, outputStrBuffer)
376        deposit_uncompressed(file_content, uncompressed_start_marker, uncompressed_end_marker, d_uncompressed_marker, outputStrBuffer)
377
378
379        outputStr = ''.join(outputStrBuffer)
380
381        with open(outputFile, 'w') as f:
382            f.write(outputStr)
383
384
385
386
Note: See TracBrowser for help on using the repository browser.