source: icGREP/icgrep-devel/icgrep/utf8_encoder.h @ 4129

Last change on this file since 4129 was 3961, checked in by daled, 5 years ago

Multibyte character code classes parsed from hex notation are now using named byte sequences.

File size: 902 bytes
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#ifndef UTF8_ENCODER_H
8#define UTF8_ENCODER_H
9
10//Regular Expressions
11#include "re_re.h"
12#include "re_cc.h"
13#include "re_name.h"
14#include "re_start.h"
15#include "re_end.h"
16#include "re_seq.h"
17#include "re_alt.h"
18#include "re_rep.h"
19
20#include "re_simplifier.h"
21
/*
 * UTF8_Encoder groups a set of static member functions. The class declares
 * no data members, and every function below is static, so callers use it
 * as UTF8_Encoder::toUTF8(...) without constructing an object.
 *
 * Only declarations appear in this header; the definitions live elsewhere.
 * The descriptions below are therefore inferred from names and signatures
 * and must be confirmed against the implementation.
 *
 * Public:
 *   toUTF8(re)
 *       Takes an RE* and returns an RE*. Presumably produces a form of the
 *       regular expression whose character classes are expressed as UTF-8
 *       byte sequences -- TODO confirm.
 *
 * Private helpers:
 *   rangeToUTF8(item)
 *       Returns an RE* for a single CharSetItem; presumably the UTF-8
 *       encoding of that item's code point range -- TODO confirm.
 *   rangeToUTF8_helper(lo, hi, n, hlen)
 *       Returns an RE* for the int range lo..hi; n and hlen look like a
 *       byte position and a length, but their exact meaning (and whether
 *       the bounds are inclusive) is not visible here -- TODO confirm.
 *   makeByteClass(byteval)
 *       Returns a CC* built from one int value, presumably a single byte.
 *   makeByteRange(lo, hi)
 *       Returns a CC* built from two int bounds, presumably a byte range.
 *   u8Prefix(cp)
 *       Returns bool; presumably tests whether cp is a UTF-8 prefix
 *       (lead) byte -- TODO confirm.
 *   u8len(cp)
 *       Returns int; presumably the UTF-8 encoded length of cp.
 *   max_of_u8len(lgth)
 *       Returns int; presumably the largest code point whose UTF-8
 *       encoding has the given length -- TODO confirm.
 *   u8byte(codepoint, n)
 *       Returns int; presumably the n-th byte of the UTF-8 encoding of
 *       codepoint. Whether n is 0- or 1-based is not visible here.
 *
 * NOTE(review): ownership of the returned RE* / CC* pointers, and whether
 * toUTF8 modifies its argument, cannot be determined from this header.
 */
22class UTF8_Encoder
23{
24public:
25    static RE* toUTF8(RE* re);
26private:
27    static RE* rangeToUTF8(CharSetItem item);
28    static RE* rangeToUTF8_helper(int lo, int hi, int n, int hlen);
29    static CC* makeByteClass(int byteval);
30    static CC* makeByteRange(int lo, int hi);
31
32    static bool u8Prefix(int cp);
33    static int u8len(int cp);
34    static int max_of_u8len(int lgth);
35    static int u8byte(int codepoint, int n);
36};
37
38#endif // UTF8_ENCODER_H
Note: See TracBrowser for help on using the repository browser.