-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpa_ulex.ml
275 lines (233 loc) · 7.63 KB
/
pa_ulex.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
open Camlp4.PreCast
open Syntax
let _loc = Loc.ghost
(* Named regexp *)
let named_regexps =
(Hashtbl.create 13 : (string, Ulex.regexp) Hashtbl.t)
let () =
List.iter (fun (n,c) -> Hashtbl.add named_regexps n (Ulex.chars c))
[
"eof", Cset.eof;
"xml_letter", Cset.letter;
"xml_digit", Cset.digit;
"xml_extender", Cset.extender;
"xml_base_char", Cset.base_char;
"xml_ideographic", Cset.ideographic;
"xml_combining_char", Cset.combining_char;
"xml_blank", Cset.blank;
"tr8876_ident_char", Cset.tr8876_ident_char;
]
(* Decision tree for partitions *)
type decision_tree =
| Lte of int * decision_tree * decision_tree
| Table of int * int array
| Return of int
let decision l =
let l = List.map (fun (a,b,i) -> (a,b,Return i)) l in
let rec merge2 = function
| (a1,b1,d1) :: (a2,b2,d2) :: rest ->
let x =
if b1 + 1 = a2 then d2
else Lte (a2 - 1,Return (-1), d2)
in
(a1,b2, Lte (b1,d1, x)) :: (merge2 rest)
| rest -> rest in
let rec aux = function
| _::_::_ as l -> aux (merge2 l)
| [(a,b,d)] -> Lte (a - 1, Return (-1), Lte (b, d, Return (-1)))
| _ -> Return (-1)
in
aux l
let limit = 8192
let decision_table l =
let rec aux m accu = function
| ((a,b,i) as x)::rem when (b < limit && i < 255)->
aux (min a m) (x::accu) rem
| rem -> m,accu,rem in
match (aux max_int [] l : int * 'a list * 'b list) with
| _,[], _ -> decision l
| min,((_,max,_)::_ as l1), l2 ->
let arr = Array.make (max-min+1) 0 in
List.iter (fun (a,b,i) -> for j = a to b do arr.(j-min) <- i + 1 done) l1;
Lte (min-1, Return (-1), Lte (max, Table (min,arr), decision l2))
let rec simplify min max = function
| Lte (i,yes,no) ->
if i >= max then simplify min max yes
else if i < min then simplify min max no
else Lte (i, simplify min i yes, simplify (i+1) max no)
| x -> x
let tables = Hashtbl.create 31
let tables_counter = ref 0
let get_tables () =
let t = Hashtbl.fold (fun key x accu -> (x,key)::accu) tables [] in
Hashtbl.clear tables;
t
let table_name t =
try Hashtbl.find tables t
with Not_found ->
incr tables_counter;
let n = Printf.sprintf "__ulex_table_%i" !tables_counter in
Hashtbl.add tables t n;
n
let output_byte buf b =
Buffer.add_char buf '\\';
Buffer.add_char buf (Char.chr(48 + b / 100));
Buffer.add_char buf (Char.chr(48 + (b / 10) mod 10));
Buffer.add_char buf (Char.chr(48 + b mod 10))
let output_byte_array v =
let b = Buffer.create (Array.length v * 5) in
for i = 0 to Array.length v - 1 do
output_byte b (v.(i) land 0xFF);
if i land 15 = 15 then Buffer.add_string b "\\\n "
done;
let s = Buffer.contents b in
<:expr< $str:s$ >>
let table (n,t) = <:str_item< value $lid:n$ = $output_byte_array t$ >>
let partition_name i = Printf.sprintf "__ulex_partition_%i" i
let partition (i,p) =
let rec gen_tree = function
| Lte (i,yes,no) ->
<:expr< if (c <= $`int:i$)
then $gen_tree yes$ else $gen_tree no$ >>
| Return i ->
<:expr< $`int:i$ >>
| Table (offset, t) ->
let c = if offset = 0 then <:expr< c >>
else <:expr< (c - $`int:offset$) >> in
<:expr< Char.code ($lid: table_name t$.[$c$]) - 1>>
in
let body = gen_tree (simplify (-1) (Cset.max_code) (decision_table p)) in
let f = partition_name i in
<:str_item< value $lid:f$ = fun c -> $body$ >>
(* Code generation for the automata *)
let best_final final =
let fin = ref None in
Array.iteri
(fun i b -> if b && (!fin = None) then fin := Some i) final;
!fin
let call_state auto state =
match auto.(state) with (_,trans,final) ->
if Array.length trans = 0
then match best_final final with
| Some i -> <:expr< $`int:i$ >>
| None -> assert false
else
let f = Printf.sprintf "__ulex_state_%i" state in
<:expr< $lid:f$ lexbuf >>
let gen_state auto _loc i (part,trans,final) =
let f = Printf.sprintf "__ulex_state_%i" i in
let p = partition_name part in
let cases =
Array.mapi
(fun i j -> <:match_case< $`int:i$ -> $call_state auto j$ >>)
trans in
let cases = Array.to_list cases in
let body =
<:expr<
match ($lid:p$ (Ulexing.next lexbuf)) with
[ $list:cases$
| _ -> Ulexing.backtrack lexbuf ] >> in
let ret body =
<:binding< $lid:f$ = fun lexbuf -> $body$ >> in
match best_final final with
| None -> ret body
| Some i ->
if Array.length trans = 0 then <:binding<>> else
ret
<:expr< do { Ulexing.mark lexbuf $`int:i$; $body$ } >>
let gen_definition _loc l =
let brs = Array.of_list l in
let rs = Array.map fst brs in
let auto = Ulex.compile rs in
let cases = Array.mapi (fun i (_,e) -> <:match_case< $`int:i$ -> $e$ >>) brs in
let states = Array.mapi (gen_state auto _loc) auto in
<:expr< fun lexbuf ->
let rec $list:Array.to_list states$ in
do { Ulexing.start lexbuf;
match __ulex_state_0 lexbuf with
[ $list:Array.to_list cases$ | _ -> raise Ulexing.Error ] } >>
(* Lexer specification parser *)
let char_int s =
let i = int_of_string s in
if (i >=0) && (i <= Cset.max_code) then i
else failwith ("Invalid Unicode code point: " ^ s)
let regexp_for_string s =
let rec aux n =
if n = String.length s then Ulex.eps
else
Ulex.seq (Ulex.chars (Cset.singleton (Char.code s.[n]))) (aux (succ n))
in aux 0
EXTEND Gram
GLOBAL: expr str_item;
expr: [
[ "lexer";
OPT "|"; l = LIST0 [ r=regexp; "->"; a=expr -> (r,a) ] SEP "|" ->
gen_definition _loc l ]
];
str_item: [
[ "let"; LIDENT "regexp"; x = LIDENT; "="; r = regexp ->
if Hashtbl.mem named_regexps x then
Printf.eprintf
"pa_ulex (warning): multiple definition of named regexp '%s'\n"
x;
Hashtbl.add named_regexps x r;
<:str_item<>>
]
];
regexp: [
[ r1 = regexp; "|"; r2 = regexp -> Ulex.alt r1 r2 ]
| [ r1 = regexp; r2 = regexp -> Ulex.seq r1 r2 ]
| [ r1 = regexp; "#"; r2 = regexp ->
try Ulex.diff r1 r2
with Not_found ->
failwith
("pa_ulex (error): operands of # must be bare character sets") ]
| [ r = regexp; "*" -> Ulex.rep r
| r = regexp; "+" -> Ulex.plus r
| r = regexp; "?" -> Ulex.alt Ulex.eps r
| "("; r = regexp; ")" -> r
| "_" -> Ulex.chars Cset.any
| c = chr -> Ulex.chars (Cset.singleton c)
| `STRING (s,_) -> regexp_for_string s
| "["; cc = ch_class; "]" -> Ulex.chars cc
| "[^"; cc = ch_class; "]" -> Ulex.chars (Cset.difference Cset.any cc)
| x = LIDENT ->
try Hashtbl.find named_regexps x
with Not_found ->
failwith
("pa_ulex (error): reference to unbound regexp name `"^x^"'")
]
];
chr: [
[ `CHAR (c,_) -> Char.code c
| i = INT -> char_int i ]
];
ch_class: [
[ c1 = chr; "-"; c2 = chr -> Cset.interval c1 c2
| c = chr -> Cset.singleton c
| cc1 = ch_class; cc2 = ch_class -> Cset.union cc1 cc2
| `STRING (s,_) ->
let c = ref Cset.empty in
for i = 0 to String.length s - 1 do
c := Cset.union !c (Cset.singleton (Char.code s.[i]))
done;
!c
]
];
END
let change_ids suffix = object
inherit Ast.map as super
method ident = function
| Ast.IdLid (loc, s) when String.length s > 6 && String.sub s 0 6 = "__ulex" -> Ast.IdLid (loc, s ^ suffix)
| i -> i
end
let () =
let first = ref true in
AstFilters.register_str_item_filter
(fun s ->
assert(!first); first := false;
let parts = List.map partition (Ulex.partitions ()) in
let tables = List.map table (get_tables ()) in
let suffix = "__" ^ Digest.to_hex (Digest.string (Marshal.to_string (parts, tables) [])) in
(change_ids suffix) # str_item <:str_item< $list:tables$; $list:parts$; $s$ >>
)