-
Notifications
You must be signed in to change notification settings - Fork 2.5k
/
driver.py
314 lines (266 loc) · 10.3 KB
/
driver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Parser driver.
This provides a high-level interface to parse a file into a syntax tree.
"""
__author__ = "Guido van Rossum <[email protected]>"
__all__ = ["Driver", "load_grammar"]
# Python imports
import io
import logging
import os
import pkgutil
import sys
from contextlib import contextmanager
from dataclasses import dataclass, field
from logging import Logger
from typing import IO, Any, Iterable, Iterator, List, Optional, Tuple, Union, cast
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.tokenize import GoodTokenInfo
from blib2to3.pytree import NL
# Pgen imports
from . import grammar, parse, pgen, token, tokenize
Path = Union[str, "os.PathLike[str]"]
@dataclass
class ReleaseRange:
start: int
end: Optional[int] = None
tokens: List[Any] = field(default_factory=list)
def lock(self) -> None:
total_eaten = len(self.tokens)
self.end = self.start + total_eaten
class TokenProxy:
def __init__(self, generator: Any) -> None:
self._tokens = generator
self._counter = 0
self._release_ranges: List[ReleaseRange] = []
@contextmanager
def release(self) -> Iterator["TokenProxy"]:
release_range = ReleaseRange(self._counter)
self._release_ranges.append(release_range)
try:
yield self
finally:
# Lock the last release range to the final position that
# has been eaten.
release_range.lock()
def eat(self, point: int) -> Any:
eaten_tokens = self._release_ranges[-1].tokens
if point < len(eaten_tokens):
return eaten_tokens[point]
else:
while point >= len(eaten_tokens):
token = next(self._tokens)
eaten_tokens.append(token)
return token
def __iter__(self) -> "TokenProxy":
return self
def __next__(self) -> Any:
# If the current position is already compromised (looked up)
# return the eaten token, if not just go further on the given
# token producer.
for release_range in self._release_ranges:
assert release_range.end is not None
start, end = release_range.start, release_range.end
if start <= self._counter < end:
token = release_range.tokens[self._counter - start]
break
else:
token = next(self._tokens)
self._counter += 1
return token
def can_advance(self, to: int) -> bool:
# Try to eat, fail if it can't. The eat operation is cached
# so there won't be any additional cost of eating here
try:
self.eat(to)
except StopIteration:
return False
else:
return True
class Driver:
def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None:
self.grammar = grammar
if logger is None:
logger = logging.getLogger(__name__)
self.logger = logger
def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL:
"""Parse a series of tokens and return the syntax tree."""
# XXX Move the prefix computation into a wrapper around tokenize.
proxy = TokenProxy(tokens)
p = parse.Parser(self.grammar)
p.setup(proxy=proxy)
lineno = 1
column = 0
indent_columns: List[int] = []
type = value = start = end = line_text = None
prefix = ""
for quintuple in proxy:
type, value, start, end, line_text = quintuple
if start != (lineno, column):
assert (lineno, column) <= start, ((lineno, column), start)
s_lineno, s_column = start
if lineno < s_lineno:
prefix += "\n" * (s_lineno - lineno)
lineno = s_lineno
column = 0
if column < s_column:
prefix += line_text[column:s_column]
column = s_column
if type in (tokenize.COMMENT, tokenize.NL):
prefix += value
lineno, column = end
if value.endswith("\n"):
lineno += 1
column = 0
continue
if type == token.OP:
type = grammar.opmap[value]
if debug:
assert type is not None
self.logger.debug(
"%s %r (prefix=%r)", token.tok_name[type], value, prefix
)
if type == token.INDENT:
indent_columns.append(len(value))
_prefix = prefix + value
prefix = ""
value = ""
elif type == token.DEDENT:
_indent_col = indent_columns.pop()
prefix, _prefix = self._partially_consume_prefix(prefix, _indent_col)
if p.addtoken(cast(int, type), value, (prefix, start)):
if debug:
self.logger.debug("Stop.")
break
prefix = ""
if type in {token.INDENT, token.DEDENT}:
prefix = _prefix
lineno, column = end
if value.endswith("\n"):
lineno += 1
column = 0
else:
# We never broke out -- EOF is too soon (how can this happen???)
assert start is not None
raise parse.ParseError("incomplete input", type, value, (prefix, start))
assert p.rootnode is not None
return p.rootnode
def parse_stream_raw(self, stream: IO[str], debug: bool = False) -> NL:
"""Parse a stream and return the syntax tree."""
tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
return self.parse_tokens(tokens, debug)
def parse_stream(self, stream: IO[str], debug: bool = False) -> NL:
"""Parse a stream and return the syntax tree."""
return self.parse_stream_raw(stream, debug)
def parse_file(
self, filename: Path, encoding: Optional[str] = None, debug: bool = False
) -> NL:
"""Parse a file and return the syntax tree."""
with open(filename, encoding=encoding) as stream:
return self.parse_stream(stream, debug)
def parse_string(self, text: str, debug: bool = False) -> NL:
"""Parse a string and return the syntax tree."""
tokens = tokenize.generate_tokens(
io.StringIO(text).readline, grammar=self.grammar
)
return self.parse_tokens(tokens, debug)
def _partially_consume_prefix(self, prefix: str, column: int) -> Tuple[str, str]:
lines: List[str] = []
current_line = ""
current_column = 0
wait_for_nl = False
for char in prefix:
current_line += char
if wait_for_nl:
if char == "\n":
if current_line.strip() and current_column < column:
res = "".join(lines)
return res, prefix[len(res) :]
lines.append(current_line)
current_line = ""
current_column = 0
wait_for_nl = False
elif char in " \t":
current_column += 1
elif char == "\n":
# unexpected empty line
current_column = 0
else:
# indent is finished
wait_for_nl = True
return "".join(lines), current_line
def _generate_pickle_name(gt: Path, cache_dir: Optional[Path] = None) -> str:
head, tail = os.path.splitext(gt)
if tail == ".txt":
tail = ""
name = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
if cache_dir:
return os.path.join(cache_dir, os.path.basename(name))
else:
return name
def load_grammar(
gt: str = "Grammar.txt",
gp: Optional[str] = None,
save: bool = True,
force: bool = False,
logger: Optional[Logger] = None,
) -> Grammar:
"""Load the grammar (maybe from a pickle)."""
if logger is None:
logger = logging.getLogger(__name__)
gp = _generate_pickle_name(gt) if gp is None else gp
if force or not _newer(gp, gt):
g: grammar.Grammar = pgen.generate_grammar(gt)
if save:
try:
g.dump(gp)
except OSError:
# Ignore error, caching is not vital.
pass
else:
g = grammar.Grammar()
g.load(gp)
return g
def _newer(a: str, b: str) -> bool:
"""Inquire whether file a was written since file b."""
if not os.path.exists(a):
return False
if not os.path.exists(b):
return True
return os.path.getmtime(a) >= os.path.getmtime(b)
def load_packaged_grammar(
package: str, grammar_source: str, cache_dir: Optional[Path] = None
) -> grammar.Grammar:
"""Normally, loads a pickled grammar by doing
pkgutil.get_data(package, pickled_grammar)
where *pickled_grammar* is computed from *grammar_source* by adding the
Python version and using a ``.pickle`` extension.
However, if *grammar_source* is an extant file, load_grammar(grammar_source)
is called instead. This facilitates using a packaged grammar file when needed
but preserves load_grammar's automatic regeneration behavior when possible.
"""
if os.path.isfile(grammar_source):
gp = _generate_pickle_name(grammar_source, cache_dir) if cache_dir else None
return load_grammar(grammar_source, gp=gp)
pickled_name = _generate_pickle_name(os.path.basename(grammar_source), cache_dir)
data = pkgutil.get_data(package, pickled_name)
assert data is not None
g = grammar.Grammar()
g.loads(data)
return g
def main(*args: str) -> bool:
"""Main program, when run as a script: produce grammar pickle files.
Calls load_grammar for each argument, a path to a grammar text file.
"""
if not args:
args = tuple(sys.argv[1:])
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
for gt in args:
load_grammar(gt, save=True, force=True)
return True
if __name__ == "__main__":
sys.exit(int(not main()))