1 | """Tokenization help for Python programs. |
---|
2 | |
---|
3 | generate_tokens(readline) is a generator that breaks a stream of |
---|
4 | text into Python tokens. It accepts a readline-like method which is called |
---|
5 | repeatedly to get the next line of input (or "" for EOF). It generates |
---|
6 | 5-tuples with these members: |
---|
7 | |
---|
8 | the token type (see token.py) |
---|
9 | the token (a string) |
---|
10 | the starting (row, column) indices of the token (a 2-tuple of ints) |
---|
11 | the ending (row, column) indices of the token (a 2-tuple of ints) |
---|
12 | the original line (string) |
---|
13 | |
---|
14 | It is designed to match the working of the Python tokenizer exactly, except |
---|
15 | that it produces COMMENT tokens for comments and gives type OP for all |
---|
16 | operators |
---|
17 | |
---|
18 | Older entry points |
---|
19 | tokenize_loop(readline, tokeneater) |
---|
20 | tokenize(readline, tokeneater=printtoken) |
---|
21 | are the same, except instead of generating tokens, tokeneater is a callback |
---|
22 | function to which the 5 fields described above are passed as 5 arguments, |
---|
23 | each time a new token is found.""" |
---|
24 | |
---|
25 | __author__ = 'Ka-Ping Yee <ping@lfw.org>' |
---|
26 | __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' |
---|
27 | 'Skip Montanaro, Raymond Hettinger') |
---|
28 | |
---|
29 | from itertools import chain |
---|
30 | import string, re |
---|
31 | from token import * |
---|
32 | |
---|
33 | import token |
---|
34 | __all__ = [x for x in dir(token) if not x.startswith("_")] |
---|
35 | __all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"] |
---|
36 | del x |
---|
37 | del token |
---|
38 | |
---|
39 | COMMENT = N_TOKENS |
---|
40 | tok_name[COMMENT] = 'COMMENT' |
---|
41 | NL = N_TOKENS + 1 |
---|
42 | tok_name[NL] = 'NL' |
---|
43 | N_TOKENS += 2 |
---|
44 | |
---|
def group(*choices):
    """Join *choices* into one parenthesized regex alternation."""
    return "(%s)" % "|".join(choices)

def any(*choices):
    """Regex matching zero or more of *choices* (note: shadows the builtin)."""
    return group(*choices) + '*'

def maybe(*choices):
    """Regex matching zero or one occurrence of *choices*."""
    return group(*choices) + '?'
# Building blocks for the master tokenizing regular expressions.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Optional whitespace, any number of backslash line continuations,
# then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals, including the Python 2 'l'/'L' long suffix.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Floating point and imaginary literals.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Triple-quote openers with any combination of u/b and r prefixes.
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string; may end with the closing quote
# or with a backslash continuation.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# PseudoToken is what generate_tokens() actually scans with: group 1 is
# the candidate token, possibly empty at a line continuation or EOF.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled patterns: tokenprog/pseudoprog find the next token; the *3progs
# match the tail (everything up to the closing quotes) of an already-opened
# triple-quoted string.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map from a string opener (quote with any u/b/r prefix combination) to the
# regex matching the rest of that string.  Bare prefix letters map to None
# so the tokenizer can index by the first one or two characters safely.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

# Set (stored as a dict for fast membership tests) of every legal
# triple-quote opener.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
# Likewise for every legal single-quote opener.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

# Column width used when expanding tabs while measuring indentation.
tabsize = 8
# Raised when EOF is reached inside an unterminated string or statement.
class TokenError(Exception): pass
# Raised by a tokeneater callback to make tokenize() stop early.
class StopTokenizing(Exception): pass
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Default tokeneater: print one 'srow,scol-erow,ecol: name repr' line per token."""
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # The tokeneater signalled that it has seen enough; stop quietly.
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple produced by generate_tokens(readline) to tokeneater."""
    for tok in generate_tokens(readline):
        tokeneater(*tok)
class Untokenizer:
    """Rebuild source text from a stream of token tuples.

    Full 5-tuples (with start/end positions) reproduce the original
    spacing; bare (type, string) 2-tuples fall back to compat() which
    produces valid but loosely spaced output.
    """

    def __init__(self):
        # Output fragments collected so far, plus the (row, col) where the
        # previously written token ended -- used to recreate whitespace.
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        """Append filler so the next token begins at position *start*."""
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            # Bridge skipped rows with escaped newlines so the output still
            # tokenizes to the same logical line.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return source text for *iterable*; switches to compat() on 2-tuples."""
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                # No position info available; use best-effort mode instead.
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # First token on a fresh line: re-emit the current indent
                # string when the token starts at or beyond it.
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Best-effort untokenize for (type, string) 2-tuples (no positions)."""
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                # Trailing space keeps adjacent names/numbers from merging.
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0       # pending text of an open multi-line string
    contline = None                 # accumulated raw lines of that string
    indents = [0]                   # stack of indentation column levels

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # Alternate readline protocol: StopIteration means EOF.
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # Closing quote found on this line; emit the whole string.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted string continued without a backslash: error.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                # String still open; keep accumulating lines.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    # Form feed resets the indentation measurement.
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                # Line is pure whitespace with no newline: treat as EOF.
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                       # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)       # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':              # continued string
                        strstart = (lnum, start)
                        # Pick the end pattern for whichever of the first
                        # three characters is the actual quote.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                              # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:             # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                  # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                # No token matched at pos; emit one char as an error token.
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the named file when given, otherwise standard input.
    stream = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
    tokenize(stream.readline)