1 | """Implementation of JSONDecoder |
---|
2 | """ |
---|
3 | import re |
---|
4 | import sys |
---|
5 | import struct |
---|
6 | |
---|
7 | from json import scanner |
---|
8 | try: |
---|
9 | from _json import scanstring as c_scanstring |
---|
10 | except ImportError: |
---|
11 | c_scanstring = None |
---|
12 | |
---|
# Names exported by a star-import of this module.
__all__ = ["JSONDecoder"]

# Flags passed to every regular expression compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
---|
16 | |
---|
def _floatconstants():
    """Return the float special values as a tuple ``(nan, inf, -inf)``.

    The values are obtained by unpacking fixed big-endian double bit
    patterns with ``struct``.
    """
    nan_bits = b'\x7f\xf8\x00\x00\x00\x00\x00\x00'
    inf_bits = b'\x7f\xf0\x00\x00\x00\x00\x00\x00'
    nan = struct.unpack('>d', nan_bits)[0]
    inf = struct.unpack('>d', inf_bits)[0]
    return nan, inf, -inf


# Computed once when the module is imported.
NaN, PosInf, NegInf = _floatconstants()
---|
23 | |
---|
24 | |
---|
def linecol(doc, pos):
    """Convert the offset ``pos`` in ``doc`` to a ``(lineno, colno)`` pair.

    Both numbers are 1-based; lines are separated by ``'\\n'``.
    """
    newlines_before = doc.count('\n', 0, pos)
    # rfind() gives -1 when no newline precedes pos, so the subtraction
    # below yields pos + 1 on the first line.
    last_newline = doc.rfind('\n', 0, pos)
    return newlines_before + 1, pos - last_newline
---|
32 | |
---|
33 | |
---|
def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` followed by a description of where in ``doc`` it applies.

    With only ``pos`` given the text names a single line/column/character
    position.  When ``end`` is also given the text describes the range from
    ``pos`` to ``end``.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = ('{0}: line {1} column {2} - line {3} column {4} '
                    '(char {5} - {6})')
        return template.format(
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
---|
47 | |
---|
48 | |
---|
# Literals outside the JSON spec, mapped to the float each one decodes to.
_CONSTANTS = {
    'NaN': NaN,
    'Infinity': PosInf,
    '-Infinity': NegInf,
}

# Group 1 is a (possibly empty) run of ordinary characters; group 2 is the
# character that ends the run: a double quote, a backslash, or a control
# character in the range \x00-\x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Single-character escapes, keyed by the character after the backslash.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding used for byte strings when the caller passes encoding=None.
DEFAULT_ENCODING = "utf-8"
---|
62 | |
---|
def _decode_uXXXX(s, pos):
    """Decode the hex digits of a ``\\uXXXX`` escape and return them as an int.

    ``pos`` is the index of the ``u`` in ``s``; the four characters that
    follow it are read as the code.

    Raises ValueError if fewer than four characters remain or if any of
    them is not an ASCII hexadecimal digit.
    """
    digits = s[pos + 1:pos + 5]
    # int(x, 16) on its own is too permissive for JSON: it tolerates a
    # leading sign, surrounding whitespace, a "0x" prefix and, on newer
    # interpreters, underscores and non-ASCII digits.  Check each character
    # explicitly so that only exactly four hex digits are accepted.
    if len(digits) == 4 and all(
            c in '0123456789abcdefABCDEF' for c in digits):
        return int(digits, 16)
    msg = "Invalid \\uXXXX escape"
    raise ValueError(errmsg(msg, s, pos))
---|
72 | |
---|
def py_scanstring(s, end, encoding=None, strict=True,
                  _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Decode the JSON string in ``s`` whose contents begin at index ``end``.

    ``end`` is the index of the character right after the opening quote.
    All valid JSON escape sequences are unescaped.  With ``strict`` false,
    literal control characters inside the string are kept instead of being
    rejected.  ``encoding`` (``DEFAULT_ENCODING`` when None) is used to
    decode chunks that are not already ``unicode``.

    Returns a tuple of the decoded string and the index of the character
    in ``s`` after the closing quote.

    Raises ValueError for an unterminated string, an invalid escape, or
    (when ``strict`` is true) a literal control character.
    """
    if encoding is None:
        encoding = DEFAULT_ENCODING
    pieces = []
    add_piece = pieces.append
    # Index of the opening quote, reported for unterminated strings.
    begin = end - 1
    while True:
        match = _m(s, end)
        if match is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = match.end()
        content, terminator = match.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            add_piece(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        if terminator != '\\':
            if not strict:
                add_piece(terminator)
                continue
            msg = "Invalid control character {0!r} at".format(terminator)
            raise ValueError(errmsg(msg, s, end))
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        if esc == 'u':
            # Unicode escape sequence
            code = _decode_uXXXX(s, end)
            end += 5
            # Check for surrogate pair on UCS-4 systems
            if (sys.maxunicode > 65535
                    and 0xd800 <= code <= 0xdbff
                    and s[end:end + 2] == '\\u'):
                low = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= low <= 0xdfff:
                    code = 0x10000 + (
                        ((code - 0xd800) << 10) | (low - 0xdc00))
                    end += 6
            char = unichr(code)
        else:
            # Any other escape must be in the lookup table
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        # Append the unescaped character
        add_piece(char)
    return u''.join(pieces), end
---|
140 | |
---|
141 | |
---|
# Use the C speedup when it was importable, else the pure-Python scanner.
scanstring = c_scanstring if c_scanstring else py_scanstring

# Matches a (possibly empty) run of JSON whitespace characters.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same characters as a plain string, for single-character ``in`` tests.
WHITESPACE_STR = ' \t\n\r'
---|
147 | |
---|
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair; scanning starts at ``s[end]``
    (presumably just past the opening ``{`` -- confirm against the
    scanner).  Keys are read with ``scanstring`` and values with
    ``scan_once``.

    The result is ``object_pairs_hook(pairs)`` when that hook is given,
    otherwise a ``dict`` that is passed through ``object_hook`` when that
    hook is given.  The returned index is the position after the closing
    ``}``.

    Raises ValueError when a property name, ``:``, value or ``,`` is
    missing.
    """
    s, end = s_and_end
    pairs = []
    add_pair = pairs.append
    # Slice instead of indexing so that running off the end of the input
    # gives '' here and a specific ValueError below, not an IndexError.
    ch = s[end:end + 1]
    # Normally we expect ch == '"'
    if ch != '"':
        if ch in _ws:
            end = _w(s, end).end()
            ch = s[end:end + 1]
        if ch == '}':
            # Trivial empty object
            if object_pairs_hook is not None:
                return object_pairs_hook(pairs), end + 1
            empty = {}
            if object_hook is not None:
                empty = object_hook(empty)
            return empty, end + 1
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # Fast path: the separator is usually ":" or ": ", so only fall
        # back to the whitespace regex when the colon is not right here.
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        # Skip whitespace before the value; a single blank avoids the regex.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        add_pair((key, value))

        # Fetch the character after the value ('' at end of input).
        try:
            ch = s[end]
            if ch in _ws:
                end = _w(s, end + 1).end()
                ch = s[end]
        except IndexError:
            ch = ''
        end += 1

        if ch == '}':
            break
        if ch != ',':
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))

        # After a comma the next non-blank character must open a new key.
        try:
            ch = s[end]
            if ch in _ws:
                end += 1
                ch = s[end]
                if ch in _ws:
                    end = _w(s, end + 1).end()
                    ch = s[end]
        except IndexError:
            ch = ''

        end += 1
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes",
                s, end - 1))
    if object_pairs_hook is not None:
        return object_pairs_hook(pairs), end
    result = dict(pairs)
    if object_hook is not None:
        result = object_hook(result)
    return result, end
---|
235 | |
---|
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair; scanning starts at ``s[end]``
    (presumably just past the opening ``[`` -- confirm against the
    scanner).  Each element is read with ``scan_once``.

    Returns the list of decoded elements and the index of the character
    after the closing ``]``.

    Raises ValueError when an element or a ``,`` delimiter is missing.
    """
    s, end = s_and_end
    values = []
    # Slicing yields '' instead of raising IndexError at end of input.
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end has already been advanced past the character that was
            # examined, so step back one to report the position where the
            # delimiter was expected (the same convention JSONObject uses).
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace before the next element; a single blank is
        # handled without calling the regex.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
---|
271 | |
---|
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Unless customised, decoding maps JSON values to Python values like so:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    The literals ``NaN``, ``Infinity`` and ``-Infinity`` are accepted as
    well and decode to the matching ``float`` values, even though they are
    not part of the JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
                 parse_int=None, parse_constant=None, strict=True,
                 object_pairs_hook=None):
        """Configure how JSON documents are decoded.

        ``encoding`` is the encoding used to interpret ``str`` objects
        decoded by this instance (utf-8 by default).  It is ignored when
        decoding ``unicode`` objects.

        Only encodings that are a superset of ASCII currently work; text
        in any other encoding should be passed in as ``unicode``.

        ``object_hook``, if given, is called with the ``dict`` produced
        for every decoded JSON object, and its return value is used in
        place of that ``dict``.  This allows custom deserializations
        (e.g. JSON-RPC class hinting).

        ``object_pairs_hook``, if given, is called for every decoded JSON
        object with an ordered list of its key/value pairs, and its return
        value is used instead of the ``dict``.  This supports decoders
        that depend on the order in which pairs appear (for example,
        collections.OrderedDict remembers insertion order).  When
        ``object_hook`` is also given, ``object_pairs_hook`` takes
        priority.

        ``parse_float``, if given, is called with the string of every JSON
        float to be decoded.  The default is equivalent to float(num_str).
        Use it to substitute another datatype or parser for JSON floats
        (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int to be decoded.  The default is equivalent to int(num_str).
        Use it to substitute another datatype or parser for JSON integers
        (e.g. float).

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when such invalid JSON numbers are encountered.

        If ``strict`` is false (true is the default), control characters
        are allowed inside strings.  Control characters here are those
        with character codes in the 0-31 range, including ``'\\t'`` (tab),
        ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        # Plain configuration values.
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Number and constant parsers, with their built-in defaults.
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        # Parsers for the structured JSON types.
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # Built last, after every attribute above has been assigned.
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Whitespace around the document is ignored; anything else after it
        raises ValueError("Extra data ...").

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document that starts at index ``idx`` of ``s`` (a
        ``str`` or ``unicode``) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        Unlike ``decode``, this tolerates extraneous data after the
        document.

        """
        try:
            value, stop = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        return value, stop
---|