1 | ######################## BEGIN LICENSE BLOCK ######################## |
---|
2 | # The Original Code is Mozilla Universal charset detector code. |
---|
3 | # |
---|
4 | # The Initial Developer of the Original Code is |
---|
5 | # Netscape Communications Corporation. |
---|
6 | # Portions created by the Initial Developer are Copyright (C) 2001 |
---|
7 | # the Initial Developer. All Rights Reserved. |
---|
8 | # |
---|
9 | # Contributor(s): |
---|
10 | # Mark Pilgrim - port to Python |
---|
11 | # Shy Shalom - original C code |
---|
12 | # |
---|
13 | # This library is free software; you can redistribute it and/or |
---|
14 | # modify it under the terms of the GNU Lesser General Public |
---|
15 | # License as published by the Free Software Foundation; either |
---|
16 | # version 2.1 of the License, or (at your option) any later version. |
---|
17 | # |
---|
18 | # This library is distributed in the hope that it will be useful, |
---|
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
21 | # Lesser General Public License for more details. |
---|
22 | # |
---|
23 | # You should have received a copy of the GNU Lesser General Public |
---|
24 | # License along with this library; if not, write to the Free Software |
---|
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
---|
26 | # 02110-1301 USA |
---|
27 | ######################### END LICENSE BLOCK ######################### |
---|
28 | """ |
---|
29 | Module containing the UniversalDetector detector class, which is the primary |
---|
30 | class a user of ``chardet`` should use. |
---|
31 | |
---|
32 | :author: Mark Pilgrim (initial port to Python) |
---|
33 | :author: Shy Shalom (original C code) |
---|
34 | :author: Dan Blanchard (major refactoring for 3.0) |
---|
35 | :author: Ian Cordasco |
---|
36 | """ |
---|
37 | |
---|
38 | |
---|
39 | import codecs |
---|
40 | import logging |
---|
41 | import re |
---|
42 | |
---|
43 | from .charsetgroupprober import CharSetGroupProber |
---|
44 | from .enums import InputState, LanguageFilter, ProbingState |
---|
45 | from .escprober import EscCharSetProber |
---|
46 | from .latin1prober import Latin1Prober |
---|
47 | from .mbcsgroupprober import MBCSGroupProber |
---|
48 | from .sbcsgroupprober import SBCSGroupProber |
---|
49 | |
---|
50 | |
---|
class UniversalDetector(object):
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # A prober's confidence must be strictly greater than this for ``close``
    # to report its charset.
    MINIMUM_THRESHOLD = 0.20
    # Any byte with the high bit set (i.e. non-ASCII).
    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
    # An ESC byte or the two-byte sequence "~{"; either one switches the
    # detector to the escape-sequence prober.
    ESC_DETECTOR = re.compile(b'(\033|~{)')
    # Bytes in 0x80-0x9F; ``close`` treats their presence as a reason to
    # report a Windows code page name instead of an ISO-8859 name.
    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
    # Lower-cased ISO-8859 charset name -> Windows code page name.
    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
                   'iso-8859-2': 'Windows-1250',
                   'iso-8859-5': 'Windows-1251',
                   'iso-8859-6': 'Windows-1256',
                   'iso-8859-7': 'Windows-1253',
                   'iso-8859-8': 'Windows-1255',
                   'iso-8859-9': 'Windows-1254',
                   'iso-8859-13': 'Windows-1257'}

    def __init__(self, lang_filter=LanguageFilter.ALL):
        """
        Create a detector and put it in its initial state.

        :param lang_filter: ``LanguageFilter`` value handed to the
            multi-byte and escape probers; it is also tested as a bit mask
            against ``LanguageFilter.NON_CJK`` to decide whether the
            single-byte probers are used.
        """
        # Probers are created lazily by ``feed`` the first time they are
        # needed.
        self._esc_charset_prober = None
        self._charset_probers = []
        self.result = None
        self.done = None
        self._got_data = None
        self._input_state = None
        self._last_char = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = None
        # ``reset`` assigns the real initial values of the state above.
        self.reset()

    def reset(self):
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b''
        # Probers that were already created are kept and reset, not rebuilt.
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str):
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.

        :param byte_str: The next chunk of the document; it is converted to
            a ``bytearray`` if it is not one already.  Empty chunks are
            ignored.
        """
        if self.done:
            return

        if not len(byte_str):
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF.  The four-byte
            # BOMs are tested before the two-byte UTF-16 ones because
            # FF FE 00 00 also starts with the UTF-16 little-endian BOM.
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {'encoding': "UTF-8-SIG",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith((codecs.BOM_UTF32_LE,
                                      codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {'encoding': "UTF-32",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {'encoding': "UTF-16",
                               'confidence': 1.0,
                               'language': ''}

            self._got_data = True
            if self.result['encoding'] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            # The last byte of the previous chunk is prepended so that a
            # "~{" split across two ``feed`` calls is still noticed.
            elif self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {'encoding':
                               self._esc_charset_prober.charset_name,
                               'confidence':
                               self._esc_charset_prober.get_confidence(),
                               'language':
                               self._esc_charset_prober.language}
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {'encoding': prober.charset_name,
                                   'confidence': prober.get_confidence(),
                                   'language': prober.language}
                    self.done = True
                    break
            # Remember whether any 0x80-0x9F byte was seen; ``close`` uses
            # this to choose between ISO-8859 and Windows charset names.
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self):
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug('no data received!')

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {'encoding': 'ascii',
                           'confidence': 1.0,
                           'language': ''}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                lower_charset_name = max_prober.charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith('iso-8859'):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
                                                            charset_name)
                self.result = {'encoding': charset_name,
                               'confidence': confidence,
                               'language': max_prober.language}

        # Log all prober confidences if none met MINIMUM_THRESHOLD.  The
        # check is ``<=`` so that loggers configured to be more verbose than
        # DEBUG also get these diagnostics.
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result['encoding'] is None:
                self.logger.debug('no probers hit minimum threshold')
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug('%s %s confidence = %s',
                                              prober.charset_name,
                                              prober.language,
                                              prober.get_confidence())
                    else:
                        # A stand-alone prober: report the prober itself,
                        # not whatever ``prober`` was last bound to.
                        self.logger.debug('%s %s confidence = %s',
                                          group_prober.charset_name,
                                          group_prober.language,
                                          group_prober.get_confidence())
        return self.result
---|