1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is Mozilla Universal charset detector code.
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 2001
7 # the Initial Developer. All Rights Reserved.
10 # Mark Pilgrim - port to Python
11 # Shy Shalom - original C code
13 # This library is free software; you can redistribute it and/or
14 # modify it under the terms of the GNU Lesser General Public
15 # License as published by the Free Software Foundation; either
16 # version 2.1 of the License, or (at your option) any later version.
18 # This library is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # Lesser General Public License for more details.
23 # You should have received a copy of the GNU Lesser General Public
24 # License along with this library; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 ######################### END LICENSE BLOCK #########################
29 Module containing the ``UniversalDetector`` class, which is the primary
30 class a user of ``chardet`` should use.
32 :author: Mark Pilgrim (initial port to Python)
33 :author: Shy Shalom (original C code)
34 :author: Dan Blanchard (major refactoring for 3.0)
42 from typing import List, Optional, Union
44 from .charsetgroupprober import CharSetGroupProber
45 from .charsetprober import CharSetProber
46 from .enums import InputState, LanguageFilter, ProbingState
47 from .escprober import EscCharSetProber
48 from .latin1prober import Latin1Prober
49 from .macromanprober import MacRomanProber
50 from .mbcsgroupprober import MBCSGroupProber
51 from .resultdict import ResultDict
52 from .sbcsgroupprober import SBCSGroupProber
53 from .utf1632prober import UTF1632Prober
56 class UniversalDetector:
58 The ``UniversalDetector`` class underlies the ``chardet.detect`` function
59 and coordinates all of the different charset probers.
61 To get a ``dict`` containing an encoding and its confidence, you can simply
66 u = UniversalDetector()
# Confidence floor used by ``close``: the best prober's confidence must be
# strictly greater than this value for its charset to be reported.
MINIMUM_THRESHOLD = 0.20
# Matches any byte in 0x80-0xFF. ``feed`` uses it to move the input state
# from PURE_ASCII to HIGH_BYTE.
HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
# Matches an ESC byte (octal 033) or the two-byte sequence "~{". ``feed``
# uses it to move the input state from PURE_ASCII to ESC_ASCII, which routes
# data to the escape-sequence prober (HZ / ISO-2022 encodings).
ESC_DETECTOR = re.compile(b"(\033|~{)")
# Matches any byte in 0x80-0x9F. ``feed`` records a hit in ``_has_win_bytes``,
# and ``close`` then maps an ISO-8859 result through ``ISO_WIN_MAP``.
WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
78 "iso-8859-1": "Windows-1252",
79 "iso-8859-2": "Windows-1250",
80 "iso-8859-5": "Windows-1251",
81 "iso-8859-6": "Windows-1256",
82 "iso-8859-7": "Windows-1253",
83 "iso-8859-8": "Windows-1255",
84 "iso-8859-9": "Windows-1254",
85 "iso-8859-13": "Windows-1257",
87 # Based on https://encoding.spec.whatwg.org/#names-and-labels
88 # but altered to match Python names for encodings and remove mappings
91 "ascii": "Windows-1252",
92 "iso-8859-1": "Windows-1252",
93 "tis-620": "ISO-8859-11",
94 "iso-8859-9": "Windows-1254",
102 lang_filter: LanguageFilter = LanguageFilter.ALL,
103 should_rename_legacy: bool = False,
105 self._esc_charset_prober: Optional[EscCharSetProber] = None
106 self._utf1632_prober: Optional[UTF1632Prober] = None
107 self._charset_probers: List[CharSetProber] = []
108 self.result: ResultDict = {
114 self._got_data = False
115 self._input_state = InputState.PURE_ASCII
116 self._last_char = b""
117 self.lang_filter = lang_filter
118 self.logger = logging.getLogger(__name__)
119 self._has_win_bytes = False
120 self.should_rename_legacy = should_rename_legacy
def input_state(self) -> int:
    """
    Return the detector's current input state.

    The value starts as ``InputState.PURE_ASCII`` (set on construction and
    by ``reset``) and is switched by ``feed`` to ``InputState.HIGH_BYTE`` or
    ``InputState.ESC_ASCII`` once matching bytes are seen.
    """
    return self._input_state
def has_win_bytes(self) -> bool:
    """
    Return whether any fed chunk matched ``WIN_BYTE_DETECTOR``.

    The flag is cleared by ``reset`` and set by ``feed`` when a chunk
    contains a byte in the range 0x80-0x9F. ``close`` consults it to decide
    whether an ISO-8859 result is renamed via ``ISO_WIN_MAP``.
    """
    return self._has_win_bytes
def charset_probers(self) -> List[CharSetProber]:
    """
    Return the list of probers used for high-byte input.

    This is the detector's own list object, not a copy. It is empty until
    ``feed`` encounters high bytes and populates it with the multi-byte,
    single-byte, Latin-1 and MacRoman probers.
    """
    return self._charset_probers
135 def reset(self) -> None:
137 Reset the UniversalDetector and all of its probers back to their
138 initial states. This is called by ``__init__``, so you only need to
139 call this directly in between analyses of different documents.
141 self.result = {"encoding": None, "confidence": 0.0, "language": None}
143 self._got_data = False
144 self._has_win_bytes = False
145 self._input_state = InputState.PURE_ASCII
146 self._last_char = b""
147 if self._esc_charset_prober:
148 self._esc_charset_prober.reset()
149 if self._utf1632_prober:
150 self._utf1632_prober.reset()
151 for prober in self._charset_probers:
154 def feed(self, byte_str: Union[bytes, bytearray]) -> None:
156 Takes a chunk of a document and feeds it through all of the relevant
159 After calling ``feed``, you can check the value of the ``done``
160 attribute to see if you need to continue feeding the
161 ``UniversalDetector`` more data, or if it has made a prediction
162 (in the ``result`` attribute).
165 You should always call ``close`` when you're done feeding in your
166 document if ``done`` is not already ``True``.
174 if not isinstance(byte_str, bytearray):
175 byte_str = bytearray(byte_str)
177 # First check for known BOMs, since these are guaranteed to be correct
178 if not self._got_data:
179 # If the data starts with BOM, we know it is UTF
180 if byte_str.startswith(codecs.BOM_UTF8):
181 # EF BB BF UTF-8 with BOM
183 "encoding": "UTF-8-SIG",
187 elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
188 # FF FE 00 00 UTF-32, little-endian BOM
189 # 00 00 FE FF UTF-32, big-endian BOM
190 self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
191 elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
192 # FE FF 00 00 UCS-4, unusual octet order BOM (3412)
194 # TODO: This encoding is not supported by Python. Should remove?
195 "encoding": "X-ISO-10646-UCS-4-3412",
199 elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
200 # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
202 # TODO: This encoding is not supported by Python. Should remove?
203 "encoding": "X-ISO-10646-UCS-4-2143",
207 elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
208 # FF FE UTF-16, little endian BOM
209 # FE FF UTF-16, big endian BOM
210 self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}
212 self._got_data = True
213 if self.result["encoding"] is not None:
217 # If none of those matched and we've only seen ASCII so far, check
218 # for high bytes and escape sequences
219 if self._input_state == InputState.PURE_ASCII:
220 if self.HIGH_BYTE_DETECTOR.search(byte_str):
221 self._input_state = InputState.HIGH_BYTE
223 self._input_state == InputState.PURE_ASCII
224 and self.ESC_DETECTOR.search(self._last_char + byte_str)
226 self._input_state = InputState.ESC_ASCII
228 self._last_char = byte_str[-1:]
230 # Next we will look to see if it appears to be either a UTF-16 or
232 if not self._utf1632_prober:
233 self._utf1632_prober = UTF1632Prober()
235 if self._utf1632_prober.state == ProbingState.DETECTING:
236 if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
238 "encoding": self._utf1632_prober.charset_name,
239 "confidence": self._utf1632_prober.get_confidence(),
245 # If we've seen escape sequences, use the EscCharSetProber, which
246 # uses a simple state machine to check for known escape sequences in
247 # HZ and ISO-2022 encodings, since those are the only encodings that
248 # use such sequences.
249 if self._input_state == InputState.ESC_ASCII:
250 if not self._esc_charset_prober:
251 self._esc_charset_prober = EscCharSetProber(self.lang_filter)
252 if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
254 "encoding": self._esc_charset_prober.charset_name,
255 "confidence": self._esc_charset_prober.get_confidence(),
256 "language": self._esc_charset_prober.language,
259 # If we've seen high bytes (i.e., those with values greater than 127),
260 # we need to do more complicated checks using all our multi-byte and
261 # single-byte probers that are left. The single-byte probers
262 # use character bigram distributions to determine the encoding, whereas
263 # the multi-byte probers use a combination of character unigram and
264 # bigram distributions.
265 elif self._input_state == InputState.HIGH_BYTE:
266 if not self._charset_probers:
267 self._charset_probers = [MBCSGroupProber(self.lang_filter)]
268 # If we're checking non-CJK encodings, use single-byte prober
269 if self.lang_filter & LanguageFilter.NON_CJK:
270 self._charset_probers.append(SBCSGroupProber())
271 self._charset_probers.append(Latin1Prober())
272 self._charset_probers.append(MacRomanProber())
273 for prober in self._charset_probers:
274 if prober.feed(byte_str) == ProbingState.FOUND_IT:
276 "encoding": prober.charset_name,
277 "confidence": prober.get_confidence(),
278 "language": prober.language,
282 if self.WIN_BYTE_DETECTOR.search(byte_str):
283 self._has_win_bytes = True
285 def close(self) -> ResultDict:
287 Stop analyzing the current document and come up with a final
290 :returns: The ``result`` attribute, a ``dict`` with the keys
291 ``encoding``, ``confidence``, and ``language``.
293 # Don't bother with checks if we're already done
298 if not self._got_data:
299 self.logger.debug("no data received!")
301 # Default to ASCII if it is all we've seen so far
302 elif self._input_state == InputState.PURE_ASCII:
303 self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}
305 # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
306 elif self._input_state == InputState.HIGH_BYTE:
307 prober_confidence = None
308 max_prober_confidence = 0.0
310 for prober in self._charset_probers:
313 prober_confidence = prober.get_confidence()
314 if prober_confidence > max_prober_confidence:
315 max_prober_confidence = prober_confidence
317 if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
318 charset_name = max_prober.charset_name
319 assert charset_name is not None
320 lower_charset_name = charset_name.lower()
321 confidence = max_prober.get_confidence()
322 # Use Windows encoding name instead of ISO-8859 if we saw any
323 # extra Windows-specific bytes
324 if lower_charset_name.startswith("iso-8859"):
325 if self._has_win_bytes:
326 charset_name = self.ISO_WIN_MAP.get(
327 lower_charset_name, charset_name
329 # Rename legacy encodings with superset encodings if asked
330 if self.should_rename_legacy:
331 charset_name = self.LEGACY_MAP.get(
332 (charset_name or "").lower(), charset_name
335 "encoding": charset_name,
336 "confidence": confidence,
337 "language": max_prober.language,
340 # Log all prober confidences if none met MINIMUM_THRESHOLD
341 if self.logger.getEffectiveLevel() <= logging.DEBUG:
342 if self.result["encoding"] is None:
343 self.logger.debug("no probers hit minimum threshold")
344 for group_prober in self._charset_probers:
347 if isinstance(group_prober, CharSetGroupProber):
348 for prober in group_prober.probers:
350 "%s %s confidence = %s",
353 prober.get_confidence(),
357 "%s %s confidence = %s",
358 group_prober.charset_name,
359 group_prober.language,
360 group_prober.get_confidence(),