1 | ######################## BEGIN LICENSE BLOCK ######################## |
---|
2 | # The Original Code is Mozilla Universal charset detector code. |
---|
3 | # |
---|
4 | # The Initial Developer of the Original Code is |
---|
5 | # Netscape Communications Corporation. |
---|
6 | # Portions created by the Initial Developer are Copyright (C) 2001 |
---|
7 | # the Initial Developer. All Rights Reserved. |
---|
8 | # |
---|
9 | # Contributor(s): |
---|
10 | # Mark Pilgrim - port to Python |
---|
11 | # Shy Shalom - original C code |
---|
12 | # |
---|
13 | # This library is free software; you can redistribute it and/or |
---|
14 | # modify it under the terms of the GNU Lesser General Public |
---|
15 | # License as published by the Free Software Foundation; either |
---|
16 | # version 2.1 of the License, or (at your option) any later version. |
---|
17 | # |
---|
18 | # This library is distributed in the hope that it will be useful, |
---|
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
21 | # Lesser General Public License for more details. |
---|
22 | # |
---|
23 | # You should have received a copy of the GNU Lesser General Public |
---|
24 | # License along with this library; if not, write to the Free Software |
---|
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
---|
26 | # 02110-1301 USA |
---|
27 | ######################### END LICENSE BLOCK ######################### |
---|
28 | |
---|
29 | from .charsetprober import CharSetProber |
---|
30 | from .enums import ProbingState |
---|
31 | |
---|
# Number of likelihood categories a class transition can fall into; the
# transition model below only contains the values 0 .. FREQ_CAT_NUM - 1.
FREQ_CAT_NUM = 4

# Character classes.  They are numbered consecutively from zero so that a
# class can be used directly as a row/column index into Latin1ClassModel.
CLASS_NUM = 8  # total classes
(
    UDF,  # undefined
    OTH,  # other
    ASC,  # ascii capital letter
    ASS,  # ascii small letter
    ACV,  # accent capital vowel
    ACO,  # accent capital other
    ASV,  # accent small vowel
    ASO,  # accent small other
) = range(CLASS_NUM)

# Character class of every byte value, indexed by the byte (0x00 - 0xFF).
# The lower half is expressed as runs of identical classes; the upper half
# is spelled out eight bytes per row.
Latin1_CharToClass = (
    (OTH,) * 0x41    # 00 - 40
    + (ASC,) * 26    # 41 - 5A
    + (OTH,) * 6     # 5B - 60
    + (ASS,) * 26    # 61 - 7A
    + (OTH,) * 5     # 7B - 7F
    + (
        OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,  # 80 - 87
        OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,  # 88 - 8F
        UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 90 - 97
        OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,  # 98 - 9F
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A0 - A7
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A8 - AF
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B8 - BF
        ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,  # C0 - C7
        ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # C8 - CF
        ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,  # D0 - D7
        ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,  # D8 - DF
        ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,  # E0 - E7
        ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # E8 - EF
        ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,  # F0 - F7
        ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,  # F8 - FF
    )
)

# Likelihood of seeing one character class directly after another, stored
# as a flat tuple looked up with ``previous_class * CLASS_NUM + current_class``.
# Each row below is one "previous" class; the columns are the "current" class.
#   0 : illegal
#   1 : very unlikely
#   2 : normal
#   3 : very likely
Latin1ClassModel = (
    # UDF OTH ASC ASS ACV ACO ASV ASO
    (0, 0, 0, 0, 0, 0, 0, 0)      # UDF
    + (0, 3, 3, 3, 3, 3, 3, 3)    # OTH
    + (0, 3, 3, 3, 3, 3, 3, 3)    # ASC
    + (0, 3, 3, 3, 1, 1, 3, 3)    # ASS
    + (0, 3, 3, 3, 1, 2, 1, 2)    # ACV
    + (0, 3, 3, 3, 3, 3, 3, 3)    # ACO
    + (0, 3, 1, 3, 1, 1, 1, 3)    # ASV
    + (0, 3, 1, 3, 1, 1, 3, 3)    # ASO
)
---|
94 | |
---|
95 | |
---|
class Latin1Prober(CharSetProber):
    """Prober that scores input by the likelihood of adjacent character classes.

    Every byte is mapped to a character class through ``Latin1_CharToClass``.
    The transition from the previous class to the current one is then looked
    up in ``Latin1ClassModel`` and tallied per likelihood category.  A
    transition with likelihood 0 (illegal) rules this prober out.
    """

    def __init__(self):
        super(Latin1Prober, self).__init__()
        # Both attributes receive their real starting values in reset().
        self._last_char_class = None
        self._freq_counter = None
        self.reset()

    def reset(self):
        """Restore the initial state: no tallies yet, previous class ``OTH``."""
        self._last_char_class = OTH
        # One tally per likelihood category, indexed by the model value.
        self._freq_counter = [0] * FREQ_CAT_NUM
        CharSetProber.reset(self)

    @property
    def charset_name(self):
        """Name of the charset this prober reports."""
        return "ISO-8859-1"

    @property
    def language(self):
        """Language reported by this prober; always the empty string."""
        return ""

    def feed(self, byte_str):
        """Tally class transitions for ``byte_str`` and return the prober state.

        The input is first passed through ``self.filter_with_english_letters``.
        Processing stops at the first illegal transition, which also switches
        the state to ``ProbingState.NOT_ME``.
        """
        filtered = self.filter_with_english_letters(byte_str)
        for byte in filtered:
            current_class = Latin1_CharToClass[byte]
            model_index = self._last_char_class * CLASS_NUM + current_class
            likelihood = Latin1ClassModel[model_index]
            if likelihood == 0:
                # Illegal transition: this input cannot belong to this prober.
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[likelihood] += 1
            self._last_char_class = current_class

        return self.state

    def get_confidence(self):
        """Return the confidence derived from the tallied transitions.

        Returns 0.01 once the prober has been ruled out and 0.0 when nothing
        has been tallied.  Otherwise the share of "very likely" transitions is
        used, with each "very unlikely" transition weighted 20 times as a
        penalty; the result is floored at 0.0 and scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        observed = sum(self._freq_counter)
        if observed < 0.01:
            return 0.0

        very_likely = self._freq_counter[3]
        penalty = self._freq_counter[1] * 20.0
        score = max((very_likely - penalty) / observed, 0.0)
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
        return score * 0.73
---|