1 | ######################## BEGIN LICENSE BLOCK ######################## |
---|
2 | # The Original Code is Mozilla Universal charset detector code. |
---|
3 | # |
---|
4 | # The Initial Developer of the Original Code is |
---|
5 | # Netscape Communications Corporation. |
---|
6 | # Portions created by the Initial Developer are Copyright (C) 2001 |
---|
7 | # the Initial Developer. All Rights Reserved. |
---|
8 | # |
---|
9 | # Contributor(s): |
---|
10 | # Mark Pilgrim - port to Python |
---|
11 | # Shy Shalom - original C code |
---|
12 | # |
---|
13 | # This library is free software; you can redistribute it and/or |
---|
14 | # modify it under the terms of the GNU Lesser General Public |
---|
15 | # License as published by the Free Software Foundation; either |
---|
16 | # version 2.1 of the License, or (at your option) any later version. |
---|
17 | # |
---|
18 | # This library is distributed in the hope that it will be useful, |
---|
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
21 | # Lesser General Public License for more details. |
---|
22 | # |
---|
23 | # You should have received a copy of the GNU Lesser General Public |
---|
24 | # License along with this library; if not, write to the Free Software |
---|
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
---|
26 | # 02110-1301 USA |
---|
27 | ######################### END LICENSE BLOCK ######################### |
---|
28 | |
---|
29 | from .charsetgroupprober import CharSetGroupProber |
---|
30 | from .sbcharsetprober import SingleByteCharSetProber |
---|
31 | from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel, |
---|
32 | Latin5CyrillicModel, MacCyrillicModel, |
---|
33 | Ibm866Model, Ibm855Model) |
---|
34 | from .langgreekmodel import Latin7GreekModel, Win1253GreekModel |
---|
35 | from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel |
---|
36 | # from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel |
---|
37 | from .langthaimodel import TIS620ThaiModel |
---|
38 | from .langhebrewmodel import Win1255HebrewModel |
---|
39 | from .hebrewprober import HebrewProber |
---|
40 | from .langturkishmodel import Latin5TurkishModel |
---|
41 | |
---|
42 | |
---|
class SBCSGroupProber(CharSetGroupProber):
    """Group prober that bundles the single-byte charset probers.

    On construction it fills ``self.probers`` with one
    ``SingleByteCharSetProber`` per language model imported by this module
    (Cyrillic, Greek, Bulgarian, Thai, Turkish), followed by the three
    cooperating Hebrew probers, and then calls ``self.reset()``.
    """

    def __init__(self):
        super(SBCSGroupProber, self).__init__()

        # Models that need nothing beyond the model itself; kept in the
        # same order in which the probers were originally listed.
        standalone_models = (
            Win1251CyrillicModel,
            Koi8rModel,
            Latin5CyrillicModel,
            MacCyrillicModel,
            Ibm866Model,
            Ibm855Model,
            Latin7GreekModel,
            Win1253GreekModel,
            Latin5BulgarianModel,
            Win1251BulgarianModel,
            # TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
            #       after we retrain model.
            # Latin2HungarianModel,
            # Win1250HungarianModel,
            TIS620ThaiModel,
            Latin5TurkishModel,
        )
        self.probers = [SingleByteCharSetProber(model)
                        for model in standalone_models]

        # Hebrew uses one HebrewProber plus two Win1255 model probers that
        # each receive a reference to it. The boolean second argument is
        # False for the "logical" prober and True for the "visual" one
        # (presumably a "reversed" flag -- confirm in SingleByteCharSetProber).
        hebrew = HebrewProber()
        logical = SingleByteCharSetProber(Win1255HebrewModel, False, hebrew)
        visual = SingleByteCharSetProber(Win1255HebrewModel, True, hebrew)
        hebrew.set_model_probers(logical, visual)
        self.probers.extend((hebrew, logical, visual))

        # reset() is not defined in this class; presumably inherited from
        # CharSetGroupProber.
        self.reset()
---|