1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10#   Mark Pilgrim - port to Python
11#   Shy Shalom - original C code
12#   Proofpoint, Inc.
13#
14# This library is free software; you can redistribute it and/or
15# modify it under the terms of the GNU Lesser General Public
16# License as published by the Free Software Foundation; either
17# version 2.1 of the License, or (at your option) any later version.
18#
19# This library is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22# Lesser General Public License for more details.
23#
24# You should have received a copy of the GNU Lesser General Public
25# License along with this library; if not, write to the Free Software
26# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27# 02110-1301  USA
28######################### END LICENSE BLOCK #########################
29
30import sys
31from . import constants
32from .charsetprober import CharSetProber
33
34
class MultiByteCharSetProber(CharSetProber):
    """Prober that combines a coding state machine with a distribution analyzer.

    Each input byte is pushed through ``_mCodingSM``; whenever the state
    machine returns to ``constants.eStart`` the two-byte window ending at the
    current byte is handed to ``_mDistributionAnalyzer`` together with the
    character length reported by the state machine.

    This class initialises both collaborators to ``None``. ``reset()``
    tolerates that, but ``feed()`` and ``get_confidence()`` use them
    unconditionally, so they must be assigned (e.g. by a subclass) before
    any data is fed.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        # Both collaborators start unset; see the class docstring.
        self._mDistributionAnalyzer = None
        self._mCodingSM = None
        # Two-slot window used when a character ends on the first byte of a
        # chunk: slot 0 keeps the last byte of the previous chunk, slot 1
        # receives the first byte of the current one.
        self._mLastChar = [0, 0]

    def reset(self):
        """Reset the base prober, any attached collaborators and the
        carried-over byte window."""
        CharSetProber.reset(self)
        if self._mCodingSM:
            self._mCodingSM.reset()
        if self._mDistributionAnalyzer:
            self._mDistributionAnalyzer.reset()
        self._mLastChar = [0, 0]

    def get_charset_name(self):
        """Placeholder that returns ``None``.

        ``feed()`` concatenates this value with a string when
        ``constants._debug`` is set, so a real name must be provided
        (presumably by overriding this method in a subclass) before
        debugging output is enabled.
        """
        pass

    def feed(self, aBuf):
        """Consume one chunk of input and return the prober state.

        ``aBuf`` is an indexable, sliceable byte sequence. An empty chunk
        carries no information and leaves the prober untouched; previously
        it raised ``IndexError`` when the last byte was recorded.

        Returns whatever ``self.get_state()`` reports after processing.
        """
        aLen = len(aBuf)
        if aLen == 0:
            # Nothing to analyse and no "last byte" to remember.
            return self.get_state()

        for i, byte in enumerate(aBuf):
            codingState = self._mCodingSM.next_state(byte)
            if codingState == constants.eError:
                # Byte sequence is illegal for this encoding: rule it out.
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                # State machine is certain; no need to look further.
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                # NOTE(review): eStart appears to signal that a character
                # has just been completed - confirm against the state
                # machine implementation.
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The character started in the previous chunk: pair the
                    # remembered byte (slot 0) with this chunk's first byte.
                    self._mLastChar[1] = aBuf[0]
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # Remember the final byte so a character split across two chunks
        # can be reassembled on the next call.
        self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            # Shortcut: enough samples plus high confidence is treated as a
            # positive identification.
            if (self._mDistributionAnalyzer.got_enough_data() and
                    (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the confidence reported by the distribution analyzer."""
        return self._mDistributionAnalyzer.get_confidence()
87