Class CharDet::UniversalDetector
In: lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb
Parent: Object

Methods

close   feed   new   reset  

Attributes

result  [RW] 

Public Class methods

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb, line 38
    # Builds a detector with no probers instantiated yet and puts it into its
    # initial state by calling +reset+.
    #
    # Two regexps are prepared once and reused for every chunk:
    # * <tt>@_highBitDetector</tt> matches any byte with the high bit set.
    #   It carries the +n+ flag so the raw byte range is compiled as a binary
    #   pattern; without the flag Ruby 1.9+ refuses the literal with
    #   "invalid multibyte escape". On Ruby 1.8 the flag keeps the same
    #   byte-wise meaning.
    # * <tt>@_escDetector</tt> matches an ESC byte (octal 033) or the
    #   two-character sequence <tt>~{</tt>.
    def initialize
      @_highBitDetector = /[\x80-\xFF]/n
      @_escDetector = /(\033|\~\{)/
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

Public Instance methods

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb, line 134
    # Finishes detection and settles the final +result+.
    #
    # * Does nothing if detection already finished (<tt>@done</tt>).
    # * If no data was ever fed, optionally logs that fact (when <tt>$debug</tt>
    #   is set) and returns nil without marking the detector as done.
    # * Pure-ASCII input yields <tt>'ascii'</tt> with confidence 1.0.
    # * High-byte input yields the charset of the most confident prober,
    #   provided its confidence exceeds MINIMUM_THRESHOLD.
    #
    # Returns the result hash when one of the two cases above produced it,
    # otherwise nil (the existing <tt>@result</tt> is left untouched).
    def close
      return if @done
      unless @_mGotData
        $stderr << "no data received!\n" if $debug
        return
      end
      @done = true

      if @_mInputState == EPureAscii
        @result = {'encoding' => 'ascii', 'confidence' => 1.0}
        return @result
      end

      if @_mInputState == EHighbyte
        # Ask every prober for its confidence exactly once, then keep the best.
        scored = @_mCharSetProbers.map { |prober| [prober, prober.get_confidence] }
        best, best_confidence = scored.max { |a, b| a.last <=> b.last }
        if best && best_confidence > MINIMUM_THRESHOLD
          @result = {'encoding' => best.get_charset_name,
                     'confidence' => best_confidence}
          return @result
        end
      end

      return unless $debug

      $stderr << "no probers hit minimum threshhold\n"
      # The first prober is dumped through its _mProbers list. There may be no
      # probers at all (e.g. the input only ever reached the escape-sequence
      # state), so guard instead of calling a method on nil.
      group = @_mCharSetProbers.first
      if group.respond_to?(:_mProbers)
        (group._mProbers || []).each do |prober|
          next unless prober
          $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
        end
      end
      nil
    end

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb, line 61
    # Feeds one chunk of raw data to the detector.
    #
    # The first non-empty chunk is inspected for a Unicode byte-order mark; a
    # BOM settles the result immediately. Otherwise the chunk moves the input
    # state from pure ASCII to "high byte" (a byte >= 0x80 was seen) or
    # "escape ASCII" (an ESC byte or <tt>~{</tt> was seen, possibly split
    # across two chunks) and is handed to the matching prober(s). As soon as a
    # prober reports EFoundIt its answer is stored in <tt>@result</tt> and the
    # detector is marked done.
    #
    # aBuf:: String holding the next chunk of bytes. nil and empty chunks are
    #        ignored and do not count as received data.
    #
    # The return value is not meaningful; inspect +result+ instead.
    def feed(aBuf)
      return if @done
      # 0 is truthy in Ruby, so the length must be tested explicitly. Letting
      # an empty chunk through would set @_mLastChar to nil and break the
      # string concatenation on the next call.
      return if aBuf.nil? || aBuf.empty?

      unless @_mGotData
        # Compare numeric byte values rather than string slices so the check
        # is independent of the encoding tag carried by aBuf. Longer marks are
        # listed first because the UTF-16 marks are prefixes of UTF-32 ones.
        head = aBuf.unpack('C4')
        bom_table = [
          [[0xEF, 0xBB, 0xBF],       "UTF-8"],                  # EF BB BF
          [[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"],               # little-endian
          [[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"],               # big-endian
          [[0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"], # unusual order
          [[0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"], # unusual order
          [[0xFF, 0xFE],             "UTF-16LE"],               # little endian
          [[0xFE, 0xFF],             "UTF-16BE"]                # big endian
        ]
        bom = bom_table.find { |bytes, _name| head[0, bytes.length] == bytes }
        @result = {'encoding' => bom[1], 'confidence' => 1.0} if bom
      end

      @_mGotData = true
      if @result['encoding'] && @result['confidence'] > 0.0
        @done = true
        return
      end

      if @_mInputState == EPureAscii
        if @_highBitDetector =~ aBuf
          @_mInputState = EHighbyte
        elsif @_escDetector =~ (@_mLastChar + aBuf)
          # The previous chunk's last character is prepended so that "~{"
          # split across two chunks is still recognised.
          @_mInputState = EEscAscii
        end
      end

      @_mLastChar = aBuf[-1..-1]
      if @_mInputState == EEscAscii
        @_mEscCharSetProber ||= EscCharSetProber.new
        if @_mEscCharSetProber.feed(aBuf) == EFoundIt
          # Read the prober through the instance variable; the attribute list
          # shows no _mEscCharSetProber accessor for a self. call to use.
          @result = {'encoding' => @_mEscCharSetProber.get_charset_name,
                     'confidence' => @_mEscCharSetProber.get_confidence}
          @done = true
        end
      elsif @_mInputState == EHighbyte
        if @_mCharSetProbers.nil? || @_mCharSetProbers.empty?
          @_mCharSetProbers = [MBCSGroupProber.new, SBCSGroupProber.new, Latin1Prober.new]
        end
        @_mCharSetProbers.each do |prober|
          next unless prober.feed(aBuf) == EFoundIt
          @result = {'encoding' => prober.get_charset_name,
                     'confidence' => prober.get_confidence}
          @done = true
          break
        end
      end
    end

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb, line 46
    # Puts the detector back into its initial state so a new stream can be
    # analysed: clears the result, the done/got-data flags, the input state
    # and the remembered last character, then resets the escape prober (if
    # one was created) and every charset prober currently held.
    def reset
      @result = {'encoding' => nil, 'confidence' => 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = EPureAscii
      @_mLastChar = ''
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

[Validate]