Class CharDet::HebrewProber
In: lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb
Parent: CharSetProber

Methods

Public Class methods

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 151
    def initialize
      # Let the parent prober set up its own state first.
      super()
      # The logical and visual model probers are not known yet; they are
      # supplied later through set_model_probers.
      @_mLogicalProber = @_mVisualProber = nil
      # Put the final-letter scores and the two-character look-behind window
      # into their initial state.
      reset
    end

Public Instance methods

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 192
    def feed(aBuf)
      # Final letter analysis for the logical-visual decision.
      #
      # Scans aBuf one character at a time, looking for evidence that the text
      # is either logical Hebrew or visual Hebrew, and returns the prober state
      # (ENotMe when get_state already reports ENotMe, otherwise EDetecting).
      #
      # The following cases are checked:
      # 1) A word longer than 1 letter, ending with a final letter. This is an
      #    indication that the text is laid out "naturally" since the final
      #    letter really appears at the end. +1 for logical score.
      # 2) A word longer than 1 letter, ending with a Non-Final letter. In
      #    normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi should
      #    not end with the Non-Final form of that letter (exceptions are
      #    described in is_non_final). This is an indication that the text is
      #    laid out backwards. +1 for visual score.
      # 3) A word longer than 1 letter, starting with a final letter. Final
      #    letters should not appear at the beginning of a word. This is an
      #    indication that the text is laid out backwards. +1 for visual score.
      #
      # The visual and logical scores are accumulated across calls and are
      # compared against each other in get_charset_name. Final letters in the
      # middle of words are not checked since that case is not an indication
      # for either logical or visual text.
      #
      # All 7-bit characters are filtered out (replaced with spaces) first so
      # that the word boundary detection works properly. [MAP]

      # Both model probers say it's not them. No reason to continue.
      return ENotMe if get_state == ENotMe

      aBuf = filter_high_bit_only(aBuf)

      # Walk the buffer character by character. Splitting on ' ' would yield
      # whole words and never a ' ' element, so the word-end branch below
      # could never run and @_mPrev/@_mBeforePrev would hold words instead of
      # single characters.
      aBuf.each_char do |cur|
        if cur == ' '
          # We stand on a space - a word just ended.
          if @_mBeforePrev != ' '
            # The next-to-last char was not a space, so @_mPrev is not a
            # 1 letter word.
            if is_final(@_mPrev)
              # case (1) [-2:not space][-1:final letter][cur:space]
              @_mFinalCharLogicalScore += 1
            elsif is_non_final(@_mPrev)
              # case (2) [-2:not space][-1:Non-Final letter][cur:space]
              @_mFinalCharVisualScore += 1
            end
          end
        elsif @_mBeforePrev == ' ' && is_final(@_mPrev)
          # case (3) [-2:space][-1:final letter][cur:not space]
          @_mFinalCharVisualScore += 1
        end
        # Slide the two-character window; it is kept in instance variables so
        # it carries over to the next buffer.
        @_mBeforePrev = @_mPrev
        @_mPrev = cur
      end

      # Forever detecting, till the end or until both model probers return
      # ENotMe (handled above).
      EDetecting
    end

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 252
    def get_charset_name
      # Decide between the logical and the visual Hebrew charset name.
      #
      # Step 1: trust the final-letter scores when their difference reaches
      # MIN_FINAL_CHAR_DISTANCE in either direction.
      score_gap = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
      return LOGICAL_HEBREW_NAME if score_gap >= MIN_FINAL_CHAR_DISTANCE
      return VISUAL_HEBREW_NAME if score_gap <= -MIN_FINAL_CHAR_DISTANCE

      # Step 2: the final-letter evidence is not dominant enough, so compare
      # the confidences reported by the two model probers instead.
      confidence_gap = @_mLogicalProber.get_confidence - @_mVisualProber.get_confidence
      return LOGICAL_HEBREW_NAME if confidence_gap > MIN_MODEL_DISTANCE
      return VISUAL_HEBREW_NAME if confidence_gap < -MIN_MODEL_DISTANCE

      # Step 3: still undecided - fall back to the sign of the final-letter
      # difference. A positive difference or a tie defaults to logical.
      score_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
    end

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 281
    def get_state
      # Report ENotMe only once both the logical and the visual model prober
      # have reported ENotMe; while either one has not, keep detecting.
      # The logical prober is asked first and the visual prober is only
      # consulted when the logical one has already given up.
      both_gave_up = [@_mLogicalProber, @_mVisualProber].all? do |prober|
        prober.get_state == ENotMe
      end
      both_gave_up ? ENotMe : EDetecting
    end

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 174
    def is_final(c)
      # True when c equals one of the five final-form letter constants
      # (Kaf, Mem, Nun, Pe, Tsadi); false otherwise.
      final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
      final_forms.any? { |letter| letter == c }
    end

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 178
    def is_non_final(c)
      # True when c equals the normal (non-final) form of Kaf, Mem, Nun or Pe;
      # false otherwise.
      #
      # The normal Tsadi is deliberately left out of the list: words such as
      # 'lechotet' (to chat) carry an apostrophe right after the tsadi, and
      # once that apostrophe has been replaced by a space during filtering the
      # Non-Final tsadi looks like it ends a word although it does not in the
      # original text.
      #
      # Pe and Kaf occasionally show a related weakness - words like 'Pop',
      # 'Winamp' and 'Mubarak' legally end with a Non-Final Pe or Kaf - but
      # such words are rare, so the benefit of keeping these letters outweighs
      # the damage.
      normal_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
      normal_forms.any? { |letter| letter == c }
    end

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 158
    def reset
      # Start both final-letter counters from zero.
      @_mFinalCharLogicalScore, @_mFinalCharVisualScore = 0, 0
      # @_mPrev and @_mBeforePrev remember the last two characters seen in the
      # previous buffer. Both begin as a space so that the start of the data
      # behaves as if it were preceded by a word delimiter.
      # The model probers are not touched here: they are owned by the group
      # prober.
      @_mPrev = ' '
      @_mBeforePrev = ' '
    end

[Source]

# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 169
    def set_model_probers(logicalProber, visualProber)
      # Remember the two model probers whose state and confidence this prober
      # consults (see get_state and get_charset_name).
      @_mLogicalProber, @_mVisualProber = logicalProber, visualProber
      # Evaluate to the visual prober, i.e. the value of the last assignment.
      visualProber
    end

[Validate]