Class | CharDet::HebrewProber |
In: |
lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb
|
Parent: | CharSetProber |
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 151
#
# Sets up a prober that has no model probers attached yet, then puts the
# final-letter bookkeeping into its initial state via reset.
def initialize
  super()
  # Both model probers start out unset; nothing in this method assigns them
  # a real prober.
  @_mVisualProber = nil
  @_mLogicalProber = nil
  reset
end
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 192
#
# Final letter analysis for the logical-vs-visual decision.
#
# Scans aBuf character by character looking for evidence that the text is
# either logical Hebrew or visual Hebrew. The following cases are checked:
#
# 1) A word longer than 1 letter, ending with a final letter. This is an
#    indication that the text is laid out "naturally" since the final letter
#    really appears at the end. +1 for logical score.
# 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
#    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi should not end with
#    the Non-Final form of that letter. Exceptions to this rule are described
#    in is_non_final. This is an indication that the text is laid out
#    backwards. +1 for visual score.
# 3) A word longer than 1 letter, starting with a final letter. Final letters
#    should not appear at the beginning of a word. This is an indication that
#    the text is laid out backwards. +1 for visual score.
#
# The visual score and logical score are accumulated throughout the text and
# are finally checked against each other in get_charset_name. No checking for
# final letters in the middle of words is done since that case is not an
# indication for either logical or visual text.
#
# The buffer is first passed through filter_high_bit_only so that word
# boundary detection works properly. [MAP]
#
# aBuf - the chunk of text to analyse.
#
# Returns ENotMe when get_state reports ENotMe, otherwise EDetecting.
def feed(aBuf)
  # Both model probers say it's not them. No reason to continue.
  return ENotMe if get_state == ENotMe

  aBuf = filter_high_bit_only(aBuf)

  # Walk the buffer one character at a time. Splitting on ' ' would hand us
  # whole words and never a lone space, so no word boundary would ever be
  # seen and neither score would ever change.
  # NOTE(review): the letter checks presumably expect each character of the
  # filtered buffer to be one byte of a single-byte Hebrew encoding - confirm
  # against filter_high_bit_only and the letter constants.
  aBuf.each_char do |cur|
    if cur == ' '
      # We stand on a space - a word just ended.
      if @_mBeforePrev != ' '
        # The next-to-last char was not a space, so @_mPrev is not a
        # one-letter word.
        if is_final(@_mPrev)
          # case (1) [-2:not space][-1:final letter][cur:space]
          @_mFinalCharLogicalScore += 1
        elsif is_non_final(@_mPrev)
          # case (2) [-2:not space][-1:Non-Final letter][cur:space]
          @_mFinalCharVisualScore += 1
        end
      end
    elsif @_mBeforePrev == ' ' && is_final(@_mPrev)
      # Not standing on a space.
      # case (3) [-2:space][-1:final letter][cur:not space]
      @_mFinalCharVisualScore += 1
    end

    # Slide the two-character history window forward; it is kept in instance
    # variables so it carries over to the next buffer.
    @_mBeforePrev = @_mPrev
    @_mPrev = cur
  end

  # Forever detecting, till the end or until both model probers return
  # ENotMe (handled above).
  EDetecting
end
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 252
#
# Decides whether the text is logical or visual Hebrew and returns the
# matching charset name (LOGICAL_HEBREW_NAME or VISUAL_HEBREW_NAME).
#
# The decision is made in three stages, stopping at the first that is
# conclusive:
# 1. the gap between the final-letter scores, if it reaches
#    MIN_FINAL_CHAR_DISTANCE in either direction;
# 2. the gap between the two model probers' confidences, if it exceeds
#    MIN_MODEL_DISTANCE in either direction;
# 3. the sign of the final-letter gap, with logical as the default.
def get_charset_name
  final_gap = @_mFinalCharLogicalScore - @_mFinalCharVisualScore

  # Stage 1: the final-letter evidence is dominant enough on its own.
  return LOGICAL_HEBREW_NAME if final_gap >= MIN_FINAL_CHAR_DISTANCE
  return VISUAL_HEBREW_NAME if final_gap <= -MIN_FINAL_CHAR_DISTANCE

  # Stage 2: fall back on the model probers. They are only consulted once
  # stage 1 has failed to decide.
  model_gap = @_mLogicalProber.get_confidence - @_mVisualProber.get_confidence
  return LOGICAL_HEBREW_NAME if model_gap > MIN_MODEL_DISTANCE
  return VISUAL_HEBREW_NAME if model_gap < -MIN_MODEL_DISTANCE

  # Stage 3: still undecided, so any negative final-letter gap means visual;
  # a positive gap or a tie defaults to logical.
  final_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 281
#
# Reports this prober's state: ENotMe only once both model probers report
# ENotMe, EDetecting as long as at least one of them reports anything else.
def get_state
  # all? stops at the first prober that is still in play, so the visual
  # prober is only asked when the logical one has already given up.
  both_gave_up = [@_mLogicalProber, @_mVisualProber].all? do |prober|
    prober.get_state == ENotMe
  end

  both_gave_up ? ENotMe : EDetecting
end
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 174
#
# Tells whether c is one of the five final-form letters (Kaf, Mem, Nun, Pe,
# Tsadi).
#
# c - the value to test against the FINAL_* constants.
#
# Returns true when c equals any of them, false otherwise.
def is_final(c)
  final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
  final_forms.any? { |letter| letter == c }
end
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 178
#
# Tells whether c is the normal (non-final) form of Kaf, Mem, Nun or Pe.
#
# The normal Tsadi is deliberately left out: words like 'lechotet' (to chat)
# carry an apostrophe after the tsadi, and once that apostrophe has been
# turned into a space by the filtering step the Non-Final tsadi looks as if
# it ends a word even though it does not in the original text.
#
# Pe and Kaf occasionally show a similar weakness - words like 'Pop',
# 'Winamp' and 'Mubarak' legitimately end with a Non-Final Pe or Kaf - but
# such words are rare enough that keeping these two letters does more good
# than harm.
#
# c - the value to test against the NORMAL_* constants.
#
# Returns true when c equals any of the four letters, false otherwise.
def is_non_final(c)
  non_final_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
  non_final_forms.any? { |letter| letter == c }
end
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb, line 158
#
# Puts the final-letter analysis back into its starting state.
#
# The two model probers are not touched here; they are owned by the group
# prober.
def reset
  # No evidence collected yet for either layout.
  @_mFinalCharLogicalScore, @_mFinalCharVisualScore = 0, 0

  # The two most recent characters seen. Both start out as a space so that
  # the very first character of the data looks like it follows a word
  # delimiter.
  @_mPrev = ' '
  @_mBeforePrev = ' '
end