Tesseract
3.02
|
#include <language_model.h>
Public Member Functions | |
LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict) | |
~LanguageModel () | |
void | InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio, float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BlamerBundle *blamer_bundle, bool debug_blamer) |
void | CleanUp () |
void | DeleteState (BLOB_CHOICE_LIST *choices) |
LanguageModelFlagsType | UpdateState (LanguageModelFlagsType changed, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE_LIST *parent_list, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | GenerateNgramModelPainPointsFromColumn (int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record) |
void | GenerateProblematicPathPainPointsFromColumn (int col, int row, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record) |
void | GeneratePainPointsFromColumn (int col, const GenericVector< int > &non_empty_rows, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record) |
void | GeneratePainPointsFromBestChoice (HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle) |
bool | GeneratePainPoint (int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points) |
bool | AcceptableChoiceFound () |
void | GetWorstPieceCertainty (int col, int row, MATRIX *ratings, float *cert, bool *fragmented) |
float | ComputeOutlineLength (BLOB_CHOICE *b) |
Static Public Attributes | |
static const float | kInitialPainPointPriorityAdjustment = 5.0f |
static const float | kDefaultPainPointPriorityAdjustment = 2.0f |
static const float | kBestChoicePainPointPriorityAdjustment = 0.5f |
static const float | kCriticalPainPointPriorityAdjustment = 0.1f |
static const float | kMaxAvgNgramCost = 25.0f |
static const int | kMinFixedLengthDawgLength = 2 |
static const float | kLooseMaxCharWhRatio = 2.5f |
static const LanguageModelFlagsType | kSmallestRatingFlag = 0x1 |
static const LanguageModelFlagsType | kLowerCaseFlag = 0x2 |
static const LanguageModelFlagsType | kUpperCaseFlag = 0x4 |
static const LanguageModelFlagsType | kConsistentFlag = 0x8 |
static const LanguageModelFlagsType | kDawgFlag = 0x10 |
static const LanguageModelFlagsType | kNgramFlag = 0x20 |
static const LanguageModelFlagsType | kJustClassifiedFlag = 0x80 |
static const LanguageModelFlagsType | kAllChangedFlag = 0xff |
Protected Member Functions | |
float | CertaintyScore (float cert) |
bool | NonAlphaOrDigitMiddle (int col, int row, int dimension, UNICHAR_ID unichar_id) |
bool | IsFragment (BLOB_CHOICE *b) |
bool | IsHan (int script_id) |
void | GetPieceCertainty (BLOB_CHOICE_LIST *blist, float *cert, bool *fragmented) |
float | ComputeAdjustment (int num_problems, float penalty) |
float | ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info) |
float | ComputeConsistencyAdjustedRatingsSum (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info) |
float | ComputeAdjustedPathCost (float ratings_sum, int length, float dawg_score, const LanguageModelDawgInfo *dawg_info, const LanguageModelNgramInfo *ngram_info, const LanguageModelConsistencyInfo &consistency_info, const AssociateStats &associate_stats, ViterbiStateEntry *parent_vse) |
bool | ProblematicPath (const ViterbiStateEntry &vse, UNICHAR_ID unichar_id, bool word_end) |
void | GetTopChoiceLowerUpper (LanguageModelFlagsType changed, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper) |
LanguageModelFlagsType | AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | PrintViterbiStateEntry (const char *msg, ViterbiStateEntry *vse, BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record) |
void | GenerateTopChoiceInfo (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *b, LanguageModelFlagsType *top_choice_flags, LanguageModelFlagsType *changed) |
LanguageModelDawgInfo * | GenerateDawgInfo (bool word_end, int script_id, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse, LanguageModelFlagsType *changed) |
LanguageModelNgramInfo * | GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, LanguageModelFlagsType *changed) |
float | ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob) |
float | ComputeDenom (BLOB_CHOICE_LIST *curr_list) |
void | FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, CHUNKS_RECORD *chunks_record, LanguageModelConsistencyInfo *consistency_info) |
void | UpdateBestChoice (BLOB_CHOICE *b, ViterbiStateEntry *vse, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | ExtractRawFeaturesFromPath (const ViterbiStateEntry &vse, float *features) |
WERD_CHOICE * | ConstructWord (BLOB_CHOICE *b, ViterbiStateEntry *vse, CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, float certainties[], float *dawg_score, STATE *state, BlamerBundle *blamer_bundle, bool *truth_path) |
void | UpdateCoveredByFixedLengthDawgs (const DawgInfoVector &active_dawgs, int word_index, int word_length, int *skip, int *covered, float *dawg_score, bool *dawg_score_done) |
void | ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, AssociateStats *associate_stats) |
bool | PrunablePath (LanguageModelFlagsType top_choice_flags, const LanguageModelDawgInfo *dawg_info) |
bool | AcceptablePath (const ViterbiStateEntry &vse) |
Protected Attributes | |
DawgArgs * | dawg_args_ |
GenericVector< bool * > | updated_flags_ |
float | rating_cert_scale_ |
const UnicityTable< FontInfo > * | fontinfo_table_ |
Dict * | dict_ |
bool | fixed_pitch_ |
float | max_char_wh_ratio_ |
STRING | prev_word_str_ |
int | prev_word_unichar_step_len_ |
DawgInfoVector * | beginning_active_dawgs_ |
DawgInfoVector * | beginning_constraints_ |
DawgInfoVector * | fixed_length_beginning_active_dawgs_ |
DawgInfoVector * | empty_dawg_info_vec_ |
float | max_penalty_adjust_ |
bool | acceptable_choice_found_ |
bool | correct_segmentation_explored_ |
Definition at line 264 of file language_model.h.
tesseract::LanguageModel::LanguageModel | ( | const UnicityTable< FontInfo > * | fontinfo_table, |
Dict * | dict | ||
) |
Definition at line 43 of file language_model.cpp.
tesseract::LanguageModel::~LanguageModel | ( | ) |
Definition at line 131 of file language_model.cpp.
|
inline |
Definition at line 407 of file language_model.h.
|
inlineprotected |
Definition at line 731 of file language_model.h.
|
protected |
Definition at line 450 of file language_model.cpp.
|
inlineprotected |
Definition at line 440 of file language_model.h.
void tesseract::LanguageModel::CleanUp | ( | ) |
Definition at line 273 of file language_model.cpp.
|
protected |
Definition at line 1152 of file language_model.cpp.
|
inlineprotected |
Definition at line 485 of file language_model.h.
|
inlineprotected |
Definition at line 699 of file language_model.h.
|
inlineprotected |
Definition at line 518 of file language_model.h.
|
inlineprotected |
Definition at line 496 of file language_model.h.
|
protected |
Definition at line 962 of file language_model.cpp.
|
protected |
Definition at line 905 of file language_model.cpp.
|
inline |
Definition at line 434 of file language_model.h.
|
protected |
Definition at line 1375 of file language_model.cpp.
void tesseract::LanguageModel::DeleteState | ( | BLOB_CHOICE_LIST * | choices | ) |
Definition at line 280 of file language_model.cpp.
|
protected |
Definition at line 1330 of file language_model.cpp.
|
protected |
Definition at line 983 of file language_model.cpp.
|
protected |
Definition at line 740 of file language_model.cpp.
|
protected |
Definition at line 844 of file language_model.cpp.
void tesseract::LanguageModel::GenerateNgramModelPainPointsFromColumn | ( | int | col, |
int | row, | ||
HEAP * | pain_points, | ||
CHUNKS_RECORD * | chunks_record | ||
) |
Definition at line 1604 of file language_model.cpp.
bool tesseract::LanguageModel::GeneratePainPoint | ( | int | col, |
int | row, | ||
bool | ok_to_extend, | ||
float | priority_adjustment, | ||
float | worst_piece_cert, | ||
bool | fragmented, | ||
float | best_choice_cert, | ||
float | max_char_wh_ratio, | ||
BLOB_CHOICE * | parent_b, | ||
ViterbiStateEntry * | parent_vse, | ||
CHUNKS_RECORD * | chunks_record, | ||
HEAP * | pain_points | ||
) |
Definition at line 1923 of file language_model.cpp.
void tesseract::LanguageModel::GeneratePainPointsFromBestChoice | ( | HEAP * | pain_points, |
CHUNKS_RECORD * | chunks_record, | ||
BestChoiceBundle * | best_choice_bundle | ||
) |
Definition at line 1752 of file language_model.cpp.
void tesseract::LanguageModel::GeneratePainPointsFromColumn | ( | int | col, |
const GenericVector< int > & | non_empty_rows, | ||
float | best_choice_cert, | ||
HEAP * | pain_points, | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record | ||
) |
Definition at line 1581 of file language_model.cpp.
void tesseract::LanguageModel::GenerateProblematicPathPainPointsFromColumn | ( | int | col, |
int | row, | ||
float | best_choice_cert, | ||
HEAP * | pain_points, | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record | ||
) |
Definition at line 1668 of file language_model.cpp.
|
protected |
Definition at line 705 of file language_model.cpp.
|
inlineprotected |
Definition at line 472 of file language_model.h.
|
protected |
Definition at line 426 of file language_model.cpp.
|
inline |
Definition at line 413 of file language_model.h.
void tesseract::LanguageModel::InitForWord | ( | const WERD_CHOICE * | prev_word, |
bool | fixed_pitch, | ||
float | best_choice_cert, | ||
float | max_char_wh_ratio, | ||
float | rating_cert_scale, | ||
HEAP * | pain_points, | ||
CHUNKS_RECORD * | chunks_record, | ||
BlamerBundle * | blamer_bundle, | ||
bool | debug_blamer | ||
) |
Definition at line 141 of file language_model.cpp.
|
inlineprotected |
Definition at line 459 of file language_model.h.
|
inlineprotected |
Definition at line 463 of file language_model.h.
|
inlineprotected |
Definition at line 452 of file language_model.h.
|
protected |
Definition at line 669 of file language_model.cpp.
|
protected |
Definition at line 381 of file language_model.cpp.
|
inlineprotected |
Definition at line 719 of file language_model.h.
|
protected |
Definition at line 1184 of file language_model.cpp.
|
protected |
Definition at line 1527 of file language_model.cpp.
LanguageModelFlagsType tesseract::LanguageModel::UpdateState | ( | LanguageModelFlagsType | changed, |
int | curr_col, | ||
int | curr_row, | ||
BLOB_CHOICE_LIST * | curr_list, | ||
BLOB_CHOICE_LIST * | parent_list, | ||
HEAP * | pain_points, | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record, | ||
BestChoiceBundle * | best_choice_bundle, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 292 of file language_model.cpp.
|
protected |
Definition at line 844 of file language_model.h.
|
protected |
Definition at line 827 of file language_model.h.
|
protected |
Definition at line 828 of file language_model.h.
|
protected |
Definition at line 846 of file language_model.h.
|
protected |
Definition at line 786 of file language_model.h.
|
protected |
Definition at line 807 of file language_model.h.
|
protected |
Definition at line 830 of file language_model.h.
|
protected |
Definition at line 829 of file language_model.h.
|
protected |
Definition at line 814 of file language_model.h.
|
protected |
Definition at line 803 of file language_model.h.
|
static |
Definition at line 293 of file language_model.h.
|
static |
Definition at line 269 of file language_model.h.
|
static |
Definition at line 289 of file language_model.h.
|
static |
Definition at line 270 of file language_model.h.
|
static |
Definition at line 290 of file language_model.h.
|
static |
Definition at line 268 of file language_model.h.
|
static |
Definition at line 267 of file language_model.h.
|
static |
Definition at line 292 of file language_model.h.
|
static |
Definition at line 282 of file language_model.h.
|
static |
Definition at line 287 of file language_model.h.
|
static |
Definition at line 274 of file language_model.h.
|
static |
Definition at line 278 of file language_model.h.
|
static |
Definition at line 291 of file language_model.h.
|
static |
Definition at line 286 of file language_model.h.
|
static |
Definition at line 288 of file language_model.h.
int tesseract::LanguageModel::language_model_debug_level = 0 |
"Language model debug level"
Definition at line 738 of file language_model.h.
int tesseract::LanguageModel::language_model_fixed_length_choices_depth = 3 |
"Depth of blob choice lists to explore when fixed length dawgs are on"
Definition at line 766 of file language_model.h.
int tesseract::LanguageModel::language_model_min_compound_length = 3 |
"Minimum length of compound words"
Definition at line 763 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0 |
"Average classifier score of a non-matching unichar"
Definition at line 752 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_on = false |
"Turn on/off the use of character ngram model"
Definition at line 740 of file language_model.h.
int tesseract::LanguageModel::language_model_ngram_order = 8 |
"Maximum order of the character ngram model"
Definition at line 742 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03 |
"Strength of the character ngram model relative to the character classifier"
Definition at line 758 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001 |
"To avoid overly small denominators, use this as the floor of the probability returned by the ngram model"
Definition at line 750 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true |
"Words are delimited by space"
Definition at line 760 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false |
"Use only the first UTF8 step of the given string when computing log probabilities"
Definition at line 755 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_case = 0.1 |
"Penalty for inconsistent case"
Definition at line 775 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_chartype = 0.3 |
"Penalty for inconsistent character type"
Definition at line 779 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_font = 0.00 |
"Penalty for inconsistent font"
Definition at line 781 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_increment = 0.01 |
"Penalty increment"
Definition at line 784 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15 |
"Penalty for non-dictionary words"
Definition at line 771 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1 |
"Penalty for words not in the frequent word dictionary"
Definition at line 769 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_punc = 0.2 |
"Penalty for inconsistent punctuation"
Definition at line 773 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_script = 0.5 |
"Penalty for inconsistent script"
Definition at line 777 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_spacing = 0.05 |
"Penalty for inconsistent spacing"
Definition at line 783 of file language_model.h.
bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false |
"Use sigmoidal score for certainty"
Definition at line 786 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10 |
"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"
Definition at line 745 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500 |
"Maximum size of viterbi lists recorded in BLOB_CHOICEs"
Definition at line 747 of file language_model.h.
|
protected |
Definition at line 817 of file language_model.h.
|
protected |
Definition at line 832 of file language_model.h.
|
protected |
Definition at line 824 of file language_model.h.
|
protected |
Definition at line 825 of file language_model.h.
|
protected |
Definition at line 798 of file language_model.h.
|
protected |
Definition at line 796 of file language_model.h.