Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil

List of all members.

Public Member Functions

 Tesseract ()
 ~Tesseract ()
void Clear ()
void ResetAdaptiveClassifier ()
void ResetDocumentDictionary ()
void SetEquationDetect (EquationDetect *detector)
const FCOORD & reskew () const
Pix ** mutable_pix_binary ()
Pix * pix_binary () const
Pix * pix_grey () const
void set_pix_grey (Pix *grey_pix)
Pix * BestPix () const
int source_resolution () const
void set_source_resolution (int ppi)
int ImageWidth () const
int ImageHeight () const
Pix * scaled_color () const
int scaled_factor () const
void SetScaledColor (int factor, Pix *color)
const Textord & textord () const
Textord * mutable_textord ()
bool right_to_left () const
int num_sub_langs () const
Tesseract * get_sub_lang (int index) const
void SetBlackAndWhitelist ()
void PrepareForPageseg ()
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
void SetupWordScripts (BLOCK_LIST *blocks)
int AutoPageSeg (bool single_column, bool osd, bool only_osd, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, Tesseract *osd_tess, OSResults *osr)
ColumnFinder * SetupPageSegAndDetectOrientation (bool single_column, bool osd, bool only_osd, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
void bigram_correction_pass (PAGE_RES *page_res)
void blamer_pass (PAGE_RES *page_res)
bool RetryWithLanguage (WERD_RES *word, BLOCK *block, ROW *row, WordRecognizer recognizer)
void classify_word_and_language (WordRecognizer recognizer, BLOCK *block, ROW *row, WERD_RES *word)
void classify_word_pass1 (BLOCK *block, ROW *row, WERD_RES *word)
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
void fix_rep_char (PAGE_RES_IT *page_res_it)
void ExplodeRepeatedWord (BLOB_CHOICE *best_choice, PAGE_RES_IT *page_res_it)
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
void match_word_pass2 (WERD_RES *word, ROW *row, BLOCK *block)
void classify_word_pass2 (BLOCK *block, ROW *row, WERD_RES *word)
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
BOOL8 recog_interactive (BLOCK *block, ROW *row, WERD_RES *word_res)
void set_word_fonts (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
void font_recognition_pass (PAGE_RES *page_res)
BOOL8 check_debug_pt (WERD_RES *word, int location)
bool init_cube_objects (bool load_combiner, TessdataManager *tessdata_manager)
void run_cube_combiner (PAGE_RES *page_res)
void cube_word_pass1 (BLOCK *block, ROW *row, WERD_RES *word)
CubeObject * cube_recognize_word (BLOCK *block, WERD_RES *word)
void cube_combine_word (CubeObject *cube_obj, WERD_RES *cube_word, WERD_RES *tess_word)
bool cube_recognize (CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
void fill_werd_res (const BoxWord &cube_box_word, WERD_CHOICE *cube_werd_choice, const char *cube_best_str, WERD_RES *tess_werd_res)
bool extract_cube_state (CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
bool create_cube_box_word (Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
void write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
void set_unlv_suspects (WERD_RES *word)
UNICHAR_ID get_rep_char (WERD_RES *word)
BOOL8 acceptable_number_string (const char *s, const char *lengths)
inT16 count_alphanums (const WERD_CHOICE &word)
inT16 count_alphas (const WERD_CHOICE &word)
void read_config_file (const char *filename, SetParamConstraint constraint)
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
void SetupUniversalFontIds ()
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language)
void recognize_page (STRING &image_name)
void end_tesseract ()
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
SVMenuNode * build_menu_new ()
void pgeditor_main (int width, int height, PAGE_RES *page_res)
void process_image_event (const SVEvent &event)
BOOL8 process_cmd_win_event (inT32 cmd_event, char *new_value)
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
void do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(BLOCK *block, ROW *row, WERD_RES *word_res))
BOOL8 word_display (BLOCK *block, ROW *row, WERD_RES *word_res)
BOOL8 word_bln_display (BLOCK *block, ROW *row, WERD_RES *word_res)
BOOL8 word_blank_and_set_display (BLOCK *block, ROW *row, WERD_RES *word_res)
BOOL8 word_set_display (BLOCK *block, ROW *row, WERD_RES *word_res)
BOOL8 word_dumper (BLOCK *block, ROW *row, WERD_RES *word_res)
void make_reject_map (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices, ROW *row, inT16 pass)
BOOL8 one_ell_conflict (WERD_RES *word_res, BOOL8 update_map)
inT16 first_alphanum_index (const char *word, const char *word_lengths)
inT16 first_alphanum_offset (const char *word, const char *word_lengths)
inT16 alpha_count (const char *word, const char *word_lengths)
BOOL8 word_contains_non_1_digit (const char *word, const char *word_lengths)
void dont_allow_1Il (WERD_RES *word)
inT16 count_alphanums (WERD_RES *word)
void flip_0O (WERD_RES *word)
BOOL8 non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
BOOL8 non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
BOOL8 repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
void nn_match_word (WERD_RES *word, ROW *row)
void nn_recover_rejects (WERD_RES *word, ROW *row)
BOOL8 test_ambig_word (WERD_RES *word)
void set_done (WERD_RES *word, inT16 pass)
inT16 safe_dict_word (const WERD_RES *werd_res)
void flip_hyphens (WERD_RES *word)
void reject_I_1_L (WERD_RES *word)
void reject_edge_blobs (WERD_RES *word)
void reject_mostly_rejects (WERD_RES *word)
BOOL8 word_adaptable (WERD_RES *word, uinT16 mode)
void recog_word_recursive (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
void recog_word (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
void split_and_recog_word (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
inT16 fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
void dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
GARBAGE_LEVEL garbage_word (WERD_RES *word, BOOL8 ok_dict_word)
BOOL8 potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
void tilde_crunch (PAGE_RES_IT &page_res_it)
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
void quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
void convert_bad_unlv_chs (WERD_RES *word_res)
void tilde_delete (PAGE_RES_IT &page_res_it)
inT16 word_blob_quality (WERD_RES *word, ROW *row)
void word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
void unrej_good_chs (WERD_RES *word, ROW *row)
inT16 count_outline_errs (char c, inT16 outline_count)
inT16 word_outline_errs (WERD_RES *word)
BOOL8 terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
CRUNCH_MODE word_deletable (WERD_RES *word, inT16 &delete_mode)
inT16 failure_count (WERD_RES *word)
BOOL8 noise_outlines (TWERD *word)
void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(BLOCK *block, ROW *row, WERD_RES *word_res))
void tess_segment_pass1 (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
void ReSegmentByClassification (PAGE_RES *page_res)
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
void SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
void TidyUp (PAGE_RES *page_res)
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
void CorrectClassifyWords (PAGE_RES *page_res)
void ApplyBoxTraining (const STRING &filename, PAGE_RES *page_res)
int CountMisfitTops (WERD_RES *word_res)
float ComputeCompatibleXheight (WERD_RES *word_res)
FILE * init_recog_training (const STRING &fname)
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
void ambigs_classify_and_output (WERD_RES *werd_res, ROW_RES *row_res, BLOCK_RES *block_res, const char *label, FILE *output_file)
CubeRecoContext * GetCubeRecoContext ()
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. words that are done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

BOOL8 digit_or_numeric_punct (WERD_RES *word, int char_position)
inT16 eval_word_spacing (WERD_RES_LIST &word_res_list)
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
inT16 worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
float blob_noise_score (TBLOB *blob)
void break_noisiest_blob_word (WERD_RES_LIST &words)
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters:
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
void fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
uniformly_spaced()

Return true if one of the following is true:

  • All inter-char gaps are the same width
  • The largest gap is no larger than twice the mean/median of the others
  • The largest gap is < normalised_max_nonspace

**** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
BOOL8 uniformly_spaced (WERD_RES *word)
BOOL8 fixspace_thinks_word_done (WERD_RES *word)
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
tess_segment_pass2

Segment a word using the pass2 conditions of the tess segmenter.

Parameters:
word: word to do
blob_choices: list of blob lists
void tess_segment_pass2 (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
tess_acceptable_word
Returns:
true if the word is regarded as "good enough".
Parameters:
word_choice: after context
raw_choice: before context
BOOL8 tess_acceptable_word (WERD_CHOICE *word_choice, WERD_CHOICE *raw_choice)
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
virtual ~Wordrec ()
void CopyCharChoices (const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to)
bool ChoiceIsCorrect (const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
void FillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
void CallFillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
void update_ratings (const BLOB_CHOICE_LIST_VECTOR &new_choices, const CHUNKS_RECORD *chunks_record, const SEARCH_STATE search_state)
void SegSearch (CHUNKS_RECORD *chunks_record, WERD_CHOICE *best_choice, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *raw_choice, STATE *output_best_state, BlamerBundle *blamer_bundle)
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, SEAMS seam_list)
SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, SEAMS seam_list)
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, WERD_RES *word_res, inT32 *blob_number, bool italic_blob, SEAMS seam_list)
void junk_worst_seam (SEAM_QUEUE seams, SEAM *new_seam, float new_priority)
void choose_best_seam (SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob)
void combine_seam (SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam)
inT16 constrained_split (SPLIT *split, TBLOB *blob)
void delete_seam_pile (SEAM_PILE seam_pile)
SEAM * pick_good_seam (TBLOB *blob)
PRIORITY seam_priority (SEAM *seam, inT16 xmin, inT16 xmax)
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob)
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob)
PRIORITY full_split_priority (SPLIT *split, inT16 xmin, inT16 xmax)
PRIORITY grade_center_of_blob (register BOUNDS_RECT rect)
PRIORITY grade_overlap (register BOUNDS_RECT rect)
PRIORITY grade_split_length (register SPLIT *split)
PRIORITY grade_sharpness (register SPLIT *split)
PRIORITY grade_width_change (register BOUNDS_RECT rect)
void set_outline_bounds (register EDGEPT *point1, register EDGEPT *point2, BOUNDS_RECT rect)
int crosses_outline (EDGEPT *p0, EDGEPT *p1, EDGEPT *outline)
int is_crossed (TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1)
int is_same_edgept (EDGEPT *p1, EDGEPT *p2)
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
void reverse_outline (EDGEPT *outline)
virtual BLOB_CHOICE_LIST * classify_piece (TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
void merge_fragments (MATRIX *ratings, inT16 num_blobs)
void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
BLOB_CHOICE_LIST * get_piece_rating (MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
TBOX * record_blob_bounds (TBLOB *blobs)
MATRIX * record_piece_ratings (TBLOB *blobs)
WIDTH_RECORD * state_char_widths (WIDTH_RECORD *chunk_widths, STATE *state, int num_joints)
FLOAT32 get_width_variance (WIDTH_RECORD *wrec, float norm_height)
FLOAT32 get_gap_variance (WIDTH_RECORD *wrec, float norm_height)
FLOAT32 prioritize_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
FLOAT32 width_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints)
FLOAT32 seamcut_priority (SEAMS seams, STATE *state, int num_joints)
FLOAT32 rating_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints)
void program_editup (const char *textbase, bool init_classifier, bool init_permute)
BLOB_CHOICE_LIST_VECTOR * cc_recog (WERD_RES *word)
void program_editdown (inT32 elasped_time)
void set_pass1 ()
void set_pass2 ()
int end_recog ()
BLOB_CHOICE_LIST * call_matcher (const DENORM *denorm, TBLOB *blob)
int dict_word (const WERD_CHOICE &word)
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle)
BLOB_CHOICE_LIST * fake_classify_blob (UNICHAR_ID class_id, float rating, float certainty)
void update_blob_classifications (TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices)
BLOB_CHOICE_LIST_VECTOR * evaluate_chunks (CHUNKS_RECORD *chunks_record, SEARCH_STATE search_state, BlamerBundle *blamer_bundle)
void best_first_search (CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_RES *word, STATE *state, DANGERR *fixpt, STATE *best_state)
void delete_search (SEARCH_RECORD *the_search)
inT16 evaluate_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search, DANGERR *fixpt, BlamerBundle *blamer_bundle)
BLOB_CHOICE_LIST_VECTOR * rebuild_current_state (WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *char_choices, MATRIX *ratings)
SEARCH_RECORD * new_search (CHUNKS_RECORD *chunks_record, int num_joints, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice, STATE *state)
void expand_node (FLOAT32 worst_priority, CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
void replace_char_widths (CHUNKS_RECORD *chunks_record, SEARCH_STATE state)
BLOB_CHOICE * rebuild_fragments (const char *unichar, const char *expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices)
BLOB_CHOICE_LIST * join_blobs_and_classify (WERD_RES *word, int x, int y, int choice_index, MATRIX *ratings, BLOB_CHOICE_LIST_VECTOR *old_choices)
STATE * pop_queue (HEAP *queue)
void push_queue (HEAP *queue, STATE *state, FLOAT32 worst_priority, FLOAT32 priority, bool debug)
PRIORITY point_priority (EDGEPT *point)
void add_point_to_list (POINT_GROUP point_list, EDGEPT *point)
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
int is_little_chunk (EDGEPT *point1, EDGEPT *point2)
int is_small_area (EDGEPT *point1, EDGEPT *point2)
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
void prioritize_points (TESSLINE *outline, POINT_GROUP points)
void new_min_point (EDGEPT *local_min, POINT_GROUP points)
void new_max_point (EDGEPT *local_max, POINT_GROUP points)
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
bool improve_one_blob (WERD_RES *word_res, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, DANGERR *fixpt, bool split_next_to_fragment, BlamerBundle *blamer_bundle)
void modify_blob_choice (BLOB_CHOICE_LIST *answer, int chop_index)
bool chop_one_blob (TWERD *word, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, int *right_chop_index)
bool chop_one_blob2 (const GenericVector< TBOX > &boxes, WERD_RES *word_res, SEAMS *seam_list)
BLOB_CHOICE_LIST_VECTOR * chop_word_main (WERD_RES *word)
void improve_by_chopping (WERD_RES *word, BLOB_CHOICE_LIST_VECTOR *char_choices, STATE *best_state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, bool *updated_best_choice)
MATRIX * word_associator (bool only_create_ratings_matrtix, WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, STATE *best_state)
inT16 select_blob_to_split (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_ceiling, bool split_next_to_fragment)
inT16 select_blob_to_split_from_fixpt (DANGERR *fixpt)
void set_chopper_blame (WERD_RES *word)
- Public Member Functions inherited from tesseract::Classify
 Classify ()
virtual ~Classify ()
Dict & getDict ()
const ShapeTable * shape_table () const
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, CP_RESULT_STRUCT *results)
void ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
ADAPT_TEMPLATES ReadAdaptedTemplates (FILE *File)
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
void FreeNormProtos ()
NORM_PROTOS * ReadNormProtos (FILE *File, inT64 end_offset)
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
void LearnWord (const char *filename, const char *rejmap, WERD_RES *word)
void LearnPieces (const char *filename, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
void InitAdaptiveClassifier (bool load_pre_trained_templates)
void InitAdaptedClass (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
void AdaptToPunc (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
void AmbigClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_CLASS *Classes, UNICHAR_ID *Ambiguities, ADAPT_RESULTS *Results)
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int num_classes, const TBOX &blob_box, CLASS_PRUNER_RESULTS results, ADAPT_RESULTS *final_results)
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, const uinT8 *cn_factors, INT_RESULT_STRUCT &int_result, ADAPT_RESULTS *final_results)
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, const uinT8 *cn_factors)
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void AddNewResult (ADAPT_RESULTS *results, CLASS_ID class_id, int shape_id, FLOAT32 rating, bool adapted, int config, int fontinfo_id, int fontinfo_id2)
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
void DebugAdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
void GetAdaptThresholds (TWERD *Word, const DENORM &denorm, const WERD_CHOICE &BestChoice, const WERD_CHOICE &BestRawChoice, FLOAT32 Thresholds[])
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, const DENORM &denorm, TBLOB *Blob)
void PrintAdaptiveMatchResults (FILE *File, ADAPT_RESULTS *Results)
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
void RemoveBadMatches (ADAPT_RESULTS *Results)
void SetAdaptiveThreshold (FLOAT32 Threshold)
void ShowBestMatchFor (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int shape_id, BOOL8 AdaptiveOn, BOOL8 PreTrainedOn, ADAPT_RESULTS *Results)
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
int ShapeIDToClassID (int shape_id) const
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int CharNormClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int CharNormTrainingSample (bool pruner_only, const TrainingSample &sample, GenericVector< ShapeRating > *results)
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, const DENORM &denorm, CLASS_ID CorrectClass)
void DoAdaptiveMatch (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
void AdaptToChar (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
void DisplayAdaptedChar (TBLOB *blob, const DENORM &denorm, INT_CLASS_STRUCT *int_class)
int AdaptableWord (TWERD *Word, const WERD_CHOICE &BestChoiceWord, const WERD_CHOICE &RawChoiceWord)
void EndAdaptiveClassifier ()
void PrintAdaptiveStatistics (FILE *File)
void SettupPass1 ()
void SettupPass2 ()
void AdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS cp_results)
void ClassifyAsNoise (ADAPT_RESULTS *Results)
void ResetAdaptiveClassifierInternal ()
int GetBaselineFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *CharNormArray, inT32 *BlobLength)
int GetCharNormFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *PrunerNormArray, uinT8 *CharNormArray, inT32 *BlobLength, inT32 *FeatureOutlineIndex)
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
void UpdateAmbigsGroup (CLASS_ID class_id, const DENORM &denorm, TBLOB *Blob)
void ResetFeaturesHaveBeenExtracted ()
bool AdaptiveClassifierIsFull ()
bool LooksLikeGarbage (const DENORM &denorm, TBLOB *blob)
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
void ClearCharNormArray (uinT8 *char_norm_array)
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
INT_TEMPLATES ReadIntTemplates (FILE *File)
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
void ShowMatchDisplay ()
UnicityTable< FontInfo > & get_fontinfo_table ()
UnicityTable< FontSet > & get_fontset_table ()
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
void ReadClassFile ()
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()
 ~CCStruct ()
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()
 ~CUtil ()
void read_variables (const char *filename, bool global_only)
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
virtual ~CCUtil ()
void main_setup (const char *argv0, const char *basename)
ParamsVectors * params ()

Public Attributes

bool tessedit_resegment_from_boxes = false
bool tessedit_resegment_from_line_boxes = false
bool tessedit_train_from_boxes = false
bool tessedit_make_boxes_from_boxes = false
bool tessedit_dump_pageseg_images = false
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
int tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY
char * tessedit_char_blacklist = ""
char * tessedit_char_whitelist = ""
bool tessedit_ambigs_training = false
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
char * tessedit_write_params_to_file = ""
bool tessedit_adapt_to_char_fragments = true
bool tessedit_adaption_debug = false
int bidi_debug = 0
int applybox_debug = 1
int applybox_page = 0
char * applybox_exposure_pattern = ".exp"
bool applybox_learn_chars_and_char_frags_mode = false
bool applybox_learn_ngrams_mode = false
bool tessedit_display_outwords = false
bool tessedit_training_tess = false
bool tessedit_dump_choices = false
bool tessedit_fix_fuzzy_spaces = true
bool tessedit_unrej_any_wd = false
bool tessedit_fix_hyphens = true
bool tessedit_redo_xheight = true
bool tessedit_enable_doc_dict = true
bool tessedit_debug_fonts = false
bool tessedit_debug_block_rejection = false
bool tessedit_enable_bigram_correction = false
int tessedit_bigram_debug = 0
int debug_x_ht_level = 0
bool debug_acceptable_wds = false
char * chs_leading_punct = "('`\""
char * chs_trailing_punct1 = ").,;:?!"
char * chs_trailing_punct2 = ")'`\""
double quality_rej_pc = 0.08
double quality_blob_pc = 0.0
double quality_outline_pc = 1.0
double quality_char_pc = 0.95
int quality_min_initial_alphas_reqd = 2
bool tessedit_tess_adapt_to_rejmap = false
int tessedit_tess_adaption_mode = 0x27
bool tessedit_minimal_rej_pass1 = false
bool tessedit_test_adaption = false
bool tessedit_matcher_log = false
int tessedit_test_adaption_mode = 3
bool save_blob_choices = false
bool test_pt = false
double test_pt_x = 99999.99
double test_pt_y = 99999.99
int paragraph_debug_level = 0
int cube_debug_level = 1
char * outlines_odd = "%| "
char * outlines_2 = "ij!?%\":;"
bool docqual_excuse_outline_errs = false
bool tessedit_good_quality_unrej = true
bool tessedit_use_reject_spaces = true
double tessedit_reject_doc_percent = 65.00
double tessedit_reject_block_percent = 45.00
double tessedit_reject_row_percent = 40.00
double tessedit_whole_wd_rej_row_percent = 70.00
bool tessedit_preserve_blk_rej_perfect_wds = true
bool tessedit_preserve_row_rej_perfect_wds = true
bool tessedit_dont_blkrej_good_wds = false
bool tessedit_dont_rowrej_good_wds = false
int tessedit_preserve_min_wd_len = 2
bool tessedit_row_rej_good_docs = true
double tessedit_good_doc_still_rowrej_wd = 1.1
bool tessedit_reject_bad_qual_wds = true
bool tessedit_debug_doc_rejection = false
bool tessedit_debug_quality_metrics = false
bool bland_unrej = false
double quality_rowrej_pc = 1.1
bool unlv_tilde_crunching = true
bool crunch_early_merge_tess_fails = true
bool crunch_early_convert_bad_unlv_chs = false
double crunch_terrible_rating = 80.0
bool crunch_terrible_garbage = true
double crunch_poor_garbage_cert = -9.0
double crunch_poor_garbage_rate = 60
double crunch_pot_poor_rate = 40
double crunch_pot_poor_cert = -8.0
bool crunch_pot_garbage = true
double crunch_del_rating = 60
double crunch_del_cert = -10.0
double crunch_del_min_ht = 0.7
double crunch_del_max_ht = 3.0
double crunch_del_min_width = 3.0
double crunch_del_high_word = 1.5
double crunch_del_low_word = 0.5
double crunch_small_outlines_size = 0.6
int crunch_rating_max = 10
int crunch_pot_indicators = 1
bool crunch_leave_ok_strings = true
bool crunch_accept_ok = true
bool crunch_leave_accept_strings = false
bool crunch_include_numerals = false
int crunch_leave_lc_strings = 4
int crunch_leave_uc_strings = 4
int crunch_long_repetitions = 3
int crunch_debug = 0
int fixsp_non_noise_limit = 1
double fixsp_small_outlines_size = 0.28
bool tessedit_prefer_joined_punct = false
int fixsp_done_mode = 1
int debug_fix_space_level = 0
char * numeric_punctuation = ".,"
int x_ht_acceptance_tolerance = 8
int x_ht_min_change = 8
bool tessedit_write_block_separators = false
bool tessedit_write_rep_codes = false
bool tessedit_write_unlv = false
bool tessedit_create_hocr = false
char * unrecognised_char = "|"
int suspect_level = 99
int suspect_space_level = 100
int suspect_short_words = 2
bool suspect_constrain_1Il = false
double suspect_rating_per_ch = 999.9
double suspect_accept_rating = -999.9
bool tessedit_minimal_rejection = false
bool tessedit_zero_rejection = false
bool tessedit_word_for_word = false
bool tessedit_zero_kelvin_rejection = false
bool tessedit_consistent_reps = true
int tessedit_reject_mode = 0
int tessedit_ok_mode = 5
bool tessedit_rejection_debug = false
bool tessedit_flip_0O = true
double tessedit_lower_flip_hyphen = 1.5
double tessedit_upper_flip_hyphen = 1.8
bool rej_trust_doc_dawg = false
bool rej_1Il_use_dict_word = false
bool rej_1Il_trust_permuter_type = true
bool rej_use_tess_accepted = true
bool rej_use_tess_blanks = true
bool rej_use_good_perm = true
bool rej_use_sensible_wd = false
bool rej_alphas_in_number_perm = false
double rej_whole_of_mostly_reject_word_fract = 0.85
int tessedit_image_border = 2
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
char * conflict_set_I_l_1 = "Il1[]"
int min_sane_x_ht_pixels = 8
bool tessedit_create_boxfile = false
int tessedit_page_number = -1
bool tessedit_write_images = false
bool interactive_display_mode = false
char * file_type = ".tif"
bool tessedit_override_permuter = true
int tessdata_manager_debug_level = 0
char * tessedit_load_sublangs = ""
double min_orientation_margin = 7.0
bool textord_tabfind_show_vlines = false
bool textord_use_cjk_fp_model = FALSE
bool tessedit_init_config_only = false
bool textord_equation_detect = false
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = TRUE
bool wordrec_no_block = FALSE
bool wordrec_enable_assoc = TRUE
bool force_word_assoc = FALSE
int wordrec_num_seg_states = 30
double wordrec_worst_state = 1
bool fragments_guide_chopper = FALSE
int repair_unchopped_blobs = 1
double tessedit_certainty_threshold = -2.25
int chop_debug = 0
bool chop_enable = 1
bool chop_vertical_creep = 0
int chop_split_length = 10000
int chop_same_distance = 2
int chop_min_outline_points = 6
int chop_inside_angle = -50
int chop_min_outline_area = 2000
double chop_split_dist_knob = 0.5
double chop_overlap_knob = 0.9
double chop_center_knob = 0.15
double chop_sharpness_knob = 0.06
double chop_width_change_knob = 5.0
double chop_ok_split = 100.0
double chop_good_split = 50.0
int chop_x_y_weight = 3
int segment_adjust_debug = 0
bool assume_fixed_pitch_char_segment = FALSE
bool use_new_state_cost = FALSE
double heuristic_segcost_rating_base = 1.25
double heuristic_weight_rating = 1
double heuristic_weight_width = 0
double heuristic_weight_seamcut = 0
double heuristic_max_char_wh_ratio = 2.0
int wordrec_debug_level = 0
bool wordrec_debug_blamer = false
bool wordrec_run_blamer = false
bool enable_new_segsearch = false
int segsearch_debug_level = 0
int segsearch_max_pain_points = 2000
int segsearch_max_futile_classifications = 10
double segsearch_max_char_wh_ratio = 2.0
double segsearch_max_fixed_pitch_char_wh_ratio = 2.0
bool save_alt_choices = false
LanguageModel * language_model_
PRIORITY pass2_ok_split
int pass2_seg_states
int num_joints
int num_pushed
int num_popped
BlobMatchTable blob_match_table
EVALUATION_ARRAY last_segmentation
WERD_CHOICE * prev_word_best_choice_
GenericVector< int > blame_reasons_
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
- Public Attributes inherited from tesseract::Classify
bool prioritize_division = FALSE
int tessedit_single_match = FALSE
bool classify_enable_learning = true
int classify_debug_level = 0
int classify_norm_method = character
double classify_char_norm_range = 0.2
double classify_min_norm_scale_x = 0.0
double classify_max_norm_scale_x = 0.325
double classify_min_norm_scale_y = 0.0
double classify_max_norm_scale_y = 0.325
bool tess_cn_matching = 0
bool tess_bn_matching = 0
bool classify_enable_adaptive_matcher = 1
bool classify_use_pre_adapted_templates = 0
bool classify_save_adapted_templates = 0
bool classify_enable_adaptive_debugger = 0
int matcher_debug_level = 0
int matcher_debug_flags = 0
int classify_learning_debug_level = 0
double matcher_good_threshold = 0.125
double matcher_great_threshold = 0.0
double matcher_perfect_threshold = 0.02
double matcher_bad_match_pad = 0.15
double matcher_rating_margin = 0.1
double matcher_avg_noise_size = 12.0
int matcher_permanent_classes_min = 1
int matcher_min_examples_for_prototyping = 3
int matcher_sufficient_examples_for_prototyping = 5
double matcher_clustering_max_angle_delta = 0.015
double classify_misfit_junk_penalty = 0.0
double rating_scale = 1.5
double certainty_scale = 20.0
double tessedit_class_miss_scale = 0.00390625
int classify_adapt_proto_threshold = 230
int classify_adapt_feature_threshold = 230
bool disable_character_fragments = TRUE
double classify_character_fragments_garbage_certainty_threshold = -3.0
bool classify_debug_character_fragments = FALSE
bool matcher_debug_separate_windows = FALSE
char * classify_learn_debug_str = ""
int classify_class_pruner_threshold = 229
int classify_class_pruner_multiplier = 30
int classify_cp_cutoff_strength = 7
int classify_integer_matcher_multiplier = 14
INT_TEMPLATES PreTrainedTemplates
ADAPT_TEMPLATES AdaptedTemplates
BIT_VECTOR AllProtosOn
BIT_VECTOR PrunedProtos
BIT_VECTOR AllConfigsOn
BIT_VECTOR AllProtosOff
BIT_VECTOR AllConfigsOff
BIT_VECTOR TempProtoMask
bool EnableLearning
NORM_PROTOS * NormProtos
UnicityTable< FontInfo > fontinfo_table_
UnicityTable< FontSet > fontset_table_
int il1_adaption_test = 0
bool classify_bln_numeric_mode = 0

Additional Inherited Members

- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
void UpdateSegSearchNodes (int starting_col, SEG_SEARCH_PENDING_LIST *pending[], BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const WERD_CHOICE *best_choice, SEG_SEARCH_PENDING_LIST *pending[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle)
void InitBlamerForSegSearch (const WERD_CHOICE *best_choice, CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
void FinishBlamerForSegSearch (const WERD_CHOICE *best_choice, BlamerBundle *blamer_bundle, STRING *blamer_debug)

Detailed Description

Definition at line 139 of file tesseractclass.h.


Constructor & Destructor Documentation

tesseract::Tesseract::Tesseract ( )

Definition at line 37 of file tesseractclass.cpp.

// Member-initializer list and (empty) body of the Tesseract constructor.
// Each parameter entry has the shape
//   XXX_MEMBER(name, default, "help text", this->params())
// In this listing many entries have lost their opening "XXX_MEMBER(name,
// default," line, so only the help-text string and the trailing
// this->params() remain for those parameters.
// After the parameter entries, the plain data members are initialised
// (pointers to NULL, source_resolution_ to 0, scaled_factor_ to -1,
// deskew_/reskew_ to (1.0f, 0.0f)).
"Take segmentation and labeling from box file",
this->params()),
"Conversion of word/line box file to char box file",
this->params()),
"Generate training data from boxed chars", this->params()),
"Generate more boxes from boxed chars", this->params()),
"Dump intermediate images made during page segmentation",
this->params()),
// The default for pageseg_mode is the old behaviour, so as not to
// upset anything that relies on that.
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
" 5=line, 6=word, 7=char"
" (Values from PageSegMode enum in publictypes.h)",
this->params()),
"Which OCR engine(s) to run (Tesseract, Cube, both)."
" Defaults to loading and running only Tesseract"
" (no Cube,no combiner)."
" Values from OcrEngineMode enum in tesseractclass.h)",
this->params()),
"Blacklist of chars not to recognize", this->params()),
"Whitelist of chars to recognize", this->params()),
"Perform training for ambiguities", this->params()),
"Whether to use the top-line splitting process for Devanagari "
"documents while performing page-segmentation.", this->params()),
"Whether to use the top-line splitting process for Devanagari "
"documents while performing ocr.", this->params()),
"Write all parameters to the given file.", this->params()),
"Adapt to words that contain "
" a character composed form fragments", this->params()),
BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug"
" information for adaption", this->params()),
INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
"Page number to apply boxes from", this->params()),
STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows"
" this pattern in the image filename. The name of the image"
" files are expected to be in the form"
" [lang].[fontname].exp[num].tif", this->params()),
"Learn both character fragments (as is done in the"
" special low exposure mode) as well as unfragmented"
" characters.", this->params()),
BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box"
" is assumed to contain ngrams. Only learn the ngrams"
" whose outlines overlap horizontally.", this->params()),
"Draw output words", this->params()),
"Call Tess to learn blobs", this->params()),
"Dump char choices", this->params()),
"Try to improve fuzzy spaces", this->params()),
"Dont bother with word plausibility", this->params()),
"Crunch double hyphens?", this->params()),
"Check/Correct x-height", this->params()),
"Add words to the document dictionary", this->params()),
"Output font info per char", this->params()),
"Block and Row stats", this->params()),
"Enable correction based on the word bigram dictionary.",
this->params()),
"Amount of debug output for bigram correction.",
this->params()),
INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
"Dump word pass/fail chk", this->params()),
"Leading punctuation", this->params()),
"1st Trailing punctuation", this->params()),
"2nd Trailing punctuation", this->params()),
"good_quality_doc lte rejection limit", this->params()),
"good_quality_doc gte good blobs limit", this->params()),
"good_quality_doc lte outline error limit", this->params()),
"good_quality_doc gte good char limit", this->params()),
"alphas in a good word", this->params()),
"Use reject map to control Tesseract adaption", this->params()),
"Adaptation decision algorithm for tess", this->params()),
"Do minimal rejection on pass 1 output", this->params()),
"Test adaption criteria", this->params()),
"Log matcher activity", this->params()),
"Adaptation decision algorithm for tess", this->params()),
"Save the results of the recognition step (blob_choices)"
" within the corresponding WERD_CHOICE", this->params()),
BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
this->params()),
// NOTE(review): the public-attribute listing on this page shows
// cube_debug_level = 1, while the default given here is 0 - confirm which
// value is intended.
INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
this->params()),
STRING_MEMBER(outlines_2, "ij!?%\":;",
"Non standard number of outlines", this->params()),
"Allow outline errs in unrejection?", this->params()),
"Reduce rejection on good docs", this->params()),
"Reject spaces?", this->params()),
"%rej allowed before rej whole doc", this->params()),
"%rej allowed before rej whole block", this->params()),
"%rej allowed before rej whole row", this->params()),
"Number of row rejects in whole word rejects"
"which prevents whole row rejection", this->params()),
"Only rej partially rejected words in block rejection",
this->params()),
"Only rej partially rejected words in row rejection",
this->params()),
"Use word segmentation quality metric", this->params()),
"Use word segmentation quality metric", this->params()),
"Only preserve wds longer than this", this->params()),
"Apply row rejection to good docs", this->params()),
"rej good doc wd if more than this fraction rejected",
this->params()),
"Reject all bad quality wds", this->params()),
"Page stats", this->params()),
"Output data to debug file", this->params()),
"unrej potential with no chekcs", this->params()),
"good_quality_doc gte good char limit", this->params()),
"Mark v.bad words for tilde crunch", this->params()),
"Before word crunch?", this->params()),
"Take out ~^ early?", this->params()),
"crunch rating lt this", this->params()),
BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
"crunch garbage cert lt this", this->params()),
"crunch garbage rating lt this", this->params()),
"POTENTIAL crunch rating lt this", this->params()),
"POTENTIAL crunch cert lt this", this->params()),
"POTENTIAL crunch garbage", this->params()),
"POTENTIAL crunch rating lt this", this->params()),
"POTENTIAL crunch cert lt this", this->params()),
"Del if word ht lt xht x this", this->params()),
"Del if word ht gt xht x this", this->params()),
"Del if word width lt xht x this", this->params()),
"Del if word gt xht x this above bl", this->params()),
"Del if word gt xht x this below bl", this->params()),
"Small if lt xht x this", this->params()),
"For adj length in rating per ch", this->params()),
"How many potential indicators needed", this->params()),
"Dont touch sensible strings", this->params()),
"Use acceptability in okstring", this->params()),
"Dont pot crunch sensible strings", this->params()),
"Fiddle alpha figures", this->params()),
"Dont crunch words with long lower case strings",
this->params()),
"Dont crunch words with long lower case strings",
this->params()),
"Crunch words with long repetitions", this->params()),
INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
"How many non-noise blbs either side?", this->params()),
"Small if lt xht x this", this->params()),
"Reward punctation joins", this->params()),
"What constitues done for spacing", this->params()),
"Contextual fixspace debug", this->params()),
"Punct. chs expected WITHIN numbers", this->params()),
"Max allowed deviation of blob top outside of font data",
this->params()),
"Min change in xht before actually trying it", this->params()),
"Write block separators in output", this->params()),
"Write repetition char code", this->params()),
"Write .unlv output file", this->params()),
"Write .html hOCR output file", this->params()),
"Output char for unidentified blobs", this->params()),
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
"Min suspect level for rejecting spaces", this->params()),
"Dont Suspect dict wds longer than this", this->params()),
"UNLV keep 1Il chars rejected", this->params()),
"Dont touch bad rating limit", this->params()),
"Accept good rating limit", this->params()),
"Only reject tess failures", this->params()),
"Dont reject ANYTHING", this->params()),
"Make output have exactly one word per WERD", this->params()),
"Dont reject ANYTHING AT ALL", this->params()),
"Force all rep chars the same", this->params()),
INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()),
"Acceptance decision algorithm", this->params()),
"Adaption debug", this->params()),
"Contextual 0O O0 flips", this->params()),
"Aspect ratio dot/hyphen test", this->params()),
"Aspect ratio dot/hyphen test", this->params()),
"Use DOC dawg in 11l conf. detector", this->params()),
"Use dictword test", this->params()),
"Dont double check", this->params()),
"Individual rejection control", this->params()),
"Individual rejection control", this->params()),
"Individual rejection control", this->params()),
"Extend permuter check", this->params()),
"Extend permuter check", this->params()),
"if >this fract", this->params()),
"Rej blbs near image edge limit", this->params()),
"Allow NN to unrej", this->params()),
"Il1 conflict set", this->params()),
"Reject any x-ht lt or eq than this", this->params()),
"Output text with boxes", this->params()),
INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages"
" , else specifc page to process", this->params()),
"Capture the image from the IPE", this->params()),
BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
this->params()),
STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
"According to dict_word", this->params()),
" TessdataManager functions.", this->params()),
"List of languages to load with this one", this->params()),
"Min acceptable orientation margin", this->params()),
BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
this->params()),
BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
this->params()),
"Only initialize with the config file. Useful if the "
"instance is not going to be used for OCR but say only "
"for layout analysis.", this->params()),
BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
this->params()),
// Non-parameter data members.
backup_config_file_(NULL),
pix_binary_(NULL),
cube_binary_(NULL),
pix_grey_(NULL),
source_resolution_(0),
textord_(this),
right_to_left_(false),
scaled_color_(NULL),
scaled_factor_(-1),
deskew_(1.0f, 0.0f),
reskew_(1.0f, 0.0f),
most_recently_used_(this),
font_table_size_(0),
cube_cntxt_(NULL),
tess_cube_combiner_(NULL),
equ_detect_(NULL) {
}
tesseract::Tesseract::~Tesseract ( )

Definition at line 398 of file tesseractclass.cpp.

{
  // Destructor: calls Clear(), deletes the entries held by sub_langs_, then
  // frees the two cube objects and nulls their pointers.
  Clear();
  sub_langs_.delete_data_pointers();

  // delete on a null pointer is a no-op, so no explicit NULL guard is needed.
  delete cube_cntxt_;
  cube_cntxt_ = NULL;

  delete tess_cube_combiner_;
  tess_cube_combiner_ = NULL;
}

Member Function Documentation

BOOL8 tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 485 of file output.cpp.

{
  // Decides whether the string s looks like an acceptable number.
  //   s        - the characters of the word, NUL terminated.
  //   lengths  - byte length of each character of s, one entry per character.
  // Accepted shape: an optional '(' and an optional one of "$.+-", then
  // digits in which a single '.', ',' or '-' may follow a digit, optionally
  // ending with '%', ')' or "%)". Returns TRUE if s matches, FALSE otherwise.
  BOOL8 prev_digit = FALSE;

  // Optional single-byte opening parenthesis. lengths is advanced together
  // with s so that the two stay paired when the loop below walks them.
  if (*lengths == 1 && *s == '(') {
    s++;
    lengths++;
  }
  // Optional single-byte currency sign, leading point or sign.
  if (*lengths == 1 &&
      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
    s++;
    lengths++;
  }

  for (; *s != '\0'; s += *(lengths++)) {
    if (unicharset.get_isdigit(s, *lengths)) {
      prev_digit = TRUE;
    } else if (prev_digit &&
               (*lengths == 1 &&
                ((*s == '.') || (*s == ',') || (*s == '-')))) {
      // A separator is only allowed directly after a digit.
      prev_digit = FALSE;
    } else if (prev_digit && *lengths == 1 &&
               (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')'))) {
      // Final '%' or ')' directly after a digit.
      return TRUE;
    } else if (prev_digit &&
               *lengths == 1 && (*s == '%') &&
               (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
               (*(s + *lengths + *(lengths + 1)) == '\0')) {
      // Final "%)" directly after a digit.
      return TRUE;
    } else {
      return FALSE;
    }
  }
  return TRUE;
}
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1284 of file control.cpp.

{
  // Classifies the word s as one of the ACCEPTABLE_WERD_TYPE values.
  //   char_set - unicharset used for the upper/lower case tests.
  //   s        - the characters of the word, NUL terminated.
  //   lengths  - byte length of each character of s, NUL terminated.
  // Returns AC_UPPER_CASE, AC_INITIAL_CAP, AC_LOWER_CASE, AC_UC_ABBREV or
  // AC_LC_ABBREV when the word fits the corresponding pattern, and
  // AC_UNACCEPTABLE otherwise (including for words of more than 20
  // characters).
  int i = 0;
  int offset = 0;
  int leading_punct_count;
  int upper_count = 0;
  int hyphen_pos = -1;
  // The declaration was missing from this listing. The early return and the
  // not_a_word path both read word_type before any assignment, so it has to
  // start out as AC_UNACCEPTABLE.
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

  if (strlen(lengths) > 20)
    return word_type;

  /* Single Leading punctuation char*/
  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
    offset += lengths[i++];
  leading_punct_count = i;

  /* Initial cap */
  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
    offset += lengths[i++];
    upper_count++;
  }
  if (upper_count > 1) {
    word_type = AC_UPPER_CASE;
  } else {
    /* Lower case word, possibly with an initial cap */
    while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
      offset += lengths[i++];
    }
    if (i - leading_punct_count < quality_min_initial_alphas_reqd)
      goto not_a_word;
    /*
    Allow a single hyphen in a lower case word
    - dont trust upper case - I've seen several cases of "H" -> "I-I"
    */
    if (lengths[i] == 1 && s[offset] == '-') {
      hyphen_pos = i;
      offset += lengths[i++];
      if (s[offset] != '\0') {
        while ((s[offset] != '\0') &&
               char_set.get_islower(s + offset, lengths[i])) {
          offset += lengths[i++];
        }
        // Require at least two lower case characters after the hyphen.
        if (i < hyphen_pos + 3)
          goto not_a_word;
      }
    } else {
      /* Allow "'s" in NON hyphenated lower case words */
      if (lengths[i] == 1 && (s[offset] == '\'') &&
          lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (upper_count > 0)
      word_type = AC_INITIAL_CAP;
    else
      word_type = AC_LOWER_CASE;
  }

  /* Up to two different, constrained trailing punctuation chars */
  // The final operand of each condition was missing from this listing. It is
  // restored with the same STRING(...).contains() test used for
  // chs_leading_punct above, against chs_trailing_punct1 / chs_trailing_punct2.
  if (lengths[i] == 1 && s[offset] != '\0' &&
      STRING(chs_trailing_punct1).contains(s[offset]))
    offset += lengths[i++];
  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
      s[offset - lengths[i - 1]] != s[offset] &&
      STRING(chs_trailing_punct2).contains(s[offset]))
    offset += lengths[i++];

  // Anything left over means the word did not fit the pattern.
  if (s[offset] != '\0')
    word_type = AC_UNACCEPTABLE;

not_a_word:

  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    i = 0;
    offset = 0;
    if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
      word_type = AC_UC_ABBREV;
      // Consume repeated "<upper case char>." pairs.
      while (s[offset] != '\0' &&
             char_set.get_isupper(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
      word_type = AC_LC_ABBREV;
      // Consume repeated "<lower case char>." pairs.
      while (s[offset] != '\0' &&
             char_set.get_islower(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (s[offset] != '\0')
      word_type = AC_UNACCEPTABLE;
  }

  return word_type;
}
inT16 tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 659 of file reject.cpp.

{
  // Returns how many characters of word are alphabetic according to
  // unicharset.get_isalpha(). word_lengths holds the byte length of each
  // character and is used to step through word.
  inT16 alphas = 0;
  inT16 pos = 0;
  inT16 idx = 0;

  while (word[pos] != '\0') {
    if (unicharset.get_isalpha (word + pos, word_lengths[idx])) {
      alphas++;
    }
    pos += word_lengths[idx];
    idx++;
  }
  return alphas;
}
void tesseract::Tesseract::ambigs_classify_and_output ( WERD_RES *  werd_res,
ROW_RES *  row_res,
BLOCK_RES *  block_res,
const char *  label,
FILE *  output_file 
)

Definition at line 163 of file recogtraining.cpp.

{
  // Classifies werd_res with classify_word_pass1() and writes the classifier
  // choices for it to output_file.
  //   werd_res, row_res, block_res - the word to classify and its row/block.
  //   label       - the expected text for the word (UTF-8).
  //   output_file - destination for the output lines.
  // For a single-unichar label classified as a single blob, one
  // tab-separated line is written per valid blob choice. In every case that
  // reaches the end, PrintAmbigAlternatives() is then called for the label.
  int offset;
  // Classify word.
  fflush(stdout);
  classify_word_pass1(block_res->block, row_res->row, werd_res);
  WERD_CHOICE *best_choice = werd_res->best_choice;
  ASSERT_HOST(best_choice != NULL);
  ASSERT_HOST(best_choice->blob_choices() != NULL);

  // Compute the number of unichars in the label.
  int label_num_unichars = 0;
  int step = 1;  // should be non-zero on the first iteration
  for (offset = 0; label[offset] != '\0' && step > 0;
       step = werd_res->uch_set->step(label + offset),
       offset += step, ++label_num_unichars);
  if (step == 0) {
    tprintf("Not outputting illegal unichar %s\n", label);
    return;
  }

  // Output all classifier choices for the unigrams (1->1 classifications).
  if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
    BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
    outer_blob_choice_it.set_to_list(best_choice->blob_choices());
    BLOB_CHOICE_IT blob_choice_it;
    blob_choice_it.set_to_list(outer_blob_choice_it.data());
    for (blob_choice_it.mark_cycle_pt();
         !blob_choice_it.cycled_list();
         blob_choice_it.forward()) {
      BLOB_CHOICE *blob_choice = blob_choice_it.data();
      if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
        // The format has four conversions (%s %s %.4f %.4f), but this listing
        // passed only three arguments, so the second %s consumed a float.
        // NOTE(review): the missing string argument is restored as the
        // recognised unichar, placed before the label - confirm the argument
        // and its position against recogtraining.cpp.
        fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
                unicharset.id_to_unichar(blob_choice->unichar_id()),
                label, blob_choice->rating(), blob_choice->certainty());
      }
    }
  }
  // Output raw choices for many->many and 1->many classifications.
  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
}
PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const STRING &  fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 111 of file applybox.cpp.

{
  // Reads the boxes for page applybox_page from the box file for fname and
  // uses them to resegment the words in block_list.
  //   fname             - name passed to OpenBoxFile().
  //   find_segmentation - if true, boxes are matched with ResegmentWordBox()
  //                       and the PAGE_RES is built afterwards; if false the
  //                       PAGE_RES is built first and boxes are matched with
  //                       ResegmentCharBox().
  //   block_list        - the blocks to resegment.
  // Returns the PAGE_RES produced by SetupApplyBoxes() after TidyUp().
  int box_count = 0;
  int box_failures = 0;

  FILE* box_file = OpenBoxFile(fname);
  TBOX box;
  // The declaration of boxes was missing from this listing. It is filled
  // with TBOX values below, in parallel with texts and full_texts.
  GenericVector<TBOX> boxes;
  GenericVector<STRING> texts, full_texts;

  bool found_box = true;
  while (found_box) {
    int line_number = 0;           // Line number of the box file.
    STRING text, full_text;
    found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box);
    if (found_box) {
      ++box_count;
      MakeBoxFileStr(text.string(), box, applybox_page, &full_text);
    } else {
      full_text = "";
    }
    // Note that an entry is also pushed after the final, failed read. That
    // trailing entry is what boxes[i + 1] refers to for the last real box,
    // and is why the loop below stops at size() - 1.
    boxes.push_back(box);
    texts.push_back(text);
    full_texts.push_back(full_text);
  }
  // All boxes have been read; release the file handle so it is not leaked.
  if (box_file != NULL) {
    fclose(box_file);
    box_file = NULL;
  }

  // In word mode, we use the boxes to make a word for each box, but
  // in blob mode we use the existing words and maximally chop them first.
  PAGE_RES* page_res = find_segmentation ?
      NULL : SetupApplyBoxes(boxes, block_list);
  clear_any_old_text(block_list);

  for (int i = 0; i < boxes.size() - 1; i++) {
    bool foundit = false;
    if (page_res != NULL) {
      if (i == 0) {
        // The first box has no previous box.
        foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
                                   full_texts[i].string());
      } else {
        foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
                                   boxes[i + 1], full_texts[i].string());
      }
    } else {
      foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
                                 texts[i].string());
    }
    if (!foundit) {
      box_failures++;
      ReportFailedBox(i, boxes[i], texts[i].string(),
                      "FAILURE! Couldn't find a matching blob");
    }
  }

  if (page_res == NULL) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
  }
  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf(" Boxes read from boxfile: %6d\n", box_count);
    if (box_failures > 0)
      tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
  }
  TidyUp(page_res);
  return page_res;
}
void tesseract::Tesseract::ApplyBoxTraining ( const STRING &  filename,
PAGE_RES *  page_res 
)

Definition at line 786 of file applybox.cpp.

{
  // Walks every word of page_res, passes each one to LearnWord() together
  // with filename, and reports how many words were handled.
  int num_learned = 0;
  PAGE_RES_IT res_it(page_res);
  WERD_RES *current = res_it.word();
  while (current != NULL) {
    LearnWord(filename.string(), NULL, current);
    num_learned++;
    current = res_it.forward();
  }
  tprintf("Generated training data for %d words\n", num_learned);
}
int tesseract::Tesseract::AutoPageSeg ( bool  single_column,
bool  osd,
bool  only_osd,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
Tesseract *  osd_tess,
OSResults *  osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If single_column is true, then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 218 of file pagesegmain.cpp.

{
  // Runs automatic page segmentation and moves the resulting blocks into
  // blocks (with the matching TO_BLOCKs in to_blocks).
  // Returns 0 on success, or -1 if ColumnFinder::FindBlocks() fails.
  //
  // NOTE(review): this listing had lost the opening line of the two debug
  // guards (their closing braces were left dangling). They are restored here
  // as "if (textord_debug_images)" - confirm the flag name against
  // pagesegmain.cpp.
  if (textord_debug_images) {
    WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
  }
  Pix* photomask_pix = NULL;
  Pix* musicmask_pix = NULL;
  // The blocks made by the ColumnFinder. Moved to blocks before return.
  BLOCK_LIST found_blocks;
  TO_BLOCK_LIST temp_blocks;

  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
      single_column, osd, only_osd, blocks, osd_tess, osr,
      &temp_blocks, &photomask_pix, &musicmask_pix);
  if (finder != NULL) {
    TO_BLOCK_IT to_block_it(&temp_blocks);
    TO_BLOCK* to_block = to_block_it.data();
    if (musicmask_pix != NULL) {
      // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
      // blocks separately. For now combine with photomask_pix.
      pixOr(photomask_pix, photomask_pix, musicmask_pix);
    }
    if (equ_detect_) {
      finder->SetEquationDetect(equ_detect_);
    }
    if (finder->FindBlocks(single_column, scaled_color_, scaled_factor_,
                           to_block, photomask_pix,
                           &found_blocks, to_blocks) < 0) {
      pixDestroy(&photomask_pix);
      pixDestroy(&musicmask_pix);
      // The finder is deleted on the success path below; delete it here too
      // so the failure path does not leak it.
      delete finder;
      return -1;
    }
    finder->GetDeskewVectors(&deskew_, &reskew_);
    delete finder;
  }
  pixDestroy(&photomask_pix);
  pixDestroy(&musicmask_pix);
  blocks->clear();
  BLOCK_IT block_it(blocks);
  // Move the found blocks to the input/output blocks.
  block_it.add_list_after(&found_blocks);

  if (textord_debug_images) {
    // The debug image is no longer needed so delete it.
    unlink(AlignedBlob::textord_debug_pix().string());
  }
  return 0;
}
Pix* tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 181 of file tesseractclass.h.

{
  // Returns pix_grey_ when it is set, otherwise pix_binary_.
  if (pix_grey_ != NULL) {
    return pix_grey_;
  }
  return pix_binary_;
}
void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES *  page_res)

Definition at line 419 of file control.cpp.

{
PAGE_RES_IT word_it(page_res);
WERD_RES *w_prev = NULL;
WERD_RES *w = word_it.word();
while (1) {
w_prev = w;
while (word_it.forward() != NULL &&
(!word_it.word() || word_it.word()->part_of_combo)) {
// advance word_it, skipping over parts of combos
}
if (!word_it.word()) break;
w = word_it.word();
if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
continue;
}
if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
tprintf("Skipping because one of the words is W_REP_CHAR\n");
}
continue;
}
// Two words sharing the same language model, excellent!
if (w->alt_choices.empty()) {
tprintf("Alt choices not set up for word choice: %s\n",
}
continue;
}
if (w_prev->alt_choices.empty()) {
tprintf("Alt choices not set up for word choice: %s\n",
}
continue;
}
// We saved alternate choices, excellent!
GenericVector<GenericVector<int> *> overrides_word1_state;
GenericVector<GenericVector<int> *> overrides_word2_state;
STRING orig_w1_str = w_prev->best_choice->unichar_string();
STRING orig_w2_str = w->best_choice->unichar_string();
WERD_CHOICE prev_best(w->uch_set);
{
int w1start, w1end;
w_prev->WithoutFootnoteSpan(&w1start, &w1end);
prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
}
WERD_CHOICE this_best(w->uch_set);
{
int w2start, w2end;
w->WithoutFootnoteSpan(&w2start, &w2end);
this_best = w->best_choice->shallow_copy(w2start, w2end);
}
if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
tprintf("Top choice \"%s %s\" verified by bigram model.\n",
orig_w1_str.string(), orig_w2_str.string());
}
continue;
}
tprintf("Examining alt choices for \"%s %s\".\n",
orig_w1_str.string(), orig_w2_str.string());
}
if (w_prev->alt_choices.size() > 1) {
}
if (w->alt_choices.size() > 1) {
}
}
float best_rating = 0.0;
int best_idx = 0;
for (int i = 0; i < w_prev->alt_choices.size(); i++) {
WERD_CHOICE *p1 = w_prev->alt_choices.get(i);
WERD_CHOICE strip1(w->uch_set);
{
int p1start, p1end;
w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i),
&p1start, &p1end);
strip1 = p1->shallow_copy(p1start, p1end);
}
for (int j = 0; j < w->alt_choices.size(); j++) {
WERD_CHOICE strip2(w->uch_set);
{
int p2start, p2end;
w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end);
strip2 = p2->shallow_copy(p2start, p2end);
}
if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
overrides_word1.push_back(p1);
overrides_word1_state.push_back(&w_prev->alt_states.get(i));
overrides_word2.push_back(p2);
overrides_word2_state.push_back(&w->alt_states.get(j));
if (overrides_word1.size() == 1 ||
p1->rating() + p2->rating() < best_rating) {
best_rating = p1->rating() + p2->rating();
best_idx = overrides_word1.size() - 1;
}
}
}
}
if (overrides_word1.size() >= 1) {
// Excellent, we have some bigram matches.
*overrides_word1[best_idx]) &&
*overrides_word2[best_idx])) {
tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
"model.\n", orig_w1_str.string(), orig_w2_str.string());
}
continue;
}
STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
if (new_w1_str != orig_w1_str) {
w_prev->ReplaceBestChoice(*overrides_word1[best_idx],
*overrides_word1_state[best_idx]);
}
if (new_w2_str != orig_w2_str) {
w->ReplaceBestChoice(*overrides_word2[best_idx],
*overrides_word2_state[best_idx]);
}
STRING choices_description;
int num_bigram_choices
= overrides_word1.size() * overrides_word2.size();
if (num_bigram_choices == 1) {
choices_description = "This was the unique bigram choice.";
} else {
STRING bigrams_list;
const int kMaxChoicesToPrint = 20;
for (int i = 0; i < overrides_word1.size() &&
i < kMaxChoicesToPrint; i++) {
if (i > 0) { bigrams_list += ", "; }
WERD_CHOICE *p1 = overrides_word1[i];
WERD_CHOICE *p2 = overrides_word2[i];
bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
if (i == kMaxChoicesToPrint) {
bigrams_list += " ...";
}
}
choices_description = "There were many choices: {";
choices_description += bigrams_list;
choices_description += "}";
} else {
choices_description.add_str_int("There were ", num_bigram_choices);
choices_description += " compatible bigrams.";
}
}
tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
orig_w1_str.string(), orig_w2_str.string(),
new_w1_str.string(), new_w2_str.string(),
choices_description.string());
}
}
}
}
void tesseract::Tesseract::blamer_pass ( PAGE_RES page_res)

Definition at line 684 of file control.cpp.

{
// Blamer bookkeeping over a whole page: visits every word, updates its
// blamer_bundle, then prints the per-reason blame counts and any entries in
// page_res->misadaption_log.
// NOTE(review): this listing contains unterminated calls and unbalanced
// braces (the SetBlame and ChoiceIsCorrect calls never close and `irr` is
// never declared), so some source lines appear to be missing here - compare
// with control.cpp before reusing it.
// Nothing to do unless the blamer is switched on.
if (!wordrec_run_blamer) return;
PAGE_RES_IT page_res_it(page_res);
for (page_res_it.restart_page(); page_res_it.word() != NULL;
page_res_it.forward()) {
WERD_RES *word = page_res_it.word();
// A word that has no bundle yet gets a fresh one.
if (word->blamer_bundle == NULL) {
word->blamer_bundle = new BlamerBundle();
word->blamer_bundle->debug += " to blame";
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
} else {
bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice,
if (irr == IRR_CORRECT && !correct) {
STRING debug = "Choice is incorrect after recognition";
word->best_choice,
} else if (irr != IRR_CORRECT && correct) {
tprintf("Corrected %s\n", word->blamer_bundle->debug.string());
}
word->blamer_bundle->debug = "";
}
}
}
// Summary output: one iteration per value below IRR_NUM_REASONS.
tprintf("Blame reasons:\n");
for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
static_cast<IncorrectResultReason>(bl)),
page_res->blame_reasons[bl]);
}
// The misadaption log is only printed when it has at least one entry.
if (page_res->misadaption_log.length() > 0) {
tprintf("Misadaption log:\n");
for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
tprintf("%s\n", page_res->misadaption_log[i].string());
}
}
}
float tesseract::Tesseract::blob_noise_score ( TBLOB blob)

Definition at line 844 of file fixspace.cpp.

{
  // Scores a blob from the size of its outlines: the score is the longer
  // bounding-box side of the biggest outline, doubled when the blob has more
  // than 5 outlines and halved when the blob's own box sits high or low
  // relative to kBlnBaselineOffset.
  inT16 num_outlines = 0;
  inT16 biggest_side = 0;
  TESSLINE* outline = blob->outlines;
  while (outline != NULL) {
    ++num_outlines;
    TBOX outline_box = outline->bounding_box();
    // Longer side of this outline's bounding box.
    inT16 longer_side = outline_box.width();
    if (outline_box.height() > outline_box.width())
      longer_side = outline_box.height();
    if (biggest_side < longer_side)
      biggest_side = longer_side;
    outline = outline->next;
  }
  // A blob made of lots of outlines is penalised.
  if (num_outlines > 5)
    biggest_side *= 2;
  // Be more lenient with blobs positioned high or low.
  TBOX blob_box = blob->bounding_box();
  if (blob_box.bottom() > kBlnBaselineOffset * 4 ||
      blob_box.top() < kBlnBaselineOffset / 2)
    biggest_side /= 2;
  return biggest_side;
}
void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.

Definition at line 699 of file fixspace.cpp.

{
WERD_RES_IT word_it(&words);
WERD_RES_IT worst_word_it;
float worst_noise_score = 9999;
int worst_blob_index = -1; // Noisiest blob of noisiest wd
int blob_index; // of wds noisiest blob
float noise_score; // of wds noisiest blob
WERD_RES *word_res;
C_BLOB_IT blob_it;
C_BLOB_IT rej_cblob_it;
C_BLOB_LIST new_blob_list;
C_BLOB_IT new_blob_it;
C_BLOB_IT new_rej_cblob_it;
WERD *new_word;
inT16 start_of_noise_blob;
inT16 i;
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
blob_index = worst_noise_blob(word_it.data(), &noise_score);
if (blob_index > -1 && worst_noise_score > noise_score) {
worst_noise_score = noise_score;
worst_blob_index = blob_index;
worst_word_it = word_it;
}
}
if (worst_blob_index < 0) {
words.clear(); // signal termination
return;
}
/* Now split the worst_word_it */
word_res = worst_word_it.data();
/* Move blobs before noise blob to a new bloblist */
new_blob_it.set_to_list(&new_blob_list);
blob_it.set_to_list(word_res->word->cblob_list());
for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
new_blob_it.add_after_then_move(blob_it.extract());
}
start_of_noise_blob = blob_it.data()->bounding_box().left();
delete blob_it.extract(); // throw out noise blob
new_word = new WERD(&new_blob_list, word_res->word);
new_word->set_flag(W_EOL, FALSE);
word_res->word->set_flag(W_BOL, FALSE);
word_res->word->set_blanks(1); // After break
new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
for (;
(!rej_cblob_it.empty() &&
(rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
rej_cblob_it.forward()) {
new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
}
WERD_RES* new_word_res = new WERD_RES(new_word);
new_word_res->combination = TRUE;
worst_word_it.add_before_then_move(new_word_res);
word_res->ClearResults();
}
SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 256 of file pgedit.cpp.

{
  // Builds the menu tree: a root node with three sub-menus (MODES, DISPLAY,
  // OTHER) whose entries are bound to command events. The caller receives
  // the newly allocated root node.
  SVMenuNode* root = new SVMenuNode();

  // MODES sub-menu.
  SVMenuNode* modes = root->AddChild("MODES");
  modes->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
  modes->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
  modes->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
  modes->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
  modes->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
  modes->AddChild("Recog Words", RECOG_WERDS);
  modes->AddChild("Recog Blobs", RECOG_PSEUDO);

  // DISPLAY sub-menu: the first six entries pass an initial TRUE/FALSE flag.
  SVMenuNode* display = root->AddChild("DISPLAY");
  display->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
  display->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
  display->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
  display->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
  display->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE);
  display->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE);
  display->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
  display->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
  display->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
  display->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
  display->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
  display->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
  display->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
  display->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
  display->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);

  // OTHER sub-menu.
  SVMenuNode* other = root->AddChild("OTHER");
  other->AddChild("Quit", QUIT_CMD_EVENT);
  other->AddChild("Show Image", IMAGE_CMD_EVENT, FALSE);
  other->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, FALSE);
  other->AddChild("Show Baselines", BASELINES_CMD_EVENT, FALSE);
  other->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
  other->AddChild("Refresh Display", REFRESH_CMD_EVENT);

  return root;
}
BOOL8 tesseract::Tesseract::check_debug_pt ( WERD_RES word,
int  location 
)

Definition at line 1388 of file control.cpp.

{
// Debug hook: prints the name of the processing stage selected by `location`
// followed by details of `word`.
// NOTE(review): this listing looks truncated - the tprintf call right after
// the switch is never closed, and the trailing "} else" has no visible
// matching "if" - so compare with control.cpp before relying on it.
BOOL8 show_map_detail = FALSE;
inT16 i;
#ifndef SECURE_NAMES
// Without a test point there is nothing to report.
if (!test_pt)
return FALSE;
debug_x_ht_level.set_value (0);
if (location < 0)
return TRUE; // For breakpoint use
debug_x_ht_level.set_value (20);
tprintf ("\n\nTESTWD::");
// Each location code prints one stage label; codes 50 and 120 also request
// the per-character dump further down.
switch (location) {
case 0:
tprintf ("classify_word_pass1 start\n");
word->word->print();
break;
case 10:
tprintf ("make_reject_map: initial map");
break;
case 20:
tprintf ("make_reject_map: after NN");
break;
case 30:
tprintf ("classify_word_pass2 - START");
break;
case 40:
tprintf ("classify_word_pass2 - Pre Xht");
break;
case 50:
tprintf ("classify_word_pass2 - END");
show_map_detail = TRUE;
break;
case 60:
tprintf ("fixspace");
break;
case 70:
tprintf ("MM pass START");
break;
case 80:
tprintf ("MM pass END");
break;
case 90:
tprintf ("After Poor quality rejection");
break;
case 100:
tprintf ("unrej_good_quality_words - START");
break;
case 110:
tprintf ("unrej_good_quality_words - END");
break;
case 120:
tprintf ("Write results pass");
show_map_detail = TRUE;
break;
}
tprintf(" \"%s\" ",
tprintf ("\n");
// Detailed dump: the best choice string, then each of its chars on a line.
if (show_map_detail) {
tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
}
}
tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
return TRUE;
}
else
#endif
// Reached directly when SECURE_NAMES is defined.
return FALSE;
}
void tesseract::Tesseract::classify_word_and_language ( WordRecognizer  recognizer,
BLOCK block,
ROW row,
WERD_RES word 
)

Definition at line 795 of file control.cpp.

{
// Runs `recognizer` on the word through most_recently_used_ and, when the
// result failed or was not accepted, retries with this Tesseract and then
// with each entry of sub_langs_ until one of them is accepted.
// NOTE(review): the braces in this listing do not balance (several tprintf
// blocks close a scope that is never opened), so the conditions guarding the
// debug output appear to be missing here - check control.cpp.
tprintf("Processing word with lang %s at:",
most_recently_used_->lang.string());
word->word->bounding_box().print();
}
const char* result_type = "Initial";
bool initially_done = !word->tess_failed && word->done;
if (initially_done) {
// If done on pass1, we reuse the tesseract that did it, and don't try
// any more. The only need to call the classifier at all is for the
// cube combiner and xheight fixing (which may be bogus on a done word.)
most_recently_used_ = word->tesseract;
result_type = "Already done";
}
// Invoke the recognizer member function on the selected Tesseract instance.
(most_recently_used_->*recognizer)(block, row, word);
if (!word->tess_failed && word->tess_accepted)
result_type = "Accepted";
tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
result_type,
word->best_choice->rating(),
}
if (word->tess_failed || !word->tess_accepted) {
// Try all the other languages to see if they are any better.
Tesseract* previous_used = most_recently_used_;
if (most_recently_used_ != this) {
tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
}
if (RetryWithLanguage(word, block, row, recognizer)) {
most_recently_used_ = this;
if (!word->tess_failed && word->tess_accepted)
return; // No need to look at the others.
}
}
// Sub-languages: the one that was already used above is skipped.
for (int i = 0; i < sub_langs_.size(); ++i) {
if (sub_langs_[i] != previous_used) {
tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
i, sub_langs_[i]->lang.string());
}
if (sub_langs_[i]->RetryWithLanguage(word, block, row, recognizer)) {
most_recently_used_ = sub_langs_[i];
if (!word->tess_failed && word->tess_accepted)
return; // No need to look at the others.
}
}
}
}
}
void tesseract::Tesseract::classify_word_pass1 ( BLOCK block,
ROW row,
WERD_RES word 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 860 of file control.cpp.

{
// First-pass classification of one word: segments it with
// tess_segment_pass1, applies the quote/hyphen fixes, builds the reject map,
// optionally feeds the word to LearnWord for adaption, and finally stores
// blob_choices on the best choice.
// NOTE(review): this listing is truncated - several conditions and calls are
// only partly present (e.g. the guard around cube_word_pass1, the dangling
// "row, block))" and the assignment of adapt_ok), and the braces do not
// balance - so consult control.cpp for the exact control flow.
// If we only intend to run cube - run it and return.
cube_word_pass1(block, row, word);
return;
}
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
BOOL8 adapt_ok;
const char *rejmap;
inT16 index;
STRING mapstr = "";
check_debug_pt(word, 0);
row, block))
tess_segment_pass1(word, blob_choices);
if (!word->tess_failed) {
/*
The adaption step used to be here. It has been moved to after
make_reject_map so that we know whether the word will be accepted in the
first pass or not. This move will PREVENT adaption to words containing
double quotes because the word will not be identical to what tess thinks
its best choice is. (See CurrentBestChoiceIs in
stopper.cpp which is used by AdaptableWord in
adaptmatch.cpp)
*/
if (!word->word->flag(W_REP_CHAR)) {
// TODO(daria) delete these hacks when replaced by more generic code.
// Convert '' (double single) to " (single double).
word->fix_quotes(blob_choices);
if (tessedit_fix_hyphens) // turn -- to -
word->fix_hyphens(blob_choices);
word->raw_choice);
word->tess_would_adapt = word->best_choice && word->raw_choice &&
*word->best_choice,
*word->raw_choice);
// Also sets word->done flag
make_reject_map(word, blob_choices, row, 1);
// rejmap stays NULL in the first branch; otherwise it is a string holding
// one '1' (accepted) or '0' (rejected) per reject_map entry.
if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
rejmap = NULL;
} else {
word->best_choice->length());
for (index = 0; index < word->reject_map.length(); index++) {
if (adapt_ok || word->reject_map[index].accepted())
mapstr += '1';
else
mapstr += '0';
}
rejmap = mapstr.string();
}
// Send word to adaptive classifier for training.
set_word_fonts(word, blob_choices);
LearnWord(NULL, rejmap, word);
// Mark misadaptions if running blamer.
if (word->blamer_bundle != NULL &&
word->blamer_bundle->misadaption_debug ="misadapt to word (";
}
}
}
}
}
// Save best choices in the WERD_CHOICE if needed
word->best_choice->set_blob_choices(blob_choices);
}
void tesseract::Tesseract::classify_word_pass2 ( BLOCK block,
ROW row,
WERD_RES word 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1026 of file control.cpp.

{
// Second-pass handling of one word: re-matches it when it is not done (or
// when tessedit_training_tess is set), tries the trained x-height fix, and
// flags the word as small caps when its x-height lies near the block's
// small-cap height and it has upper-case but no lower-case characters.
// NOTE(review): this listing is truncated - the "return;" below has lost its
// guarding condition, the small-caps "if" header and the per-character
// upper/lower tests are missing, and the braces do not balance - so consult
// control.cpp for the exact control flow.
// Return if we do not want to run Tesseract.
return;
bool done_this_pass = false;
check_debug_pt(word, 30);
if (!word->done || tessedit_training_tess) {
word->caps_height = 0.0;
// Fall back to the row's x-height when the word has none of its own.
if (word->x_height == 0.0f)
word->x_height = row->x_height();
match_word_pass2(word, row, block);
done_this_pass = TRUE;
check_debug_pt(word, 40);
}
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
bool accept_new_xht = false;
// Use the tops and bottoms since they are available.
accept_new_xht = TrainedXheightFix(word, block, row);
}
if (accept_new_xht)
done_this_pass = true;
// Test for small caps. Word capheight must be close to block xheight,
// and word must contain no lower case letters, and at least one upper case.
double small_cap_xheight = block->x_height() * kXHeightCapRatio;
double small_cap_delta = (block->x_height() - small_cap_xheight) / 2.0;
small_cap_xheight - small_cap_delta <= word->x_height &&
word->x_height <= small_cap_xheight + small_cap_delta) {
// Scan for upper/lower.
int num_upper = 0;
int num_lower = 0;
for (int i = 0; i < word->best_choice->length(); ++i) {
++num_upper;
++num_lower;
}
if (num_upper > 0 && num_lower == 0)
word->small_caps = true;
}
}
#ifndef GRAPHICS_DISABLED
// Graphics builds only: zoom fx_win to the rebuilt word's bounding box.
if (fx_win == NULL)
TBOX wbox = word->rebuild_word->bounding_box();
fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
wbox.right(), wbox.bottom());
}
#endif
check_debug_pt(word, 50);
}
void tesseract::Tesseract::Clear ( )

Definition at line 413 of file tesseractclass.cpp.

{
  // Drops the per-page state of this instance: destroys the page images,
  // resets both skew vectors to (1, 0), clears the splitter, marks the
  // scaled colour image as unset, and then does the same for every
  // sub-language.
  Pix** const page_images[] = {
    &pix_binary_, &cube_binary_, &pix_grey_, &scaled_color_
  };
  const unsigned int num_images = sizeof(page_images) / sizeof(page_images[0]);
  for (unsigned int img = 0; img < num_images; ++img) {
    pixDestroy(page_images[img]);
  }

  const FCOORD no_skew(1.0f, 0.0f);
  deskew_ = no_skew;
  reskew_ = no_skew;
  splitter_.Clear();
  scaled_factor_ = -1;

  int lang_index = 0;
  while (lang_index < sub_langs_.size()) {
    sub_langs_[lang_index]->Clear();
    ++lang_index;
  }
}
float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES word_res)

Definition at line 96 of file fixxht.cpp.

{
  // Estimates an x-height that would make the tops of the word's
  // alphanumeric blobs fit the top ranges stored in the unicharset.
  // Each misfitting blob votes for a range of candidate x-heights; the
  // median vote is converted to pixel space through word_res->denorm.
  // Returns 0.0f when no blob voted or when the estimate differs from
  // kBlnXHeight by less than x_ht_min_change.
  STATS top_stats(0, MAX_UINT8);
  TBLOB* blob = word_res->rebuild_word->blobs;
  int blob_id = 0;
  for (; blob != NULL; blob = blob->next, ++blob_id) {
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
      int top = blob->bounding_box().top();
      // Clip the top to the limit of normalized feature space.
      if (top >= INT_FEAT_RANGE)
        top = INT_FEAT_RANGE - 1;
      int bottom = blob->bounding_box().bottom();
      int min_bottom, max_bottom, min_top, max_top;
      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                                &min_top, &max_top);
      // Chars with a wild top range would mess up the result so ignore them.
      if (max_top - min_top > kMaxCharTopRange)
        continue;
      // Positive when the top lies outside the tolerated [min_top, max_top].
      int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
                            top - (max_top + x_ht_acceptance_tolerance));
      int height = top - kBlnBaselineOffset;
      if (debug_x_ht_level >= 20) {
        // The leading %s needs the class string; without it every following
        // argument is read against the wrong conversion.
        tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
                unicharset.id_to_unichar(class_id),
                height, min_bottom, max_bottom, min_top, max_top,
                bottom, top);
      }
      // Use only chars that fit in the expected bottom range, and where
      // the range of tops is sensibly near the xheight.
      if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
          bottom - x_ht_acceptance_tolerance <= max_bottom &&
          min_top > kBlnBaselineOffset &&
          max_top - kBlnBaselineOffset >= kBlnXHeight &&
          misfit_dist > 0) {
        // Compute the x-height position using proportionality between the
        // actual height and expected height.
        int min_xht = DivRounded(height * kBlnXHeight,
                                 max_top - kBlnBaselineOffset);
        int max_xht = DivRounded(height * kBlnXHeight,
                                 min_top - kBlnBaselineOffset);
        if (debug_x_ht_level >= 20) {
          tprintf(" xht range min=%d, max=%d\n",
                  min_xht, max_xht);
        }
        // The range of expected heights gets a vote equal to the distance
        // of the actual top from the expected top.
        for (int y = min_xht; y <= max_xht; ++y)
          top_stats.add(y, misfit_dist);
      } else if (debug_x_ht_level >= 20) {
        tprintf(" already OK\n");
      }
    }
  }
  // No votes at all: there is nothing to correct.
  if (top_stats.get_total() == 0)
    return 0.0f;
  // The new xheight is just the median vote, which is then scaled out
  // of BLN space back to pixel space to get the x-height in pixel space.
  float new_xht = top_stats.median();
  if (debug_x_ht_level >= 20) {
    tprintf("Median xht=%f\n", new_xht);
    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
            new_xht, new_xht / word_res->denorm.y_scale());
  }
  // The xheight must change by at least x_ht_min_change to be used.
  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
    return new_xht / word_res->denorm.y_scale();
  else
    return 0.0f;
}
void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES word_res)

Definition at line 666 of file docqual.cpp.

{
  // Rewrites two characters in the word's best choice: every "~" becomes
  // "-" and every "^" becomes " ". A rewritten position whose reject_map
  // entry is still accepted is marked via setrej_unlv_rej().
  const UNICHAR_ID dash_id = word_res->uch_set->unichar_to_id("-");
  const UNICHAR_ID space_id = word_res->uch_set->unichar_to_id(" ");
  const UNICHAR_ID tilde_id = word_res->uch_set->unichar_to_id("~");
  const UNICHAR_ID caret_id = word_res->uch_set->unichar_to_id("^");
  WERD_CHOICE* best = word_res->best_choice;
  bool modified = false;
  int pos = 0;
  while (pos < word_res->reject_map.length()) {
    // The two checks are deliberately independent: the second one looks at
    // the id as it stands after the first replacement.
    if (best->unichar_id(pos) == tilde_id) {
      best->set_unichar_id(dash_id, pos);
      modified = true;
      if (word_res->reject_map[pos].accepted()) {
        word_res->reject_map[pos].setrej_unlv_rej();
      }
    }
    if (best->unichar_id(pos) == caret_id) {
      best->set_unichar_id(space_id, pos);
      modified = true;
      if (word_res->reject_map[pos].accepted()) {
        word_res->reject_map[pos].setrej_unlv_rej();
      }
    }
    ++pos;
  }
}
bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

Definition at line 536 of file applybox.cpp.

{
for (int step = 0; *utf8 != '\0'; utf8 += step) {
const char* next_space = strchr(utf8, ' ');
if (next_space == NULL)
next_space = utf8 + strlen(utf8);
step = next_space - utf8;
UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
if (class_id == INVALID_UNICHAR_ID) {
return false;
}
while (utf8[step] == ' ')
++step;
class_ids->push_back(class_id);
}
return true;
}
void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES page_res)

Definition at line 764 of file applybox.cpp.

{
  // Replaces the best choice of every word on the page with a choice built
  // from the word's correct_text entries (one unichar per entry).
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
       word_res = pr_it.forward()) {
    WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
                                          word_res->correct_text.size());
    for (int i = 0; i < word_res->correct_text.size(); ++i) {
      // The part before the first space is the real ground truth, and the
      // rest is the bounding box location and page number.
      // A fresh vector per entry receives the space-separated pieces.
      GenericVector<STRING> tokens;
      word_res->correct_text[i].split(' ', &tokens);
      UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
      choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
    }
    // The word owns its best choice, so release the old one before swapping.
    if (word_res->best_choice != NULL)
      delete word_res->best_choice;
    word_res->best_choice = choice;
  }
}
inT16 tesseract::Tesseract::count_alphanums ( const WERD_CHOICE word)

Definition at line 474 of file output.cpp.

{
  // Returns how many unichars of `word` its unicharset classifies as
  // alphabetic or as a digit.
  int alphanum_total = 0;
  int pos = 0;
  while (pos < word.length()) {
    const UNICHAR_ID id = word.unichar_id(pos);
    const bool is_alphanum = word.unicharset()->get_isalpha(id) ||
                             word.unicharset()->get_isdigit(id);
    if (is_alphanum) {
      ++alphanum_total;
    }
    ++pos;
  }
  return alphanum_total;
}
inT16 tesseract::Tesseract::count_alphanums ( WERD_RES word)

Definition at line 737 of file reject.cpp.

{
  // Returns how many positions of the word are both accepted in its
  // reject_map and hold an alphabetic or digit unichar in the best choice.
  const WERD_CHOICE *choice = word_res->best_choice;
  int accepted_alphanums = 0;
  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
    // Rejected positions never count.
    if (!word_res->reject_map[pos].accepted())
      continue;
    const UNICHAR_ID id = choice->unichar_id(pos);
    if (word_res->uch_set->get_isalpha(id) ||
        word_res->uch_set->get_isdigit(id)) {
      ++accepted_alphanums;
    }
  }
  return accepted_alphanums;
}
inT16 tesseract::Tesseract::count_alphas ( const WERD_CHOICE word)

Definition at line 464 of file output.cpp.

{
  // Returns how many unichars of `word` its unicharset classifies as
  // alphabetic.
  int alpha_total = 0;
  int pos = 0;
  while (pos < word.length()) {
    if (word.unicharset()->get_isalpha(word.unichar_id(pos))) {
      ++alpha_total;
    }
    ++pos;
  }
  return alpha_total;
}
inT16 tesseract::Tesseract::count_outline_errs ( char  c,
inT16  outline_count 
)

Definition at line 131 of file docqual.cpp.

{
  // Returns the absolute difference between outline_count and the number of
  // outlines expected for character c: 2 when c is listed in outlines_2,
  // otherwise 1. Characters listed in outlines_odd are not judged at all and
  // always yield 0.
  if (STRING (outlines_odd).contains (c)) {
    return 0;
  }
  const int expected = STRING (outlines_2).contains (c) ? 2 : 1;
  return abs (outline_count - expected);
}
int tesseract::Tesseract::CountMisfitTops ( WERD_RES word_res)

Definition at line 64 of file fixxht.cpp.

{
  // Counts the alphanumeric blobs of the word whose top lies outside the
  // [min_top, max_top] range stored in the unicharset for their class,
  // widened by x_ht_acceptance_tolerance on both sides.
  int bad_blobs = 0;
  TBLOB* blob = word_res->rebuild_word->blobs;
  int blob_id = 0;
  for (; blob != NULL; blob = blob->next, ++blob_id) {
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
      int top = blob->bounding_box().top();
      // Keep the top below INT_FEAT_RANGE.
      if (top >= INT_FEAT_RANGE)
        top = INT_FEAT_RANGE - 1;
      int min_bottom, max_bottom, min_top, max_top;
      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                                &min_top, &max_top);
      // Classes whose top range is wider than kMaxCharTopRange are skipped.
      if (max_top - min_top > kMaxCharTopRange)
        continue;
      bool bad = top < min_top - x_ht_acceptance_tolerance ||
                 top > max_top + x_ht_acceptance_tolerance;
      if (bad)
        ++bad_blobs;
      if (debug_x_ht_level >= 1) {
        // The leading %s needs the class string; without it every following
        // argument is read against the wrong conversion.
        tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
                unicharset.id_to_unichar(class_id),
                bad ? "Misfit" : "OK", top, min_top, max_top,
                static_cast<int>(x_ht_acceptance_tolerance));
      }
    }
  }
  return bad_blobs;
}
bool tesseract::Tesseract::create_cube_box_word ( Boxa *  char_boxes,
int  num_chars,
TBOX  word_box,
BoxWord box_word 
)

Definition at line 116 of file cube_control.cpp.

{
if (!box_word) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (create_cube_box_word): Invalid box_word.\n");
}
return false;
}
// Find the x-coordinate of left-most char_box, which could be
// nonzero if the word image was padded before recognition took place.
int x_offset = -1;
for (int i = 0; i < num_chars; ++i) {
Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
if (x_offset < 0 || char_box->x < x_offset) {
x_offset = char_box->x;
}
boxDestroy(&char_box);
}
for (int i = 0; i < num_chars; ++i) {
Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
TBOX tbox = char_box_to_tbox(char_box, word_box, x_offset);
boxDestroy(&char_box);
box_word->InsertBox(i, tbox);
}
return true;
}
void tesseract::Tesseract::cube_combine_word ( CubeObject cube_obj,
WERD_RES cube_word,
WERD_RES tess_word 
)

Definition at line 323 of file cube_control.cpp.

{
// Decides between the tesseract and cube results for one word using the
// probability from tess_cube_combiner_: at or above the threshold the
// tesseract result is kept, otherwise tess_word consumes the cube result.
// NOTE(review): both tprintf blocks below are followed by a closing brace
// with no visible opening "if", so their debug guards appear to be missing
// from this listing - check cube_control.cpp.
float combiner_prob = tess_cube_combiner_->CombineResults(tess_word,
cube_obj);
// If combiner probability is greater than tess/cube combiner
// classifier threshold, i.e. tesseract wins, then just return the
// tesseract result unchanged, as the combiner knows nothing about how
// correct the answer is. If cube and tesseract agree, then improve the
// scores before returning.
WERD_CHOICE* tess_best = tess_word->best_choice;
WERD_CHOICE* cube_best = cube_word->best_choice;
tprintf("Combiner prob = %g vs threshold %g\n",
combiner_prob, cube_cntxt_->Params()->CombinerClassifierThresh());
}
if (combiner_prob >=
cube_cntxt_->Params()->CombinerClassifierThresh()) {
if (tess_best->unichar_string() == cube_best->unichar_string()) {
// Cube and tess agree, so improve the scores.
tess_best->set_rating(tess_best->rating() / 2);
tess_best->set_certainty(tess_best->certainty() / 2);
}
return;
}
// Cube wins.
// It is better for the language combiner to have all tesseract scores,
// so put them in the cube result.
cube_best->set_rating(tess_best->rating());
cube_best->set_certainty(tess_best->certainty());
tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n",
tess_best->unichar_string().string(),
cube_best->unichar_string().string());
}
// Hand the cube word's results over to tess_word.
tess_word->ConsumeWordResults(cube_word);
}
bool tesseract::Tesseract::cube_recognize ( CubeObject cube_obj,
BLOCK block,
WERD_RES word 
)

Definition at line 366 of file cube_control.cpp.

{
// Runs cube on the word held by cube_obj and, on success, copies cube's
// best result (string, certainty and character boxes) into `word`.
// Returns false when any step fails: word setup, recognition, state
// extraction, BoxWord creation or WERD_CHOICE creation.
// NOTE(review): the "Cube result" tprintf near the end is followed by a
// closing brace with no visible opening "if", so its debug guard appears to
// be missing from this listing - check cube_control.cpp.
if (!word->SetupForCubeRecognition(unicharset, this, block)) {
return false; // Graphics block.
}
// Run cube
WordAltList *cube_alt_list = cube_obj->RecognizeWord();
if (!cube_alt_list || cube_alt_list->AltCount() <= 0) {
if (cube_debug_level > 0) {
tprintf("Cube returned nothing for word at:");
word->word->bounding_box().print();
}
return false;
}
// Get cube's best result and its probability, mapped to tesseract's
// certainty range
char_32 *cube_best_32 = cube_alt_list->Alt(0);
double cube_prob = CubeUtils::Cost2Prob(cube_alt_list->AltCost(0));
float cube_certainty = convert_prob_to_tess_certainty(cube_prob);
string cube_best_str;
CubeUtils::UTF32ToUTF8(cube_best_32, &cube_best_str);
// Retrieve Cube's character bounding boxes and CharSamples,
// corresponding to the most recent call to RecognizeWord().
Boxa *char_boxes = NULL;
CharSamp **char_samples = NULL;;
int num_chars;
// NOTE(review): as written, this branch is only taken when the extraction
// fails AND cube_debug_level > 0; with debugging off a failed extraction
// falls through to the code below - confirm whether that is intended.
if (!extract_cube_state(cube_obj, &num_chars, &char_boxes, &char_samples)
&& cube_debug_level > 0) {
tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract "
"cube state.\n");
return false;
}
// Convert cube's character bounding boxes to a BoxWord.
BoxWord cube_box_word;
TBOX tess_word_box = word->word->bounding_box();
if (word->denorm.block() != NULL)
tess_word_box.rotate(word->denorm.block()->re_rotation());
bool box_word_success = create_cube_box_word(char_boxes, num_chars,
tess_word_box,
&cube_box_word);
boxaDestroy(&char_boxes);
// NOTE(review): char_samples is not released on this early return, while the
// success path deletes it further down - presumably a leak; confirm who owns
// the array.
if (!box_word_success) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
"create cube BoxWord\n");
}
return false;
}
// Create cube's best choice.
WERD_CHOICE* cube_werd_choice = create_werd_choice(
char_samples, num_chars, cube_best_str.c_str(), cube_certainty,
unicharset, cube_cntxt_->CharacterSet());
delete []char_samples;
if (!cube_werd_choice) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
"create cube WERD_CHOICE\n");
}
word->SetupFake(unicharset);
return false;
}
tprintf("Cube result: %s r=%g, c=%g\n",
cube_werd_choice->unichar_string().string(),
cube_werd_choice->rating(),
cube_werd_choice->certainty());
}
// Fill tesseract result's fields with cube results
fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), word);
return true;
}
CubeObject * tesseract::Tesseract::cube_recognize_word ( BLOCK block,
WERD_RES word 
)

Definition at line 286 of file cube_control.cpp.

{
  // Creates a CubeObject for the word's bounding box on cube_binary_ and
  // runs cube_recognize on it. Returns the new object on success; returns
  // NULL when the binary image or cube context is missing, when the block
  // is rotated, or when recognition fails.
  const bool cube_ready = cube_binary_ && cube_cntxt_;
  if (!cube_ready) {
    if (cube_debug_level > 0 && !cube_binary_) {
      tprintf("Tesseract::run_cube(): NULL binary image.\n");
    }
    return NULL;
  }

  TBOX word_box = word->word->bounding_box();

  // TODO(rays) We have to rotate the bounding box to get the true coords.
  // This will be achieved in the future via DENORM.
  // In the mean time, cube can't process this word.
  const bool block_is_rotated =
      block != NULL && (block->re_rotation().x() != 1.0f ||
                        block->re_rotation().y() != 0.0f);
  if (block_is_rotated) {
    if (cube_debug_level > 0) {
      tprintf("Cube can't process rotated word at:");
      word_box.print();
    }
    return NULL;
  }

  // The y argument is the image height minus the top of the word box.
  CubeObject* recognizer = new tesseract::CubeObject(
      cube_cntxt_, cube_binary_, word_box.left(),
      pixGetHeight(cube_binary_) - word_box.top(),
      word_box.width(), word_box.height());
  if (cube_recognize(recognizer, block, word)) {
    return recognizer;
  }
  delete recognizer;
  return NULL;
}
void tesseract::Tesseract::cube_word_pass1 ( BLOCK block,
ROW row,
WERD_RES word 
)

Definition at line 275 of file cube_control.cpp.

{
  // Runs cube on the word and immediately releases the CubeObject returned
  // by cube_recognize_word. `row` is not used here.
  delete cube_recognize_word(block, word);
}
void tesseract::Tesseract::debug_word ( PAGE_RES page_res,
const TBOX selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 636 of file pgedit.cpp.

{
  // Delegates to recog_all_words with no monitor, the selection box as the
  // target word box, word_config_ as the word config and 0 for dopasses.
  const char* selected_word_config = word_config_.string();
  recog_all_words(page_res, NULL, &selection_box, selected_word_config, 0);
}
BOOL8 tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES word,
int  char_position 
)

Definition at line 344 of file fixspace.cpp.

{
// Tests the character at index char_position of the word's best choice.
// NOTE(review): the return expression below is truncated in this listing
// (the calls that take these arguments are missing), so the exact predicate
// cannot be read from here; what remains shows a test against NUMBER_PERM -
// see fixspace.cpp for the full expression.
int i;
int offset;
// Empty-bodied loop: sums the lengths of the first char_position unichars
// so that `offset` indexes the start of the target one in unichar_string().
for (i = 0, offset = 0; i < char_position;
offset += word->best_choice->unichar_lengths()[i++]);
return (
word->best_choice->unichar_string().string() + offset,
word->best_choice->unichar_lengths()[i]) ||
(word->best_choice->permuter() == NUMBER_PERM &&
word->best_choice->unichar_string().string()[offset])));
}
void tesseract::Tesseract::do_re_display ( BOOL8(tesseract::Tesseract::*)(BLOCK *block, ROW *row, WERD_RES *word_res)  word_painter)

do_re_display()

Redisplay page

Definition at line 306 of file pgedit.cpp.

{
// Redraws the page: optionally shows the binary image, then calls the
// word_painter member function for every word and plots baselines and block
// outlines when the corresponding display flags are set.
// NOTE(review): pr_it is used below but never declared in this listing, so
// its setup (and possibly other lines) appears to be missing - check
// pgedit.cpp.
int block_count = 1;
if (display_image != 0) {
image_win->Image(pix_binary_, 0, 0);
}
for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
(this->*word_painter)(pr_it.block()->block, pr_it.row()->row, word);
// Baselines and blocks are drawn once, when the row/block changes.
if (display_baselines && pr_it.row() != pr_it.prev_row())
pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
if (display_blocks && pr_it.block() != pr_it.prev_block())
pr_it.block()->block->plot(image_win, block_count++, ScrollView::RED);
}
}
void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 
)

Definition at line 238 of file docqual.cpp.

{
// Rejects text at three granularities based on the ratio of rejected to total
// characters: the whole page, then each block, then each row in a block.
// NOTE(review): this listing is incomplete - several conditions end in a
// dangling operator and the braces do not balance, so lines were evidently
// lost in extraction (mostly the right-hand sides of threshold comparisons
// and the statements that perform the rejection). Read it alongside
// docqual.cpp before relying on any detail.
inT16 block_no = 0;
inT16 row_no = 0;
BLOCK_RES *current_block;
ROW_RES *current_row;
BOOL8 rej_word;
BOOL8 prev_word_rejected;
inT16 char_quality = 0;
inT16 accepted_char_quality;
// Page level: percentage of rejected characters on the page (the comparison
// that followed is missing from the listing).
if (page_res_it.page_res->rej_count * 100.0 /
reject_whole_page(page_res_it);
tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
page_res_it.page_res->char_count,
page_res_it.page_res->rej_count);
}
} else {
tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
page_res_it.page_res->char_count,
page_res_it.page_res->rej_count);
}
/* Walk blocks testing for block rejection */
page_res_it.restart_page();
WERD_RES* word;
while ((word = page_res_it.word()) != NULL) {
current_block = page_res_it.block();
block_no = current_block->block->index();
// Block level: percentage of rejected characters in this block (threshold
// missing from the listing). The char_count > 0 test avoids dividing by zero.
if (current_block->char_count > 0 &&
(current_block->rej_count * 100.0 / current_block->char_count) >
tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
block_no, current_block->char_count,
current_block->rej_count);
}
prev_word_rejected = FALSE;
// Visit every word of the block being rejected.
while ((word = page_res_it.word()) != NULL &&
(page_res_it.block() == current_block)) {
rej_word = word->reject_map.reject_count() > 0 ||
if (rej_word && tessedit_dont_blkrej_good_wds &&
*word->uch_set,
// A word is spared only if its char_quality equals its full length.
word_char_quality(word, page_res_it.row()->row,
&char_quality,
&accepted_char_quality);
rej_word = char_quality != word->reject_map.length();
}
} else {
rej_word = TRUE;
}
if (rej_word) {
/*
Reject spacing if both current and prev words are rejected.
NOTE - this is NOT restricted to FUZZY spaces. - When tried this
generated more space errors.
*/
prev_word_rejected &&
page_res_it.prev_row() == page_res_it.row() &&
word->word->space() == 1)
}
prev_word_rejected = rej_word;
page_res_it.forward();
}
} else {
tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
block_no, page_res_it.block()->char_count,
page_res_it.block()->rej_count);
}
/* Walk rows in block testing for row rejection */
row_no = 0;
while ((word = page_res_it.word()) != NULL &&
page_res_it.block() == current_block) {
current_row = page_res_it.row();
row_no++;
/* Reject whole row if:
fraction of chars on row which are rejected exceed a limit AND
fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
limit
*/
if (current_row->char_count > 0 &&
(current_row->rej_count * 100.0 / current_row->char_count) >
(current_row->whole_word_rej_count * 100.0 /
current_row->rej_count) <
tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
row_no, current_row->char_count,
current_row->rej_count);
}
prev_word_rejected = FALSE;
// Visit every word of the row being rejected.
while ((word = page_res_it.word()) != NULL &&
page_res_it.row () == current_row) {
/* Preserve words on good docs unless they are mostly rejected*/
if (!tessedit_row_rej_good_docs && good_quality_doc) {
rej_word = word->reject_map.reject_count() /
static_cast<float>(word->reject_map.length()) >
/* Preserve perfect words anyway */
rej_word = word->reject_map.reject_count() > 0 ||
if (rej_word && tessedit_dont_rowrej_good_wds &&
word_char_quality(word, page_res_it.row()->row,
&char_quality,
&accepted_char_quality);
rej_word = char_quality != word->reject_map.length();
}
} else {
rej_word = TRUE;
}
if (rej_word) {
/*
Reject spacing if both current and prev words are rejected.
NOTE - this is NOT restricted to FUZZY spaces. - When tried
this generated more space errors.
*/
prev_word_rejected &&
page_res_it.prev_row() == page_res_it.row() &&
word->word->space () == 1)
}
prev_word_rejected = rej_word;
page_res_it.forward();
}
} else {
tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
row_no, current_row->char_count, current_row->rej_count);
}
// Skip any words still left on the current row so the outer loop resumes at
// the start of the next row.
while (page_res_it.word() != NULL &&
page_res_it.row() == current_row)
page_res_it.forward();
}
}
}
}
}
}
void tesseract::Tesseract::dont_allow_1Il ( WERD_RES *  word)

Definition at line 705 of file reject.cpp.

{
  // If the only accepted alphanumeric characters in the word come from the
  // 1/I/l conflict set, reject those characters: with no other accepted
  // letter or digit there is nothing to disambiguate them against.
  const int num_chars = word->reject_map.length();
  const char *text = word->best_choice->unichar_string().string();
  const char *char_lens = word->best_choice->unichar_lengths().string();
  BOOL8 saw_accepted_1Il = FALSE;
  int idx = 0;
  int byte_pos = 0;

  // Pass 1: look for an accepted 1/I/l, bailing out as soon as any other
  // accepted letter or digit is found.
  while (idx < num_chars) {
    if (word->reject_map[idx].accepted()) {
      if (STRING(conflict_set_I_l_1).contains(text[byte_pos])) {
        saw_accepted_1Il = TRUE;
      } else if (word->uch_set->get_isalpha(text + byte_pos, char_lens[idx]) ||
                 word->uch_set->get_isdigit(text + byte_pos, char_lens[idx])) {
        return;  // >=1 non 1Il ch accepted
      }
    }
    // Characters may span several bytes; advance by this one's byte length.
    byte_pos += word->best_choice->unichar_lengths()[idx++];
  }
  if (!saw_accepted_1Il)
    return;  // Nothing to worry about

  // Pass 2: flag every accepted 1/I/l as rejected.
  idx = 0;
  byte_pos = 0;
  while (idx < num_chars) {
    if (STRING(conflict_set_I_l_1).contains(text[byte_pos]) &&
        word->reject_map[idx].accepted()) {
      word->reject_map[idx].setrej_postNN_1Il();
    }
    byte_pos += word->best_choice->unichar_lengths()[idx++];
  }
}
void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
inT16  score,
inT16  mode,
BOOL8  improved 
)

Definition at line 450 of file fixspace.cpp.

{
// Debug output for the spacing fixer. In mode 1 the words of perm are first
// concatenated into stats_.dump_words_str; the printing code then labels the
// list by mode (1 = EXTRACTED, 2 = TESTED, 3 = RETURNED), or prints a
// before/after "FIX SPACING" line when improved is set.
// NOTE(review): the braces do not balance and `} else if (improved)` has no
// matching `if` in this listing - enclosing conditions (presumably debug-level
// checks) appear to have been lost in extraction; verify against
// fixspace.cpp.
WERD_RES_IT word_res_it(&perm);
if (mode == 1) {
stats_.dump_words_str = "";
for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
word_res_it.forward()) {
// Words flagged part_of_combo are skipped throughout.
if (!word_res_it.data()->part_of_combo) {
stats_.dump_words_str +=
word_res_it.data()->best_choice->unichar_string();
stats_.dump_words_str += ' ';
}
}
}
#ifndef SECURE_NAMES
switch (mode) {
case 1:
tprintf("EXTRACTED (%d): \"", score);
break;
case 2:
tprintf("TESTED (%d): \"", score);
break;
case 3:
tprintf("RETURNED (%d): \"", score);
break;
}
// Print each word followed by its permuter code.
for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
word_res_it.forward()) {
if (!word_res_it.data()->part_of_combo) {
tprintf("%s/%1d ",
word_res_it.data()->best_choice->unichar_string().string(),
(int)word_res_it.data()->best_choice->permuter());
}
}
tprintf("\"\n");
} else if (improved) {
// Show the string saved in mode 1 next to the current word list.
tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
word_res_it.forward()) {
if (!word_res_it.data()->part_of_combo) {
tprintf("%s/%1d ",
word_res_it.data()->best_choice->unichar_string().string(),
(int)word_res_it.data()->best_choice->permuter());
}
}
tprintf("\"\n");
}
#endif
}
}
void tesseract::Tesseract::end_tesseract ( )

Definition at line 431 of file tessedit.cpp.

{
// NOTE(review): the body is empty in this listing. Other listings on this page
// lost lines during extraction, so a statement may be missing here too -
// verify against tessedit.cpp (line 431) before treating this as a no-op.
}
inT16 tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 240 of file fixspace.cpp.

{
// Scores a candidate word list for the spacing fixer. Returns PERFECT_WERDS
// when every counted word is "done", otherwise total_score, which sums the
// lengths of done words plus one point per joined '1' and joined punctuation
// character.
// NOTE(review): this listing is incomplete - the condition at the first `if`
// inside the else-branch has unbalanced parentheses and the braces near the
// prev_char_digit update do not balance, so lines were lost in extraction.
// Verify against fixspace.cpp.
WERD_RES_IT word_res_it(&word_res_list);
inT16 total_score = 0;
inT16 word_count = 0;
inT16 done_word_count = 0;
inT16 word_len;
inT16 i;
inT16 offset;
WERD_RES *word; // current word
inT16 prev_word_score = 0;
BOOL8 prev_word_done = FALSE;
BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"?
BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0
BOOL8 current_char_1 = FALSE;
BOOL8 current_word_ok_so_far;
STRING punct_chars = "!\"`',.:;";
BOOL8 prev_char_punct = FALSE;
BOOL8 current_char_punct = FALSE;
BOOL8 word_done = FALSE;
do {
word = word_res_it.data();
word_done = fixspace_thinks_word_done(word);
word_count++;
if (word->tess_failed) {
// A failed word contributes nothing itself: bank the previous word's pending
// score and reset the carried-over state.
total_score += prev_word_score;
if (prev_word_done)
done_word_count++;
prev_word_score = 0;
prev_char_1 = FALSE;
prev_char_digit = FALSE;
prev_word_done = FALSE;
} else {
/*
Can we add the prev word score and potentially count this word?
Yes IF it didnt end in a 1 when the first char of this word is a digit
AND it didnt end in a digit when the first char of this word is a 1
*/
word_len = word->reject_map.length();
current_word_ok_so_far = FALSE;
if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
(prev_char_digit && (
(word_done &&
word->best_choice->unichar_lengths().string()[0] == 1 &&
word->best_choice->unichar_string()[0] == '1') ||
word->best_choice->unichar_string()[0])))))) {
total_score += prev_word_score;
if (prev_word_done)
done_word_count++;
current_word_ok_so_far = word_done;
}
// The current word's score is held back in prev_word_score until the next
// word shows whether the two should have been joined.
if (current_word_ok_so_far) {
prev_word_done = TRUE;
prev_word_score = word_len;
} else {
prev_word_done = FALSE;
prev_word_score = 0;
}
/* Add 1 to total score for every joined 1 regardless of context and
rejtn */
for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
current_char_1 = word->best_choice->unichar_string()[i] == '1';
if (prev_char_1 || (current_char_1 && (i > 0)))
total_score++;
prev_char_1 = current_char_1;
}
/* Add 1 to total score for every joined punctuation regardless of context
and rejtn */
for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
offset += word->best_choice->unichar_lengths()[i++]) {
current_char_punct =
punct_chars.contains(word->best_choice->unichar_string()[offset]);
if (prev_char_punct || (current_char_punct && i > 0))
total_score++;
prev_char_punct = current_char_punct;
}
}
// Record how this word ends, for the join test against the next word.
prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
// Find the byte offset of the last character.
for (i = 0, offset = 0; i < word_len - 1;
offset += word->best_choice->unichar_lengths()[i++]);
// For a done word only a literal '1' counts; otherwise any character in the
// 1/I/l conflict set does.
prev_char_1 =
((word_done && (word->best_choice->unichar_string()[offset] == '1'))
|| (!word_done && STRING(conflict_set_I_l_1).contains(
word->best_choice->unichar_string()[offset])));
}
/* Find next word */
do {
word_res_it.forward();
} while (word_res_it.data()->part_of_combo);
} while (!word_res_it.at_first());
// Bank the final word's pending score.
total_score += prev_word_score;
if (prev_word_done)
done_word_count++;
if (done_word_count == word_count)
return PERFECT_WERDS;
else
return total_score;
}
void tesseract::Tesseract::ExplodeRepeatedWord ( BLOB_CHOICE *  best_choice,
PAGE_RES_IT *  page_res_it 
)

Definition at line 1252 of file control.cpp.

{
// Replaces the current word of page_res_it with one new single-blob word per
// blob of the original, classifying each new word with a copy of best_choice,
// and then deletes the original word. best_choice must not be NULL.
WERD_RES *word_res = page_res_it->word();
ASSERT_HOST(best_choice != NULL);
// Make a new word for each blob in the original.
WERD* werd = word_res->word;
C_BLOB_IT blob_it(werd->cblob_list());
// Each pass extracts the current blob from the list, so the loop ends when
// the list has been emptied.
for (; !blob_it.empty(); blob_it.forward()) {
// The first/last flags are read before extract() changes the list.
bool first_blob = blob_it.at_first();
bool last_blob = blob_it.at_last();
WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob,
blob_it.extract());
// Note that blamer_bundle (truth information) is not copied, which is
// desirable, since the newly inserted words would not have the original
// bounding box corresponding to the one recorded in truth fields.
WERD_RES* rep_word =
page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
// Setup the single char WERD_RES
if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(),
false,
page_res_it->row()->row,
page_res_it->block()->block)) {
rep_word->CloneChoppedToRebuild();
// Every new word gets its own heap copy of best_choice.
BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
rep_word->FakeClassifyWord(1, &blob_choice);
}
}
page_res_it->DeleteCurrentWord();
}
bool tesseract::Tesseract::extract_cube_state ( CubeObject *  cube_obj,
int *  num_chars,
Boxa **  char_boxes,
CharSamp ***  char_samples 
)

Definition at line 65 of file cube_control.cpp.

{
  // Pulls the per-character samples and bounding boxes out of a CubeObject
  // after recognition. Returns false (logging a warning when
  // cube_debug_level > 0) if the object, its search object or its beam
  // search object is missing, or if backtracking yields no samples.
  if (cube_obj == NULL) {
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (extract_cube_state): Invalid cube object "
              "passed to extract_cube_state\n");
    }
    return false;
  }

  // The CubeObject accessors hand back either the deslanted or the regular
  // search / beam search object, whichever the last Recognize() call used.
  CubeSearchObject* search_obj = cube_obj->SrchObj();
  if (search_obj == NULL) {
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
              "cube's search object in extract_cube_state.\n");
    }
    return false;
  }

  BeamSearch *beam = cube_obj->BeamObj();
  if (beam == NULL) {
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
              "cube's beam search object in extract_cube_state.\n");
    }
    return false;
  }

  // Backtrack along the best beam search path to recover the character
  // samples and their bounding boxes.
  const int best_node = beam->BestPresortedNodeIndex();
  *char_samples =
      beam->BackTrack(search_obj, best_node, num_chars, NULL, char_boxes);
  return *char_samples != NULL;
}
inT16 tesseract::Tesseract::failure_count ( WERD_RES *  word)

Definition at line 975 of file docqual.cpp.

{
  // Returns the number of space characters in the word's best choice string.
  int space_total = 0;
  const char *cursor = word->best_choice->unichar_string().string();
  while (*cursor != '\0') {
    if (*cursor == ' ')
      ++space_total;
    ++cursor;
  }
  return space_total;
}
void tesseract::Tesseract::fill_werd_res ( const BoxWord &  cube_box_word,
WERD_CHOICE *  cube_werd_choice,
const char *  cube_best_str,
WERD_RES *  tess_werd_res 
)

Definition at line 454 of file cube_control.cpp.

{
// Overwrites the recognition results in tess_werd_res with Cube's: best and
// raw choice, box word, text, acceptance flags and reject map.
// Replace tesseract results's best choice with cube's
// best_choice takes the cube_werd_choice pointer itself; raw_choice gets a
// separate heap copy.
tess_werd_res->best_choice = cube_werd_choice;
tess_werd_res->raw_choice = new WERD_CHOICE(*cube_werd_choice);
delete tess_werd_res->box_word;
tess_werd_res->box_word = new BoxWord(cube_box_word);
tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(),
tess_werd_res->word);
// Fill text and remaining fields
tess_werd_res->word->set_text(cube_best_str);
tess_werd_res->tess_failed = FALSE;
// NOTE(review): the right-hand side below is truncated in this listing - the
// call opener and its first argument(s) were lost in extraction; verify
// against cube_control.cpp.
tess_werd_res->tess_accepted =
tess_werd_res->raw_choice);
// There is no output word, so we can' call AdaptableWord, but then I don't
// think we need to. Fudge the result with accepted.
tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted;
// Initialize the reject_map and set it to done, i.e., ignore all of
// tesseract's tests for rejection
tess_werd_res->reject_map.initialise(cube_werd_choice->length());
tess_werd_res->done = tess_werd_res->tess_accepted;
// Some sanity checks
ASSERT_HOST(tess_werd_res->best_choice->length() ==
tess_werd_res->best_choice->blob_choices()->length());
ASSERT_HOST(tess_werd_res->best_choice->length() ==
tess_werd_res->reject_map.length());
}
bool tesseract::Tesseract::FindSegmentation ( const GenericVector< UNICHAR_ID > &  target_text,
WERD_RES *  word_res 
)

Definition at line 560 of file applybox.cpp.

{
  // Finds a segmentation of word_res's blobs that matches target_text
  // exactly, storing it in word_res->best_state and the matching text in
  // word_res->correct_text. Falls back to the original segmentation when the
  // search fails but the blob-group count equals the target length.
  // Returns false if neither yields a usable segmentation.
  //
  // NOTE(review): the extracted listing had lost the declaration of
  // `choices` and the opener of the correct_text.push_back call. Both are
  // restored here as forced by the surrounding code (choices[i].push_back of
  // a BLOB_CHOICE_LIST*, delete [] choices, the dangling STRING(...)); other
  // statements may also have been dropped - confirm against applybox.cpp.

  // Classify all required combinations of blobs and save results in choices.
  int word_length = word_res->box_word->length();
  // choices[i] holds the classifier results for groups that start at blob i;
  // its j-th entry covers blobs i .. i+j-1.
  GenericVector<BLOB_CHOICE_LIST*>* choices =
      new GenericVector<BLOB_CHOICE_LIST*>[word_length];
  for (int i = 0; i < word_length; ++i) {
    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
      BLOB_CHOICE_LIST* match_result = classify_piece(
          word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array,
          i, i + j - 1, word_res->blamer_bundle);
      if (applybox_debug > 2) {
        tprintf("%d+%d:", i, j);
        print_ratings_list("Segment:", match_result, unicharset);
      }
      choices[i].push_back(match_result);
    }
  }
  // Search the segmentation graph for the target text. Must be an exact
  // match. Using wildcards makes it difficult to find the correct
  // segmentation even when it is there.
  word_res->best_state.clear();
  GenericVector<int> search_segmentation;
  float best_rating = 0.0f;
  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
                &search_segmentation, &best_rating, &word_res->best_state);
  // The classifier results are no longer needed once the search is done.
  for (int i = 0; i < word_length; ++i)
    choices[i].delete_data_pointers();
  delete [] choices;
  if (word_res->best_state.empty()) {
    // Build the original segmentation and if it is the same length as the
    // truth, assume it will do.
    int blob_count = 1;
    for (int s = 0; s < array_count(word_res->seam_array); ++s) {
      SEAM* seam =
          reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s));
      // A seam with no split1 closes the current group of blobs.
      if (seam->split1 == NULL) {
        word_res->best_state.push_back(blob_count);
        blob_count = 1;
      } else {
        ++blob_count;
      }
    }
    word_res->best_state.push_back(blob_count);
    if (word_res->best_state.size() != target_text.size()) {
      word_res->best_state.clear();  // No good. Original segmentation bad size.
      return false;
    }
  }
  // Record the target text, one unichar string per character.
  word_res->correct_text.clear();
  for (int i = 0; i < target_text.size(); ++i) {
    word_res->correct_text.push_back(
        STRING(unicharset.id_to_unichar(target_text[i])));
  }
  return true;
}
inT16 tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 633 of file reject.cpp.

{
  // Returns the character index of the first letter or digit in word, or -1
  // if there is none. word_lengths gives each character's length in bytes.
  inT16 char_idx = 0;
  inT16 byte_pos = 0;
  while (word[byte_pos] != '\0') {
    if (unicharset.get_isalpha(word + byte_pos, word_lengths[char_idx]) ||
        unicharset.get_isdigit(word + byte_pos, word_lengths[char_idx])) {
      return char_idx;
    }
    byte_pos += word_lengths[char_idx++];
  }
  return -1;
}
inT16 tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 646 of file reject.cpp.

{
  // Returns the byte offset of the first letter or digit in word, or -1 if
  // there is none. word_lengths gives each character's length in bytes.
  inT16 char_idx = 0;
  inT16 byte_pos = 0;
  while (word[byte_pos] != '\0') {
    if (unicharset.get_isalpha(word + byte_pos, word_lengths[char_idx]) ||
        unicharset.get_isdigit(word + byte_pos, word_lengths[char_idx])) {
      return byte_pos;
    }
    byte_pos += word_lengths[char_idx++];
  }
  return -1;
}
void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW row,
BLOCK block 
)

Definition at line 146 of file fixspace.cpp.

{
  // Searches word permutations for one that scores better than best_perm
  // under eval_word_spacing, replacing best_perm with each improvement.
  // dump_words is called with mode 1 for the starting list, 2 for each
  // candidate tried and 3 for the final result.
  BOOL8 improved = FALSE;
  inT16 best_score = eval_word_spacing(best_perm);  // default score
  dump_words(best_perm, best_score, 1, improved);

  WERD_RES_LIST current_perm;
  if (best_score != PERFECT_WERDS)
    initialise_search(best_perm, current_perm);

  for (;;) {
    // Stop once the best list is perfect or there is nothing left to try.
    if (best_score == PERFECT_WERDS || current_perm.empty())
      break;
    match_current_words(current_perm, row, block);
    const inT16 current_score = eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS)
      transform_to_next_perm(current_perm);
  }
  dump_words(best_perm, best_score, 3, improved);
}
void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC *  monitor,
inT32  word_count,
PAGE_RES *  page_res 
)

Definition at line 49 of file fixspace.cpp.

{
// Walks every row of every block on the page. Words not followed by a fuzzy
// space are passed individually to fix_sp_fp_word; each run of words joined
// by fuzzy spaces (W_FUZZY_NON / W_FUZZY_SP) is cut out as a sublist, passed
// to fix_fuzzy_space_list and spliced back in. When monitor is non-NULL,
// progress is reported and the function returns early if the deadline has
// passed or the cancel callback says so.
BLOCK_RES_IT block_res_it;
ROW_RES_IT row_res_it;
WERD_RES_IT word_res_it_from;
WERD_RES_IT word_res_it_to;
WERD_RES *word_res;
WERD_RES_LIST fuzzy_space_words;
inT16 new_length;
BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds
inT32 word_index; // current word
block_res_it.set_to_list(&page_res->block_res_list);
word_index = 0;
for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
block_res_it.forward()) {
row_res_it.set_to_list(&block_res_it.data()->row_res_list);
for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
row_res_it.forward()) {
word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
while (!word_res_it_from.at_last()) {
word_res = word_res_it_from.data();
// Advance over words that are neither combinations nor followed by a fuzzy
// space, fixing each on its own.
while (!word_res_it_from.at_last() &&
!(word_res->combination ||
word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
block_res_it.data()->block);
word_res = word_res_it_from.forward();
word_index++;
if (monitor != NULL) {
monitor->ocr_alive = TRUE;
// Progress for this stage is reported in the range 90..95.
monitor->progress = 90 + 5 * word_index / word_count;
if (monitor->deadline_exceeded() ||
(monitor->cancel != NULL &&
(*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
return;
}
}
if (!word_res_it_from.at_last()) {
// word_res_it_from marks the start of a fuzzy run; word_res_it_to is moved
// forward to its end.
word_res_it_to = word_res_it_from;
prevent_null_wd_fixsp =
word_res->word->cblob_list()->empty();
if (check_debug_pt(word_res, 60))
debug_fix_space_level.set_value(10);
word_res_it_to.forward();
word_index++;
if (monitor != NULL) {
monitor->ocr_alive = TRUE;
monitor->progress = 90 + 5 * word_index / word_count;
if (monitor->deadline_exceeded() ||
(monitor->cancel != NULL &&
(*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
return;
}
while (!word_res_it_to.at_last () &&
(word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
if (check_debug_pt(word_res, 60))
debug_fix_space_level.set_value(10);
if (word_res->word->cblob_list()->empty())
prevent_null_wd_fixsp = TRUE;
word_res = word_res_it_to.forward();
}
if (check_debug_pt(word_res, 60))
debug_fix_space_level.set_value(10);
if (word_res->word->cblob_list()->empty())
prevent_null_wd_fixsp = TRUE;
// A run containing any word with no blobs is skipped rather than fixed.
if (prevent_null_wd_fixsp) {
word_res_it_from = word_res_it_to;
} else {
// Cut the run out, fix it, then splice the result back in and step
// word_res_it_from forward by up to new_length words.
fuzzy_space_words.assign_to_sublist(&word_res_it_from,
&word_res_it_to);
fix_fuzzy_space_list(fuzzy_space_words,
row_res_it.data()->row,
block_res_it.data()->block);
new_length = fuzzy_space_words.length();
word_res_it_from.add_list_before(&fuzzy_space_words);
for (;
!word_res_it_from.at_last() && new_length > 0;
new_length--) {
word_res_it_from.forward();
}
}
// Restore the debug level that check_debug_pt hits may have raised above.
if (test_pt)
debug_fix_space_level.set_value(0);
}
fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
block_res_it.data()->block);
// Last word in row
}
}
}
}
void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW row,
BLOCK block 
)

Definition at line 652 of file fixspace.cpp.

{
// Starting from a copy of the first word in best_perm, repeatedly breaks the
// word at its noisiest blob and re-scores the result with
// fp_eval_word_spacing, replacing best_perm whenever a candidate scores
// higher. dump_words is called with mode 1 for the starting list, 2 for each
// candidate and 3 for the final result.
inT16 best_score;
WERD_RES_IT best_perm_it(&best_perm);
WERD_RES_LIST current_perm;
WERD_RES_IT current_perm_it(&current_perm);
WERD_RES *old_word_res;
WERD_RES *new_word_res;
inT16 current_score;
BOOL8 improved = FALSE;
best_score = fp_eval_word_spacing(best_perm); // default score
dump_words(best_perm, best_score, 1, improved);
// Seed current_perm with a deep copy of the first word of best_perm.
new_word_res = new WERD_RES;
old_word_res = best_perm_it.data();
old_word_res->combination = TRUE; // Kludge to force deep copy
*new_word_res = *old_word_res; // deep copy
old_word_res->combination = FALSE; // Undo kludge
current_perm_it.add_to_end(new_word_res);
break_noisiest_blob_word(current_perm);
// Keep going until the best list is perfect or current_perm is empty.
while (best_score != PERFECT_WERDS && !current_perm.empty()) {
match_current_words(current_perm, row, block);
current_score = fp_eval_word_spacing(current_perm);
dump_words(current_perm, current_score, 2, improved);
if (current_score > best_score) {
best_perm.clear();
best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
best_score = current_score;
improved = TRUE;
}
if (current_score < PERFECT_WERDS) {
break_noisiest_blob_word(current_perm);
}
}
dump_words(best_perm, best_score, 3, improved);
}
void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT *  page_res_it)

fix_rep_char(): The word is a repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1200 of file control.cpp.

{
// Handles the current word as a run of one repeated character: picks the most
// frequent non-space character, finds the best classifier choice for it, then
// either explodes the word into one word per blob (when the gaps are wide) or
// corrects the existing classifications in place.
// NOTE(review): rep_ch is used below but never declared in this listing; its
// declaration appears to have been lost in extraction - verify against
// control.cpp.
WERD_RES *word_res = page_res_it->word();
const WERD_CHOICE &word = *(word_res->best_choice);
// Find the frequency of each unique character in the word.
UNICHAR_ID space = word_res->uch_set->unichar_to_id(" ");
for (int i = 0; i < word.length(); ++i) {
if (word.unichar_id(i) != space)
rep_ch.Add(word.unichar_id(i), 1);
}
// Find the most frequent result.
UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
int max_count = rep_ch.MaxCount(&maxch_id);
// Find the best exemplar of a classifier result for maxch_id.
BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
if (best_choice == NULL) {
tprintf("Failed to find a choice for %s, occurring %d times\n",
word_res->uch_set->debug_str(maxch_id).string(), max_count);
return;
}
word_res->done = TRUE;
// Measure the mean space.
int total_gap = 0;
int gap_count = 0;
WERD* werd = word_res->word;
C_BLOB_IT blob_it(werd->cblob_list());
C_BLOB* prev_blob = blob_it.data();
// A gap is the distance from the previous blob's right edge to this blob's
// left edge.
for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
C_BLOB* blob = blob_it.data();
int gap = blob->bounding_box().left();
gap -= prev_blob->bounding_box().right();
total_gap += gap;
++gap_count;
prev_blob = blob;
}
// Compares the mean gap with x_height * kRepcharGapThreshold, written
// without a division (so gap_count == 0 is safe).
if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) {
// Needs spaces between.
ExplodeRepeatedWord(best_choice, page_res_it);
} else {
// Just correct existing classification.
CorrectRepcharChoices(best_choice, word_res);
word_res->reject_map.initialise(word.length());
}
}
void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW row,
BLOCK block 
)

Definition at line 618 of file fixspace.cpp.

{
// Tries to fix the spacing of a single word: the word is extracted into a
// one-element sublist, processed by fix_noisy_space_list, and the resulting
// word(s) are spliced back in before the iterator position.
// NOTE(review): the tprintf below is missing its argument and the `}` after
// it has no opener in this listing - an enclosing condition (presumably a
// debug-level check) was lost in extraction; verify against fixspace.cpp.
WERD_RES *word_res;
WERD_RES_LIST sub_word_list;
WERD_RES_IT sub_word_list_it(&sub_word_list);
inT16 blob_index;
inT16 new_length;
float junk;
word_res = word_res_it.data();
// Leave alone: repeated-char words, combinations and their parts, and words
// without the W_DONT_CHOP flag.
if (word_res->word->flag(W_REP_CHAR) ||
word_res->combination ||
word_res->part_of_combo ||
!word_res->word->flag(W_DONT_CHOP))
return;
// A negative index means no noise blob was found, so there is nothing to do.
blob_index = worst_noise_blob(word_res, &junk);
if (blob_index < 0)
return;
tprintf("FP fixspace working on \"%s\"\n",
}
word_res->word->rej_cblob_list()->sort(c_blob_comparator);
sub_word_list_it.add_after_stay_put(word_res_it.extract());
fix_noisy_space_list(sub_word_list, row, block);
new_length = sub_word_list.length();
word_res_it.add_list_before(&sub_word_list);
// Step forward new_length - 1 times (stopping early at the last word).
for (; !word_res_it.at_last() && new_length > 1; new_length--) {
word_res_it.forward();
}
}
BOOL8 tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES *  word)

Definition at line 586 of file fixspace.cpp.

{
  // Decides whether the spacing fixer should treat the word as finished.
  if (word->done)
    return TRUE;

  /*
    Apply the standard pass 2 conditions for mode 5 of set_done() in
    reject.c, EXCEPT that ambiguous words are not rejected: for spacing
    purposes it does not matter whether we have of/at, on/an etc.
  */
  if (!(fixsp_done_mode > 0))
    return FALSE;

  // The word must be accepted in the sense selected by fixsp_done_mode.
  const bool accepted_enough =
      word->tess_accepted ||
      (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
      fixsp_done_mode == 3;
  if (!accepted_enough)
    return FALSE;

  // It must not contain any space character.
  if (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)
    return FALSE;

  // And it must come from one of the dictionary permuters or the number
  // permuter.
  if (word->best_choice->permuter() == SYSTEM_DAWG_PERM)
    return TRUE;
  if (word->best_choice->permuter() == FREQ_DAWG_PERM)
    return TRUE;
  if (word->best_choice->permuter() == USER_DAWG_PERM)
    return TRUE;
  if (word->best_choice->permuter() == NUMBER_PERM)
    return TRUE;
  return FALSE;
}
void tesseract::Tesseract::flip_0O ( WERD_RES *  word)

Definition at line 856 of file reject.cpp.

{
// Swaps '0' and 'O' in the best choice according to context: a character
// surrounded by upper-case letters becomes 'O', one surrounded by digits
// becomes '0'. The pattern each rule matches is given in the comment above it.
// NOTE(review): as listed, the bare `return;` after the declarations makes
// everything below it unreachable - a guarding condition was evidently lost
// in extraction. `modified` is set but never read here, so code at the end of
// the function may be missing too. The body also uses word_res, while the
// documented parameter is named word. Verify against reject.cpp.
WERD_CHOICE *best_choice = word_res->best_choice;
int i;
TBOX out_box;
return;
// First pass: give up if any upper-case letter or digit has a bounding box
// that does not look like a full-height character on the baseline.
TBLOB* blob = word_res->rebuild_word->blobs;
for (i = 0; i < best_choice->length() && blob != NULL; ++i,
blob = blob->next) {
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
out_box = blob->bounding_box();
if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
(out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
return; //Beware words with sub/superscripts
}
}
UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
if (unichar_0 == INVALID_UNICHAR_ID ||
!word_res->uch_set->get_enabled(unichar_0) ||
unichar_O == INVALID_UNICHAR_ID ||
!word_res->uch_set->get_enabled(unichar_O)) {
return; // 0 or O are not present/enabled in unicharset
}
bool modified = false;
// Starts at 1 because every rule looks at the preceding character.
for (i = 1; i < best_choice->length(); ++i) {
if (best_choice->unichar_id(i) == unichar_0 ||
best_choice->unichar_id(i) == unichar_O) {
/* A0A */
if ((i+1) < best_choice->length() &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
best_choice->set_unichar_id(unichar_O, i);
modified = true;
}
/* A00A */
if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
(i+1) < best_choice->length() &&
(best_choice->unichar_id(i+1) == unichar_0 ||
best_choice->unichar_id(i+1) == unichar_O) &&
(i+2) < best_choice->length() &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
best_choice->set_unichar_id(unichar_O, i);
modified = true;
i++;
}
/* AA0<non digit or end of word> */
if ((i > 1) &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
(((i+1) < best_choice->length() &&
!word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
!word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
!word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
(i == best_choice->length() - 1))) {
best_choice->set_unichar_id(unichar_O, i);
modified = true;
}
/* 9O9 */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
(i+1) < best_choice->length() &&
non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
best_choice->set_unichar_id(unichar_0, i);
modified = true;
}
/* 9OOO */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
(i+2) < best_choice->length() &&
(best_choice->unichar_id(i+1) == unichar_0 ||
best_choice->unichar_id(i+1) == unichar_O) &&
(best_choice->unichar_id(i+2) == unichar_0 ||
best_choice->unichar_id(i+2) == unichar_O)) {
best_choice->set_unichar_id(unichar_0, i);
best_choice->set_unichar_id(unichar_0, i+1);
best_choice->set_unichar_id(unichar_0, i+2);
modified = true;
i += 2;
}
/* 9OO<non upper> */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
(i+2) < best_choice->length() &&
(best_choice->unichar_id(i+1) == unichar_0 ||
best_choice->unichar_id(i+1) == unichar_O) &&
!word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
best_choice->set_unichar_id(unichar_0, i);
best_choice->set_unichar_id(unichar_0, i+1);
modified = true;
i++;
}
/* 9O<non upper> */
// NOTE(review): unlike the other rules, this one does not set modified.
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
(i+1) < best_choice->length() &&
!word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
best_choice->set_unichar_id(unichar_0, i);
}
/* 9[.,]OOO.. */
if ((i > 1) &&
(word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
(word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
best_choice->unichar_id(i-2) == unichar_O)) {
if (best_choice->unichar_id(i-2) == unichar_O) {
best_choice->set_unichar_id(unichar_0, i-2);
modified = true;
}
// Convert the whole following run of 0/O characters to '0'.
while (i < best_choice->length() &&
(best_choice->unichar_id(i) == unichar_O ||
best_choice->unichar_id(i) == unichar_0)) {
best_choice->set_unichar_id(unichar_0, i);
modified = true;
i++;
}
// Step back one so the enclosing for-loop's ++i lands on the first character
// after the run.
i--;
}
}
}
}
void tesseract::Tesseract::flip_hyphens ( WERD_RES *  word)

Definition at line 796 of file reject.cpp.

{
// Uses each blob's width/height aspect ratio to decide whether a '.' should
// really be a '-' (and whether an existing '-' is trustworthy), updating the
// best choice and the reject map accordingly.
// NOTE(review): as listed, the bare `return;` after the declarations makes
// everything below it unreachable - a guarding condition was evidently lost
// in extraction. `modified` is set but never read here, so trailing code may
// be missing too. The body also uses word_res, while the documented parameter
// is named word. Verify against reject.cpp.
WERD_CHOICE *best_choice = word_res->best_choice;
int i;
int prev_right = -9999;
int next_left;
TBOX out_box;
float aspect_ratio;
return;
TBLOB* blob = word_res->rebuild_word->blobs;
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
bool modified = false;
for (i = 0; i < best_choice->length() && blob != NULL; ++i,
blob = blob->next) {
out_box = blob->bounding_box();
// -9999 / 9999 act as "no neighbour" sentinels at the ends of the word.
if (blob->next == NULL)
next_left = 9999;
else
next_left = blob->next->bounding_box().left();
// Dont touch small or touching blobs - it is too dangerous.
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
aspect_ratio = out_box.width() / (float) out_box.height();
if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
if (aspect_ratio >= tessedit_upper_flip_hyphen &&
word_res->uch_set->contains_unichar_id(unichar_dash) &&
word_res->uch_set->get_enabled(unichar_dash)) {
/* Certain HYPHEN */
best_choice->set_unichar_id(unichar_dash, i);
modified = true;
if (word_res->reject_map[i].rejected())
word_res->reject_map[i].setrej_hyphen_accept();
}
if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
word_res->reject_map[i].accepted())
//Suspected HYPHEN
word_res->reject_map[i].setrej_hyphen ();
}
else if (best_choice->unichar_id(i) == unichar_dash) {
if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
(word_res->reject_map[i].rejected()))
word_res->reject_map[i].setrej_hyphen_accept();
//Certain HYPHEN
if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
(word_res->reject_map[i].accepted()))
//Suspected HYPHEN
word_res->reject_map[i].setrej_hyphen();
}
}
prev_right = out_box.right();
}
}
void tesseract::Tesseract::font_recognition_pass ( PAGE_RES page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 1590 of file control.cpp.

{
  // Smooths font assignments over the page: finds the document's modal font
  // and assigns it to every word whose own font evidence is weak.
  PAGE_RES_IT word_iter(page_res);
  WERD_RES *cur_word;
  STATS font_votes(0, font_table_size_);  // font counters

  // Step 1: accumulate the first and second font choices of every word.
  word_iter.restart_page();
  while ((cur_word = word_iter.word()) != NULL) {
    if (cur_word->fontinfo != NULL) {
      font_votes.add(cur_word->fontinfo->universal_id,
                     cur_word->fontinfo_id_count);
    }
    if (cur_word->fontinfo2 != NULL) {
      font_votes.add(cur_word->fontinfo2->universal_id,
                     cur_word->fontinfo_id2_count);
    }
    word_iter.forward();
  }

  inT16 doc_font;       // modal font
  inT8 doc_font_count;  // modal font
  find_modal_font(&font_votes, &doc_font, &doc_font_count);
  if (doc_font_count == 0)
    return;

  // Step 2: find a FontInfo pointer for the modal font id by locating the
  // first word that refers to it.
  const FontInfo* modal_font = NULL;
  word_iter.restart_page();
  while ((cur_word = word_iter.word()) != NULL) {
    if (cur_word->fontinfo != NULL &&
        cur_word->fontinfo->universal_id == doc_font) {
      modal_font = cur_word->fontinfo;
      break;
    }
    if (cur_word->fontinfo2 != NULL &&
        cur_word->fontinfo2->universal_id == doc_font) {
      modal_font = cur_word->fontinfo2;
      break;
    }
    word_iter.forward();
  }
  ASSERT_HOST(modal_font != NULL);

  // Step 3: give the modal font to weak words.
  word_iter.restart_page();
  while ((cur_word = word_iter.word()) != NULL) {
    const int length = cur_word->best_choice->length();
    // 1st choices got 2 pts, so halve the score to compare with the length.
    const int count = (cur_word->fontinfo_id_count + 1) / 2;
    // A word keeps its own font only when every character voted for it, or
    // at least 3/4 of them did in a word longer than 3 characters.
    const bool is_weak =
        count != length && (length <= 3 || count < length * 3 / 4);
    if (is_weak) {
      cur_word->fontinfo = modal_font;
      // Counts only get 1 as it came from the doc.
      cur_word->fontinfo_id_count = 1;
      cur_word->italic = modal_font->is_italic() ? 1 : -1;
      cur_word->bold = modal_font->is_bold() ? 1 : -1;
    }
    word_iter.forward();
  }
}
inT16 tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 914 of file fixspace.cpp.

{
  // Scores a list of words for the fixed-pitch spacing evaluation.
  //
  // Only words that are already done, accepted by Tesseract, produced by a
  // system/frequent/user dawg permuter, or judged a safe dictionary word
  // contribute. Within such a word every character that is a space or whose
  // blob noise score is below the small-outline limit costs one point, and
  // every other character that is accepted in the reject map earns one.
  //
  // Returns the accumulated score, clamped so it is never negative.
  const float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  inT16 score = 0;

  WERD_RES_IT word_it(&word_res_list);
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD_RES *word = word_it.data();
    if (word->rebuild_word == NULL)
      continue;  // Can't handle cube words.

    // The previously computed-but-unused reject map length has been dropped.
    const bool trusted =
        word->done ||
        word->tess_accepted ||
        word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        safe_dict_word(word) > 0;
    if (!trusted)
      continue;

    const UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
    TBLOB *blob = word->rebuild_word->blobs;
    // Walk characters and blobs in lock step, stopping at whichever runs
    // out first.
    for (inT16 i = 0; i < word->best_choice->length() && blob != NULL;
         ++i, blob = blob->next) {
      if (word->best_choice->unichar_id(i) == space ||
          blob_noise_score(blob) < small_limit) {
        score -= 1;  // penalise possibly erroneous non-space
      } else if (word->reject_map[i].accepted()) {
        score++;
      }
    }
  }
  return score < 0 ? 0 : score;
}
GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES *  word,
BOOL8  ok_dict_word 
)

Definition at line 689 of file docqual.cpp.

{
// Grades how garbage-like the best choice of `word` is and returns a
// GARBAGE_LEVEL (G_OK, G_DODGY or G_TERRIBLE are the values returned by the
// code visible here). The string is scanned one unichar at a time with a small
// state machine that tracks runs of upper case, lower case and digits, and
// counts isolated alphas/digits, repeated letters, spaces and other characters.
// `ok_dict_word` lets the caller force acceptance of multi-character words
// that contain no space.
//
// States of the scanner: JUNK is the initial state and the state after any
// non-alphanumeric; FIRST_* is entered by the first character of a run and
// SUBSEQUENT_* by the following characters of the same run.
enum STATES
{
JUNK,
FIRST_UPPER,
FIRST_LOWER,
FIRST_NUM,
SUBSEQUENT_UPPER,
SUBSEQUENT_LOWER,
SUBSEQUENT_NUM
};
// `str` holds the characters and `lengths` the byte length of each unichar;
// the two are advanced together by the loop below.
const char *str = word->best_choice->unichar_string().string();
const char *lengths = word->best_choice->unichar_lengths().string();
STATES state = JUNK;
int len = 0;
int isolated_digits = 0;
int isolated_alphas = 0;
int bad_char_count = 0;
int tess_rejs = 0;
int dodgy_chars = 0;
int ok_chars;
UNICHAR_ID last_char = -1;
int alpha_repetition_count = 0;
int longest_alpha_repetition_count = 0;
int longest_lower_run_len = 0;
int lower_string_count = 0;
int longest_upper_run_len = 0;
int upper_string_count = 0;
int total_alpha_count = 0;
int total_digit_count = 0;
// One iteration per unichar: `len` ends up as the number of unichars.
for (; *str != '\0'; str += *(lengths++)) {
len++;
if (word->uch_set->get_isupper (str, *lengths)) {
total_alpha_count++;
switch (state) {
case SUBSEQUENT_UPPER:
case FIRST_UPPER:
// Continuing an upper-case run: extend it and track letter repetition.
state = SUBSEQUENT_UPPER;
upper_string_count++;
if (longest_upper_run_len < upper_string_count)
longest_upper_run_len = upper_string_count;
if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
alpha_repetition_count++;
if (longest_alpha_repetition_count < alpha_repetition_count) {
longest_alpha_repetition_count = alpha_repetition_count;
}
}
else {
last_char = word->uch_set->unichar_to_id(str, *lengths);
alpha_repetition_count = 1;
}
break;
case FIRST_NUM:
// A single digit followed by a letter counts as an isolated digit.
isolated_digits++;
// Falls through: a new upper-case run starts here.
default:
state = FIRST_UPPER;
last_char = word->uch_set->unichar_to_id(str, *lengths);
alpha_repetition_count = 1;
upper_string_count = 1;
break;
}
}
else if (word->uch_set->get_islower (str, *lengths)) {
total_alpha_count++;
switch (state) {
case SUBSEQUENT_LOWER:
case FIRST_LOWER:
// Continuing a lower-case run: extend it and track letter repetition.
state = SUBSEQUENT_LOWER;
lower_string_count++;
if (longest_lower_run_len < lower_string_count)
longest_lower_run_len = lower_string_count;
if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
alpha_repetition_count++;
if (longest_alpha_repetition_count < alpha_repetition_count) {
longest_alpha_repetition_count = alpha_repetition_count;
}
}
else {
last_char = word->uch_set->unichar_to_id(str, *lengths);
alpha_repetition_count = 1;
}
break;
case FIRST_NUM:
isolated_digits++;
// Falls through: a new lower-case run starts here.
default:
state = FIRST_LOWER;
last_char = word->uch_set->unichar_to_id(str, *lengths);
alpha_repetition_count = 1;
lower_string_count = 1;
break;
}
}
else if (word->uch_set->get_isdigit (str, *lengths)) {
total_digit_count++;
switch (state) {
case FIRST_NUM:
state = SUBSEQUENT_NUM;
// Falls through to the shared break.
case SUBSEQUENT_NUM:
break;
case FIRST_UPPER:
case FIRST_LOWER:
// A single letter followed by a digit counts as an isolated alpha.
isolated_alphas++;
// Falls through: a new digit run starts here.
default:
state = FIRST_NUM;
break;
}
}
else {
// Neither letter nor digit: a single-byte space is tallied in tess_rejs
// (presumably the marker for a rejected character - confirm), anything else
// is a bad character.
if (*lengths == 1 && *str == ' ')
tess_rejs++;
else
bad_char_count++;
switch (state) {
case FIRST_NUM:
isolated_digits++;
break;
case FIRST_UPPER:
case FIRST_LOWER:
isolated_alphas++;
// Falls through.
default:
break;
}
state = JUNK;
}
}
// Close the final run: a trailing single digit or letter is isolated too.
switch (state) {
case FIRST_NUM:
isolated_digits++;
break;
case FIRST_UPPER:
case FIRST_LOWER:
isolated_alphas++;
// Falls through.
default:
break;
}
// NOTE(review): the closing brace after the next statement has no visible
// opening statement, and the `if` that follows contains a dangling comparison
// with no statement body. Lines appear to be missing from this listing between
// here and the G_OK test below - confirm against docqual.cpp.
total_alpha_count += total_digit_count - isolated_digits;
}
if (crunch_leave_ok_strings && len >= 4 &&
2 * (total_alpha_count - isolated_alphas) > len &&
longest_alpha_repetition_count < crunch_long_repetitions) {
acceptable_word_string(*word->uch_set, str, lengths) !=
longest_lower_run_len > crunch_leave_lc_strings ||
longest_upper_run_len > crunch_leave_uc_strings)
}
// NOTE(review): the scan loop above advanced `str` to the terminating '\0'
// (and `lengths` alongside it), so from here on `str` is an empty string:
// strpbrk() cannot find a space and acceptable_word_string() is handed the
// end of the string - confirm whether the start of the string was intended.
// Multi-character words are accepted outright when they were produced by a
// dawg or number permuter, look like an acceptable word, or the caller vouches
// for them via ok_dict_word.
if (word->reject_map.length() > 1 &&
strpbrk(str, " ") == NULL &&
(word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
word->best_choice->permuter() == USER_DAWG_PERM ||
word->best_choice->permuter() == NUMBER_PERM ||
acceptable_word_string(*word->uch_set, str, lengths) !=
AC_UNACCEPTABLE || ok_dict_word))
return G_OK;
// Characters that are neither bad, isolated nor spaces.
ok_chars = len - bad_char_count - isolated_digits -
isolated_alphas - tess_rejs;
if (crunch_debug > 3) {
// NOTE(review): the first tprintf call below is unterminated in this listing;
// its argument line appears to be missing.
tprintf("garbage_word: \"%s\"\n",
tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
len,
bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
}
// Clean word: no bad characters or spaces, and either very short or not made
// up entirely of isolated characters.
if (bad_char_count == 0 &&
tess_rejs == 0 &&
(len > isolated_digits + isolated_alphas || len <= 2))
return G_OK;
// Terrible: spaces outnumber the good characters, or spaces plus bad
// characters make up more than half of the word.
if (tess_rejs > ok_chars ||
(tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
return G_TERRIBLE;
if (len > 4) {
// Longer words: spaces weigh double and isolated characters count as dodgy.
dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
isolated_alphas;
if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
return G_DODGY;
else
return G_OK;
} else {
// Short words (up to 4 unichars): isolated characters are not counted.
dodgy_chars = 2 * tess_rejs + bad_char_count;
if ((len == 4 && dodgy_chars > 2) ||
(len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
return G_DODGY;
else
return G_OK;
}
}
UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES *  word)

Definition at line 349 of file output.cpp.

{
  // Determines which character a repeated-character word consists of: the
  // first character that is not rejected in the reject map. When every
  // character is rejected, the id of the unrecognised-character string is
  // returned instead.
  int first_kept = 0;
  while (first_kept < word->reject_map.length() &&
         word->reject_map[first_kept].rejected()) {
    ++first_kept;
  }
  if (first_kept >= word->reject_map.length())
    return word->uch_set->unichar_to_id(unrecognised_char.string());
  return word->best_choice->unichar_id(first_kept);
}
Tesseract* tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 219 of file tesseractclass.h.

{
  // Hands back the secondary-language engine stored at position `index` of
  // sub_langs_. This accessor performs no range check of its own.
  Tesseract* const sub_lang = sub_langs_[index];
  return sub_lang;
}
CubeRecoContext* tesseract::Tesseract::GetCubeRecoContext ( )
inline

Definition at line 914 of file tesseractclass.h.

{
  // Exposes whatever Cube recognition context pointer this instance
  // currently holds.
  CubeRecoContext* const context = cube_cntxt_;
  return context;
}
int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 193 of file tesseractclass.h.

{
  // The image height is taken from the binary image.
  const int height = pixGetHeight(pix_binary_);
  return height;
}
int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 190 of file tesseractclass.h.

{
  // The image width is taken from the binary image.
  const int width = pixGetWidth(pix_binary_);
  return width;
}
bool tesseract::Tesseract::init_cube_objects ( bool  load_combiner,
TessdataManager tessdata_manager 
)

Definition at line 202 of file cube_control.cpp.

{
// Sets up the Cube recognition context and, when load_combiner is true, the
// Tesseract/Cube combiner. Returns true on success and false on failure.
// Neither object may exist yet when this is called.
ASSERT_HOST(cube_cntxt_ == NULL);
ASSERT_HOST(tess_cube_combiner_ == NULL);
// Create the cube context object
// NOTE(review): in this listing nothing assigns cube_cntxt_ between the
// assertion above and the check below, so the failure branch would always be
// taken (and tessdata_manager is never used). The statement that creates the
// context appears to be missing from the listing - confirm against
// cube_control.cpp.
if (cube_cntxt_ == NULL) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (Tesseract::init_cube_objects()): Failed to "
"instantiate CubeRecoContext\n");
}
return false;
}
// Create the combiner object and load the combiner net for target languages.
if (load_combiner) {
tess_cube_combiner_ = new tesseract::TesseractCubeCombiner(cube_cntxt_);
if (!tess_cube_combiner_ || !tess_cube_combiner_->LoadCombinerNet()) {
// On failure release both the context and any combiner that was built, so
// the instance is left holding neither.
delete cube_cntxt_;
cube_cntxt_ = NULL;
if (tess_cube_combiner_ != NULL) {
delete tess_cube_combiner_;
tess_cube_combiner_ = NULL;
}
tprintf("Cube ERROR (Failed to instantiate TesseractCubeCombiner\n");
return false;
}
}
return true;
}
FILE * tesseract::Tesseract::init_recog_training ( const STRING fname)

Definition at line 37 of file recogtraining.cpp.

{
// Adjusts recognition parameters for a training run and opens the output
// text file whose name is derived from `fname`; the opened FILE* is returned.
// NOTE(review): the closing brace after the parameter settings has no visible
// opening statement, so a guarding condition (and possibly a statement after
// the "Explore all segmentations" comment) appears to be missing from this
// listing - confirm against recogtraining.cpp.
tessedit_tess_adaption_mode.set_value(0); // turn off adaption
tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
save_blob_choices.set_value(1); // save individual char choices
getDict().save_raw_choices.set_value(1); // save raw choices
getDict().permute_only_top.set_value(true); // use only top choice permuter
tessedit_ok_mode.set_value(0); // turn off context checking
// Explore all segmentations.
}
// Build the output name: cut the name at its last '.', if any, and append
// ".txt".
STRING output_fname = fname;
const char *lastdot = strrchr(output_fname.string(), '.');
if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
output_fname += ".txt";
// The file is requested with mode "a+".
FILE *output_file = open_file(output_fname.string(), "a+");
return output_file;
}
int tesseract::Tesseract::init_tesseract ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 270 of file tessedit.cpp.

{
  // Loads every language named in `language`. The first language that
  // initialises successfully becomes the primary one and lives in this
  // object; every later success is stored as a new engine in sub_langs_.
  // Languages requested by an already loaded language (via its
  // tessedit_load_sublangs setting) are appended to the work list on the fly.
  // Returns 0 when a primary language was loaded and -1 otherwise.
  GenericVector<STRING> langs_to_load;
  GenericVector<STRING> langs_not_to_load;
  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);

  // Throw away any engines left over from an earlier initialisation.
  sub_langs_.delete_data_pointers();
  sub_langs_.clear();

  bool loaded_primary = false;
  // langs_to_load may grow while it is being walked, so the size is
  // re-evaluated on every iteration.
  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
    if (IsStrInList(langs_to_load[lang_index], langs_not_to_load))
      continue;  // explicitly excluded

    const char *lang_str = langs_to_load[lang_index].string();
    const bool as_primary = !loaded_primary;
    // The primary language is initialised in place, the others in a fresh
    // engine.
    Tesseract *tess_to_init = as_primary ? this : new Tesseract;

    const int result = tess_to_init->init_tesseract_internal(
        arg0, textbase, lang_str, oem, configs, configs_size,
        vars_vec, vars_values, set_only_non_debug_params);

    if (result < 0) {
      tprintf("Failed loading language '%s'\n", lang_str);
      if (!as_primary)
        delete tess_to_init;  // never delete `this`
      continue;
    }

    if (as_primary) {
      tprintf("Loaded language '%s' as main language\n", lang_str);
      loaded_primary = true;
    } else {
      tprintf("Loaded language '%s' as secondary language\n", lang_str);
      sub_langs_.push_back(tess_to_init);
    }
    // Add any languages that this language requires
    ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
                        &langs_to_load, &langs_not_to_load);
  }

  if (loaded_primary)
    return 0;
  tprintf("Tesseract couldn't load any languages!\n");
  return -1;  // Couldn't load any language!
}
int tesseract::Tesseract::init_tesseract ( const char *  datapath,
const char *  language,
OcrEngineMode  oem 
)
inline

Definition at line 352 of file tesseractclass.h.

{
  // Convenience overload: forwards to the full init_tesseract with no
  // textbase, no config files, no variable overrides and the
  // "only init params" flag switched off.
  const int status = init_tesseract(datapath, NULL, language, oem,
                                    NULL, 0, NULL, NULL, false);
  return status;
}
int tesseract::Tesseract::init_tesseract_internal ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 349 of file tessedit.cpp.

{
// Initialises this instance for one language: first the language data, then
// (via program_editup) the classifier and dictionary. Returns -1 when the
// language data cannot be loaded and 0 otherwise.
if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
configs_size, vars_vec, vars_values,
set_only_non_debug_params)) {
return -1;
}
// NOTE(review): the unconditional `return 0;` and closing brace below would
// end the function here; the condition that guarded this early return, and
// the initialisers of the two bools further down, appear to be missing from
// this listing - confirm against tessedit.cpp.
return 0;
}
// If only Cube will be used, skip loading Tesseract classifier's
// pre-trained templates.
bool init_tesseract_classifier =
// If only Cube will be used and if it has its own Unicharset,
// skip initializing permuter and loading Tesseract Dawgs.
bool init_dict =
program_editup(textbase, init_tesseract_classifier, init_dict);
return 0; //Normal exit
}
bool tesseract::Tesseract::init_tesseract_lang_data ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 98 of file tessedit.cpp.

{
// Loads the data and parameters for one language: sets up paths, initialises
// the tessdata manager, applies parameters from config files and from
// vars_vec/vars_values, optionally dumps all parameters to a file, selects the
// OCR engine mode and loads the unicharset (and Cube objects). Returns true on
// success and false on failure.
// NOTE(review): this listing is incomplete - several statements are cut off
// mid-expression and a number of closing braces have no visible opening
// statement. The comments below describe only what is visible; confirm the
// details against tessedit.cpp.
// Set the basename, compute the data directory.
main_setup(arg0, textbase);
// Set the language data path prefix
// A NULL language selects "eng".
lang = language != NULL ? language : "eng";
// Initialize TessdataManager.
STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
if (!tessdata_manager.Init(tessdata_path.string(),
return false;
}
// If a language specific config file (lang.config) exists, load it in.
tprintf("Loaded language config file\n");
}
}
SetParamConstraint set_params_constraint = set_only_non_debug_params ?
// Load tesseract variables from config files. This is done after loading
// language-specific variables from [lang].traineddata file, so that custom
// config files can override values in [lang].traineddata file.
for (int i = 0; i < configs_size; ++i) {
read_config_file(configs[i], set_params_constraint);
}
// Set params specified in vars_vec (done after setting params from config
// files, so that params in vars_vec can override those from files).
if (vars_vec != NULL && vars_values != NULL) {
// vars_vec[i] is the parameter name and vars_values[i] its value; the loop
// is bounded by vars_vec->size() only.
for (int i = 0; i < vars_vec->size(); ++i) {
if (!ParamUtils::SetParam((*vars_vec)[i].string(),
(*vars_values)[i].string(),
set_params_constraint, this->params())) {
// A parameter that cannot be set terminates the whole process.
tprintf("Error setting param %s\n", (*vars_vec)[i].string());
exit(1);
}
}
}
// When tessedit_write_params_to_file is non-empty, dump all parameters to
// that file.
if (((STRING &)tessedit_write_params_to_file).length() > 0) {
FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
if (params_file != NULL) {
ParamUtils::PrintParams(params_file, this->params());
fclose(params_file);
tprintf("Wrote parameters to %s\n",
tessedit_write_params_to_file.string());
}
// NOTE(review): as listed, this `else` pairs with the outer length check
// rather than with the fopen() result, and the brace after it is unbalanced.
} else {
tprintf("Failed to open %s for writing params.\n",
tessedit_write_params_to_file.string());
}
}
// Determine which ocr engine(s) should be loaded and used for recognition.
// An explicit engine mode from the caller overrides the configured one.
if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
static_cast<int>(tessedit_ocr_engine_mode));
}
// If we are only loading the config file (and so not planning on doing any
// recognition) then there's nothing else do here.
tprintf("Returning after loading config file\n");
}
return true;
}
// Load the unicharset
return false;
}
tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
return false;
}
if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
// Cache the dominant text direction reported by the unicharset.
right_to_left_ = unicharset.major_right_to_left();
if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
}
// Load Cube objects if necessary.
tprintf("Loaded Cube w/out combiner\n");
tprintf("Loaded Cube with combiner\n");
}
return true;
}
int tesseract::Tesseract::init_tesseract_lm ( const char *  arg0,
const char *  textbase,
const char *  language 
)

Definition at line 420 of file tessedit.cpp.

{
  // Loads only the language data, forcing the Tesseract-only engine mode and
  // passing no config files or variable overrides.
  // Returns 0 on success and -1 when the language data cannot be loaded.
  const bool loaded = init_tesseract_lang_data(arg0, textbase, language,
                                               OEM_TESSERACT_ONLY,
                                               NULL, 0, NULL, NULL, false);
  return loaded ? 0 : -1;
}
void tesseract::Tesseract::make_reject_map ( WERD_RES word,
BLOB_CHOICE_LIST_CLIST *  blob_choices,
ROW row,
inT16  pass 
)
void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW row,
BLOCK block 
)

Definition at line 197 of file fixspace.cpp.

{
// Walks every word in `words` and processes those that are not part of a
// combination and have no box_word yet.
WERD_RES_IT word_it(&words);
WERD_RES *word;
// Since we are not using PAGE_RES to iterate over words, we need to update
// prev_word_best_choice_ before calling classify_word_pass2().
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
word = word_it.data();
if ((!word->part_of_combo) && (word->box_word == NULL)) {
// NOTE(review): only the tail of the statement is visible here; the start of
// the call that takes (block, row, word) appears to be missing from this
// listing - confirm against fixspace.cpp.
block, row, word);
}
}
}
void tesseract::Tesseract::match_word_pass2 ( WERD_RES word,
ROW row,
BLOCK block 
)

match_word_pass2

Baseline normalize the word and pass it to Tess.

Definition at line 1098 of file control.cpp.

{
// Runs the pass 2 segmentation on `word`, repairs quotes and hyphens in the
// result, builds the reject map and finally stores the blob choices and the
// fonts on the word.
// NOTE(review): several statements are only partly visible in this listing
// (the dangling `row, block))` and `word->raw_choice);` lines, and the string
// argument of the diagnostic tprintf) - confirm against control.cpp.
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
row, block))
tess_segment_pass2(word, blob_choices);
if (!word->tess_failed) {
// Words flagged W_REP_CHAR skip the quote/hyphen fixing and the reject map.
if (!word->word->flag (W_REP_CHAR)) {
word->fix_quotes(blob_choices);
word->fix_hyphens(blob_choices);
/* Dont trust fix_quotes! - though I think I've fixed the bug */
// Consistency check: best choice, box word and blob choices must all have
// the same length after fixing.
if (word->best_choice->length() != word->box_word->length() ||
word->best_choice->length() != blob_choices->length()) {
tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
" #Blobs=%d; #Choices=%d\n",
word->best_choice->length(),
word->box_word->length(), blob_choices->length());
}
word->raw_choice);
make_reject_map (word, blob_choices, row, 2);
}
}
// Save best choices in the WERD_CHOICE if needed
word->best_choice->set_blob_choices(blob_choices);
set_word_fonts(word, blob_choices);
assert (word->raw_choice != NULL);
}
void tesseract::Tesseract::MaximallyChopWord ( const GenericVector< TBOX > &  boxes,
BLOCK block,
ROW row,
WERD_RES word_res 
)

Definition at line 257 of file applybox.cpp.

{
// Chops the blobs of `word_res` as far as the chopper allows, using fake
// classifier results so that chopping is driven only by the availability of
// chop points, then builds the best/raw word choices from those results.
// NOTE(review): the declaration of `char_choices` and the condition belonging
// to the `} else {` further down are not visible in this listing - confirm
// against applybox.cpp.
if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
row, block)) {
// Setup failed: just copy the chopped word to the rebuilt word and stop.
word_res->CloneChoppedToRebuild();
return;
}
if (chop_debug) {
tprintf("Maximally chopping word at:");
word_res->word->bounding_box().print();
}
BLOB_CHOICE_LIST *match_result;
float rating = static_cast<float>(MAX_INT8);
for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
blob = blob->next) {
// The rating and certainty are not quite arbitrary. Since
// select_blob_to_chop uses the worst certainty to choose, they all have
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
// in here, and then divide by e each time they are chopped, which
// should guarantee a set of unequal values for the whole tree of blobs
// produced, however much chopping is required. The chops are thus only
// limited by the ability of the chopper to find suitable chop points,
// and not by the value of the certainties.
match_result = fake_classify_blob(0, rating, -rating);
modify_blob_choice(match_result, 0);
ASSERT_HOST(!match_result->empty());
*char_choices += match_result;
rating -= 0.125f;
}
inT32 blob_number;
int right_chop_index = 0;
// We only chop if the language is not fixed pitch like CJK.
// Both loops below keep chopping until the chopper reports that no further
// chop was made.
while (chop_one_blob2(boxes, word_res, &word_res->seam_array));
} else {
while (chop_one_blob(word_res->chopped_word, char_choices,
&blob_number, &word_res->seam_array,
&right_chop_index));
}
}
MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
word_res->CloneChoppedToRebuild();
// Release the fake classifier results together with their container.
if (char_choices != NULL) {
char_choices->delete_data_pointers();
delete char_choices;
}
}
Pix** tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 160 of file tesseractclass.h.

{
  // Clear() is run first, then the address of the binary image slot is
  // handed out so the caller can store an image through it.
  Clear();
  Pix** const binary_slot = &pix_binary_;
  return binary_slot;
}
Textord* tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 209 of file tesseractclass.h.

{
  // Gives writable access to the Textord member owned by this instance.
  Textord* const layout_engine = &textord_;
  return layout_engine;
}
void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)
void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)
BOOL8 tesseract::Tesseract::noise_outlines ( TWERD *  word)

Definition at line 987 of file docqual.cpp.

{
  // Reports whether every outline of every blob in `word` is small, i.e. its
  // larger bounding-box dimension lies below
  // kBlnXHeight * crunch_small_outlines_size. A word without any outlines
  // also yields a true result, because both counters stay at zero.
  const float small_limit = kBlnXHeight * crunch_small_outlines_size;
  inT16 total_outlines = 0;
  inT16 small_outlines = 0;

  for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
    for (TESSLINE* outline = blob->outlines; outline != NULL;
         outline = outline->next) {
      ++total_outlines;
      const TBOX box = outline->bounding_box();  // BB of outline
      const inT16 max_dimension =
          box.height() > box.width() ? box.height() : box.width();
      if (max_dimension < small_limit)
        ++small_outlines;
    }
  }
  return (small_outlines >= total_outlines);
}
BOOL8 tesseract::Tesseract::non_0_digit ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 981 of file reject.cpp.

{
  // True for any digit of the character set except "0".
  if (!ch_set.get_isdigit(unichar_id))
    return false;
  return !ch_set.eq(unichar_id, "0");
}
BOOL8 tesseract::Tesseract::non_O_upper ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 977 of file reject.cpp.

{
  // True for any upper-case character of the character set except "O".
  if (!ch_set.get_isupper(unichar_id))
    return false;
  return !ch_set.eq(unichar_id, "O");
}
int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 216 of file tesseractclass.h.

{
  // Number of secondary-language engines currently held in sub_langs_.
  const int count = sub_langs_.size();
  return count;
}
BOOL8 tesseract::Tesseract::one_ell_conflict ( WERD_RES word_res,
BOOL8  update_map 
)

Definition at line 456 of file reject.cpp.

{
// Decides whether the best choice of `word_res` contains an ambiguous use of
// one of the characters in conflict_set_I_l_1. Returns TRUE when a conflict
// is found and FALSE otherwise; when `update_map` is set, the conflicting
// positions are also marked in the word's reject map.
// NOTE(review): this listing is incomplete - the dict_perm_type expression
// below has an unbalanced parenthesis and `word_type` is used near the end
// without a visible declaration. Confirm against reject.cpp.
const char *word;
const char *lengths;
inT16 word_len; //its length
inT16 first_alphanum_index_;
inT16 first_alphanum_offset_;
inT16 i;
inT16 offset;
BOOL8 non_conflict_set_char; //non conf set a/n?
BOOL8 conflict = FALSE;
BOOL8 allow_1s;
BOOL8 dict_perm_type;
BOOL8 dict_word_ok;
int dict_word_type;
// `word` holds the characters and `lengths` one byte-length entry per
// unichar, so strlen(lengths) is the number of unichars.
word = word_res->best_choice->unichar_string().string ();
lengths = word_res->best_choice->unichar_lengths().string();
word_len = strlen (lengths);
/*
If there are no occurrences of the conflict set characters then the word
is OK.
*/
if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
return FALSE;
/*
There is a conflict if there are NO other (confirmed) alphanumerics apart
from those in the conflict set.
*/
// The loop stops at the first alphanumeric that is outside the conflict set.
for (i = 0, offset = 0, non_conflict_set_char = FALSE;
(i < word_len) && !non_conflict_set_char; offset += lengths[i++])
non_conflict_set_char =
(word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
!STRING (conflict_set_I_l_1).contains (word[offset]);
if (!non_conflict_set_char) {
if (update_map)
reject_I_1_L(word_res);
return TRUE;
}
/*
If the word is accepted by a dawg permuter, and the first alpha character
is "I" or "l", check to see if the alternative is also a dawg word. If it
is, then there is a potential error otherwise the word is ok.
*/
dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
(word_res->best_choice->permuter () == USER_DAWG_PERM) ||
(word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
(word_res->best_choice->permuter () == FREQ_DAWG_PERM);
dict_word_type = dict_word(*(word_res->best_choice));
// A document-dawg hit only counts when rej_trust_doc_dawg is set.
dict_word_ok = (dict_word_type > 0) &&
(rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
if ((rej_1Il_use_dict_word && dict_word_ok) ||
(rej_1Il_trust_permuter_type && dict_perm_type) ||
(dict_perm_type && dict_word_ok)) {
first_alphanum_index_ = first_alphanum_index (word, lengths);
first_alphanum_offset_ = first_alphanum_offset (word, lengths);
// Leading single-byte 'I': temporarily swap in 'l', test the dictionary, and
// restore the 'I' on both paths before returning.
if (lengths[first_alphanum_index_] == 1 &&
word[first_alphanum_offset_] == 'I') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (safe_dict_word(word_res) > 0) {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (update_map)
word_res->reject_map[first_alphanum_index_].
setrej_1Il_conflict();
return TRUE;
}
else {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
return FALSE;
}
}
// Leading single-byte 'l': the mirror image of the case above.
if (lengths[first_alphanum_index_] == 1 &&
word[first_alphanum_offset_] == 'l') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (safe_dict_word(word_res) > 0) {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (update_map)
word_res->reject_map[first_alphanum_index_].
setrej_1Il_conflict();
return TRUE;
}
else {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
return FALSE;
}
}
return FALSE;
}
/*
NEW 1Il code. The old code relied on permuter types too much. In fact,
tess will use TOP_CHOICE permute for good things like "palette".
In this code the string is examined independently to see if it looks like
a well formed word.
*/
/*
REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
dictionary word.
*/
first_alphanum_index_ = first_alphanum_index (word, lengths);
first_alphanum_offset_ = first_alphanum_offset (word, lengths);
// NOTE(review): when the flipped word is a dictionary word, the two branches
// below return FALSE without restoring the original character, so the flipped
// character stays in the best choice string - confirm this is intended.
if (lengths[first_alphanum_index_] == 1 &&
word[first_alphanum_offset_] == 'l') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (safe_dict_word(word_res) > 0)
return FALSE;
else
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
}
else if (lengths[first_alphanum_index_] == 1 &&
word[first_alphanum_offset_] == 'I') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (safe_dict_word(word_res) > 0)
return FALSE;
else
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
}
/*
For strings containing digits:
If there are no alphas OR the numeric permuter liked the word,
reject any non 1 conflict chs
Else reject all conflict chs
*/
if (word_contains_non_1_digit (word, lengths)) {
allow_1s = (alpha_count (word, lengths) == 0) ||
(word_res->best_choice->permuter () == NUMBER_PERM);
// This inner `offset` shadows the one declared at the top of the function.
inT16 offset;
conflict = FALSE;
for (i = 0, offset = 0; word[offset] != '\0';
offset += word_res->best_choice->unichar_lengths()[i++]) {
if ((!allow_1s || (word[offset] != '1')) &&
STRING (conflict_set_I_l_1).contains (word[offset])) {
if (update_map)
word_res->reject_map[i].setrej_1Il_conflict ();
conflict = TRUE;
}
}
return conflict;
}
/*
For anything else. See if it conforms to an acceptable word type. If so,
treat accordingly.
*/
word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
// Lower-case and initial-capital words conflict only when their first
// alphanumeric is in the conflict set; all-upper-case words never conflict;
// anything else is rejected wholesale.
if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
first_alphanum_index_ = first_alphanum_index (word, lengths);
first_alphanum_offset_ = first_alphanum_offset (word, lengths);
if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
if (update_map)
word_res->reject_map[first_alphanum_index_].
setrej_1Il_conflict ();
return TRUE;
}
else
return FALSE;
}
else if (word_type == AC_UPPER_CASE) {
return FALSE;
}
else {
if (update_map)
reject_I_1_L(word_res);
return TRUE;
}
}
void tesseract::Tesseract::output_pass ( PAGE_RES_IT page_res_it,
const TBOX target_word_box 
)

Definition at line 72 of file output.cpp.

{
// Walks all words of the page and hands each one to write_results(). When
// target_word_box is given, only words whose bounding-box centre lies inside
// that box are written.
// NOTE(review): the statement that computes `force_eol` and the `if` that
// detects a change of block are only partly visible in this listing, and the
// write_results() call has an unbalanced parenthesis - confirm against
// output.cpp.
BLOCK_RES *block_of_last_word;
inT16 block_id;
BOOL8 force_eol; //During output
BLOCK *nextblock; //block of next word
WERD *nextword; //next word
page_res_it.restart_page ();
block_of_last_word = NULL;
while (page_res_it.word () != NULL) {
check_debug_pt (page_res_it.word (), 120);
if (target_word_box)
{
// Skip words whose centre point is outside the target box.
TBOX current_word_box=page_res_it.word ()->word->bounding_box();
FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
if (!target_word_box->contains(center_pt))
{
page_res_it.forward ();
continue;
}
}
block_of_last_word != page_res_it.block ()) {
// Remember the block of the current word and its index.
block_of_last_word = page_res_it.block ();
block_id = block_of_last_word->block->index();
}
(page_res_it.block () != page_res_it.next_block ())) ||
(page_res_it.next_word () == NULL);
// Look ahead to the next word and the next block; either may be absent.
if (page_res_it.next_word () != NULL)
nextword = page_res_it.next_word ()->word;
else
nextword = NULL;
if (page_res_it.next_block () != NULL)
nextblock = page_res_it.next_block ()->block;
else
nextblock = NULL;
//regardless of tilde crunching
write_results(page_res_it,
page_res_it.block()->block,
nextword, nextblock), force_eol);
page_res_it.forward();
}
}
void tesseract::Tesseract::ParseLanguageString ( const char *  lang_str,
GenericVector< STRING > *  to_load,
GenericVector< STRING > *  not_to_load 
)

Definition at line 234 of file tessedit.cpp.

{
  // Splits a '+'-separated language specification into language codes.
  //
  //   lang_str     the specification, e.g. "eng+deu" or "eng+~deu"
  //   to_load      receives the codes that should be loaded
  //   not_to_load  receives the codes prefixed with '~'
  //
  // A code is appended only when the chosen list does not contain it yet, so
  // the function may be called repeatedly with the same vectors. Empty codes,
  // which arise from a trailing '+' or a '~' with nothing after it, are
  // skipped instead of being added as a nameless language.
  STRING remains(lang_str);
  while (remains.length() > 0) {
    // Find the start of the lang code and which vector to add to.
    const char* start = remains.string();
    while (*start == '+')
      ++start;
    GenericVector<STRING>* target = to_load;
    if (*start == '~') {
      target = not_to_load;
      ++start;
    }
    // The code ends at the next '+', or at the end of the string when there
    // is none.
    const char* plus = strchr(start, '+');
    const int end = (plus != NULL) ? static_cast<int>(plus - start)
                                   : static_cast<int>(strlen(start));
    STRING lang_code(start);
    lang_code.truncate_at(end);
    // `start` points into the buffer of `remains`, so the remainder has to be
    // copied out before `remains` is overwritten.
    STRING next(start + end);
    remains = next;
    if (lang_code.length() == 0)
      continue;  // nothing between separators - ignore
    // Check whether lang_code is already in the target vector and add.
    if (!IsStrInList(lang_code, *target)) {
      tprintf("Adding language '%s' to list\n", lang_code.string());
      target->push_back(lang_code);
    }
  }
}
void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 336 of file pgedit.cpp.

Pix* tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 164 of file tesseractclass.h.

{
  // Read-only accessor for the binary image pointer held by this instance.
  Pix* const binary_image = pix_binary_;
  return binary_image;
}
Pix* tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 167 of file tesseractclass.h.

{
  // Read-only accessor for the grey image pointer held by this instance.
  Pix* const grey_image = pix_grey_;
  return grey_image;
}
BOOL8 tesseract::Tesseract::potential_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level,
BOOL8  ok_dict_word 
)

Definition at line 548 of file docqual.cpp.

{
// Counts indicators that `word` is of poor quality (poor rating per
// character, poor certainty, garbage level other than G_OK) and returns
// whether the count reaches crunch_pot_indicators.
// NOTE(review): parts of several statements are not visible in this listing:
// the call whose arguments end in `str, lengths)`, the second half of the
// certainty condition and the arguments of the three tprintf calls - confirm
// against docqual.cpp.
float rating_per_ch;
int adjusted_len;
const char *str = word->best_choice->unichar_string().string();
const char *lengths = word->best_choice->unichar_lengths().string();
BOOL8 word_crunchable;
int poor_indicator_count = 0;
word_crunchable = !crunch_leave_accept_strings ||
word->reject_map.length() < 3 ||
str, lengths) == AC_UNACCEPTABLE &&
!ok_dict_word);
// The rating is averaged over at most 10 characters.
adjusted_len = word->reject_map.length();
if (adjusted_len > 10)
adjusted_len = 10;
rating_per_ch = word->best_choice->rating() / adjusted_len;
if (rating_per_ch > crunch_pot_poor_rate) {
if (crunch_debug > 2) {
tprintf("Potential poor rating on \"%s\"\n",
}
poor_indicator_count++;
}
if (word_crunchable &&
if (crunch_debug > 2) {
tprintf("Potential poor cert on \"%s\"\n",
}
poor_indicator_count++;
}
if (garbage_level != G_OK) {
if (crunch_debug > 2) {
tprintf("Potential garbage on \"%s\"\n",
}
poor_indicator_count++;
}
return poor_indicator_count >= crunch_pot_indicators;
}
void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 461 of file tesseractclass.cpp.

{
// Prepares the images for page segmentation: hands the binary image to Cube
// and to every sub-language, then runs the shiro-rekha splitter with the
// strongest page segmentation strategy requested by any language and, when it
// split anything, replaces the binary image by the split one.
// NOTE(review): the initialiser of max_pageseg_strategy and the declaration
// of `pageseg_strategy` inside the loop are only partly visible in this
// listing - confirm against tesseractclass.cpp.
// Replace the previous Cube image by a clone of the current binary image.
pixDestroy(&cube_binary_);
cube_binary_ = pixClone(pix_binary());
// Find the max splitter strategy over all langs.
ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy =
for (int i = 0; i < sub_langs_.size(); ++i) {
static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
if (pageseg_strategy > max_pageseg_strategy)
max_pageseg_strategy = pageseg_strategy;
// Clone the cube image to all the sub langs too.
pixDestroy(&sub_langs_[i]->cube_binary_);
sub_langs_[i]->cube_binary_ = pixClone(pix_binary());
// The sub-language's own binary image is replaced in the same way.
pixDestroy(&sub_langs_[i]->pix_binary_);
sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
}
// Perform shiro-rekha (top-line) splitting and replace the current image by
// the newly split image.
splitter_.set_orig_pix(pix_binary());
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
// Split(true) selects the page segmentation variant of the split; the binary
// image is only replaced when it reports success.
if (splitter_.Split(true)) {
pixDestroy(&pix_binary_);
pix_binary_ = pixClone(splitter_.splitted_image());
}
}
void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract osd_tess,
OSResults osr 
)

Definition at line 497 of file tesseractclass.cpp.

{
// Prepares for OCR after page segmentation: runs the splitter with the
// strongest OCR strategy requested by any language, restores the binary image
// to the splitter's original image and, when the page segmentation and OCR
// strategies differ, refreshes the segmentation with blobs extracted from the
// image that will actually be used for OCR.
// NOTE(review): the declarations of `max_ocr_strategy` and `ocr_strategy` are
// not visible in this listing, and osd_tess/osr are not used in the visible
// code - confirm against tesseractclass.cpp.
// Find the max splitter strategy over all langs.
for (int i = 0; i < sub_langs_.size(); ++i) {
static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
if (ocr_strategy > max_ocr_strategy)
max_ocr_strategy = ocr_strategy;
}
// Utilize the segmentation information available.
splitter_.set_segmentation_block_list(block_list);
splitter_.set_ocr_split_strategy(max_ocr_strategy);
// Run the splitter for OCR
bool split_for_ocr = splitter_.Split(false);
// Restore pix_binary to the binarized original pix for future reference.
ASSERT_HOST(splitter_.orig_pix());
pixDestroy(&pix_binary_);
pix_binary_ = pixClone(splitter_.orig_pix());
// If the pageseg and ocr strategies are different, refresh the block list
// (from the last SegmentImage call) with blobs from the real image to be used
// for OCR.
if (splitter_.HasDifferentSplitStrategies()) {
// A temporary block sized to the whole image receives the extracted blobs.
BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
pixGetHeight(pix_binary_));
// Use the split image when the OCR split succeeded, else the original.
Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
splitter_.orig_pix();
extract_edges(pix_for_ocr, &block);
splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
}
// The splitter isn't needed any more after this, so save memory by clearing.
splitter_.Clear();
}
BOOL8 tesseract::Tesseract::process_cmd_win_event ( inT32  cmd_event,
char *  new_value 
)

Definition at line 396 of file pgedit.cpp.

{
// Handles a command-window event: updates the editor mode, display flags or
// color_mode according to cmd_event, and returns TRUE only from the branch
// that sets exit.
// NOTE(review): the case labels and several branch bodies are missing from
// this listing (bare "break;" runs, "if ... else" with no statements), so the
// mapping from cmd_event values to actions cannot be read from here. See
// pgedit.cpp for the complete switch.
char msg[160];
BOOL8 exit = FALSE;
color_mode = CM_RAINBOW;
// Run recognition on the full page if needed.
switch (cmd_event) {
if (!recog_done) {
recog_done = true;
}
break;
default:
break;
}
switch (cmd_event) {
break;
mode =(CMD_EVENTS) cmd_event;
break;
word_config_ = image_win->ShowInputDialog("Config File Name");
break;
// The following branches test whether new_value starts with 'T'; the
// statements they guard are not shown in this listing.
if (new_value[0] == 'T')
else
break;
if (new_value[0] == 'T')
else
break;
if (new_value[0] == 'T')
else
break;
if (new_value[0] == 'T')
else
break;
if (new_value[0] == 'T')
else
break;
if (new_value[0] == 'T')
else
break;
break;
// Display toggles: enabled when new_value starts with 'T'.
display_image =(new_value[0] == 'T');
break;
display_blocks =(new_value[0] == 'T');
break;
display_baselines =(new_value[0] == 'T');
break;
// Color-mode selections.
color_mode = CM_SUBSCRIPT;
break;
color_mode = CM_SUPERSCRIPT;
break;
color_mode = CM_ITALIC;
break;
color_mode = CM_BOLD;
break;
color_mode = CM_UNDERLINE;
break;
color_mode = CM_FIXEDPITCH;
break;
color_mode = CM_SERIF;
break;
color_mode = CM_SMALLCAPS;
break;
color_mode = CM_DROPCAPS;
break;
break;
exit = TRUE;
break;
default:
// NOTE(review): new_value is formatted with %s into the 160-byte msg
// buffer with no length limit -- confirm callers bound its length.
sprintf(msg, "Unrecognised event " INT32FORMAT "(%s)",
cmd_event, new_value);
break;
}
return exit;
}
void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.

Definition at line 563 of file pgedit.cpp.

{
// Handles an event from the image window. For a selection event the far
// corner (x + x_size, y + y_size) is stored in the static `down`; the event
// position becomes `up`, the two form selection_box, and the action for the
// current `mode` is applied to that box.
// NOTE(review): case labels and the call lines that take selection_box are
// missing from this listing (orphaned "selection_box," lines); see pgedit.cpp
// for the full dispatch.
// The following variable should remain static, since it is used by
// debug editor, which uses a single Tesseract instance.
static ICOORD down;
ICOORD up;
TBOX selection_box;
char msg[80];
switch(event.type) {
if (event.type == SVET_SELECTION) {
down.set_x(event.x + event.x_size);
down.set_y(event.y + event.y_size);
show_point(current_page_res, event.x, event.y);
}
up.set_x(event.x);
up.set_y(event.y);
selection_box = TBOX(down, up);
switch(mode) {
selection_box,
break;
selection_box,
break;
selection_box,
break;
debug_word(current_page_res, selection_box);
break;
break; // ignore up event
image_win->AddMessage("Recogging selected words");
selection_box,
break;
image_win->AddMessage("Recogging selected blobs");
break;
default:
sprintf(msg, "Mode %d not yet implemented", mode);
break;
}
default:
break;
}
}
void tesseract::Tesseract::process_selected_words ( PAGE_RES page_res,
TBOX selection_box,
BOOL8(tesseract::Tesseract::*)(BLOCK *block, ROW *row, WERD_RES *word_res)  word_processor 
)

Definition at line 31 of file pagewalk.cpp.

{
  // Visit every word on the page and hand each one whose bounding box
  // overlaps selection_box to word_processor. Stop walking the page as soon
  // as the processor returns a false value.
  PAGE_RES_IT word_it(page_res);
  while (word_it.word() != NULL) {
    WERD* candidate = word_it.word()->word;
    if (candidate->bounding_box().overlap(selection_box)) {
      if (!((this->*word_processor)(word_it.block()->block,
                                    word_it.row()->row,
                                    word_it.word()))) {
        return;
      }
    }
    word_it.forward();
  }
}
bool tesseract::Tesseract::ProcessTargetWord ( const TBOX &  word_box,
const TBOX &  target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 128 of file control.cpp.

{
// Decides whether a word should be processed with respect to a target box.
// With a word_config: words overlapping the target trigger creation of a
// backup config file (once), and words outside it restore parameters from
// that backup; the function then returns true. Without a word_config:
// returns false for non-overlapping words on passes after the first.
// NOTE(review): lines are missing from this listing (the orphaned
// "params());" below belongs to a call that is not shown); see control.cpp.
if (word_config != NULL) {
if (word_box.major_overlap(target_word_box)) {
if (backup_config_file_ == NULL) {
backup_config_file_ = kBackUpConfigFile;
// NOTE(review): the fopen result is passed to fclose without a NULL
// check in the visible code -- confirm in the full source.
FILE* config_fp = fopen(backup_config_file_, "wb");
fclose(config_fp);
params());
}
} else {
if (backup_config_file_ != NULL) {
// Leaving the target area: reload the backed-up parameters and forget
// the backup so it is recreated on the next target word.
ParamUtils::ReadParamsFile(backup_config_file_,
params());
backup_config_file_ = NULL;
}
}
} else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
return false;
}
return true;
}
void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 
)

Definition at line 143 of file docqual.cpp.

{
// Document-quality driven rejection: calls doc_and_block_rejection,
// tilde_crunch and tilde_delete on the page iterator.
// NOTE(review): the braces below do not balance, so part of the condition
// structure (what follows the tessedit_good_quality_unrej test, and which
// calls are conditional) is missing from this listing; see docqual.cpp.
if ((tessedit_good_quality_unrej && good_quality_doc))
doc_and_block_rejection(page_res_it, good_quality_doc);
tilde_crunch(page_res_it);
tilde_delete(page_res_it);
}
}
void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 67 of file tessedit.cpp.

{
  // Resolve the config file location, then load its parameters.
  // Search order: <datadir>configs/<filename>, <datadir>tessconfigs/<filename>,
  // and finally <filename> taken as given. A candidate is accepted if it can
  // be opened for reading.
  const char* const kConfigSubdirs[] = {"configs/", "tessconfigs/"};
  const int kNumConfigSubdirs = 2;
  STRING path;
  bool located = false;
  for (int dir = 0; dir < kNumConfigSubdirs && !located; ++dir) {
    path = datadir;
    path += kConfigSubdirs[dir];
    path += filename;
    FILE* probe = fopen(path.string(), "rb");
    if (probe != NULL) {
      fclose(probe);
      located = true;
    }
  }
  if (!located) {
    // Not under either data sub-directory: use the name unchanged.
    path = filename;
  }
  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
}
bool tesseract::Tesseract::recog_all_words ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond, ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters:
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies just to extract a rectangle
dopasses: 0 - all, 1 just pass 1, 2 passes 2 and higher

Definition at line 178 of file control.cpp.

{
// Top-level recognition driver. Sets up fake results for every word, runs
// pass 1 and pass 2 word by word (updating the monitor and returning false
// on deadline/cancel), then the later passes (fuzzy spaces, rejection, cube
// combiner, blamer), and finally output and cleanup. Returns true unless
// cancelled.
// NOTE(review): this listing is heavily truncated -- many lines (the
// classification calls, several conditions and their opening braces) are
// missing, leaving orphaned argument lines and unbalanced braces. Treat it
// as an outline only and read control.cpp for the real control flow.
PAGE_RES_IT page_res_it;
inT32 word_index; // current word
}
// Before the main recognition loop below, walk through the whole page and set
// up fake words. That way, if we run out of time a user will still get the
// expected best_choice and box_words out the end; they'll just be empty.
page_res_it.page_res = page_res;
for (page_res_it.restart_page(); page_res_it.word() != NULL;
page_res_it.forward()) {
page_res_it.word()->SetupFake(unicharset);
}
if (dopasses==0 || dopasses==1) {
page_res_it.page_res=page_res;
page_res_it.restart_page();
// ****************** Pass 1 *******************
// Clear adaptive classifier at the beginning of the page if it is full.
// This is done only at the beginning of the page to ensure that the
// classifier is not reset at an arbitrary point while processing the page,
// which would cripple Passes 2+ if the reset happens towards the end of
// Pass 1 on a page with very difficult text.
// TODO(daria): preemptively clear the classifier if it is almost full.
// Now check the sub-langs as well.
for (int i = 0; i < sub_langs_.size(); ++i) {
if (sub_langs_[i]->AdaptiveClassifierIsFull())
}
stats_.word_count = 0;
if (monitor != NULL) {
monitor->ocr_alive = TRUE;
// Count the words up front so progress can be reported as a fraction.
while (page_res_it.word() != NULL) {
stats_.word_count++;
page_res_it.forward();
}
page_res_it.restart_page();
} else {
// No monitor: word_count is only used as a divisor, so keep it non-zero.
stats_.word_count = 1;
}
word_index = 0;
stats_.dict_words = 0;
stats_.doc_blob_quality = 0;
stats_.doc_outline_errs = 0;
stats_.doc_char_quality = 0;
stats_.good_char_count = 0;
most_recently_used_ = this;
while (page_res_it.word() != NULL) {
word_index++;
if (monitor != NULL) {
monitor->ocr_alive = TRUE;
// Pass 1 reports progress in the 30..80 range.
monitor->progress = 30 + 50 * word_index / stats_.word_count;
if (monitor->deadline_exceeded() ||
(monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
stats_.dict_words)))
return false;
}
if (target_word_box &&
*target_word_box, word_config, 1)) {
page_res_it.forward();
continue;
}
page_res_it.block()->block,
page_res_it.row()->row,
page_res_it.word());
if (page_res_it.word()->word->flag(W_REP_CHAR)) {
fix_rep_char(&page_res_it);
page_res_it.forward();
continue;
}
word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
tprintf("Pass1: %s [%s]\n",
page_res_it.word()->best_choice->unichar_string().string(),
page_res_it.word()->best_choice->debug_string().string());
}
// tessedit_test_adaption enables testing of the accuracy of the
// input to the adaptive classifier.
if (!word_adaptable (page_res_it.word(),
// FAKE PERM REJ
} else {
// Override rejection mechanisms for this word.
for (int i = 0; i < page_res_it.word()->best_choice->length(); i++) {
if ((page_res_it.word()->best_choice->unichar_id(i) != space) &&
page_res_it.word()->reject_map[i].rejected())
page_res_it.word()->reject_map[i].setrej_minimal_rej_accept();
}
}
}
// Count dict words.
if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
++(stats_.dict_words);
// Update misadaption log (we only need to do it on pass 1, since
// adaption only happens on this pass).
if (page_res_it.word()->blamer_bundle != NULL &&
page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) {
}
page_res_it.forward();
}
}
if (dopasses == 1) return true;
// ****************** Pass 2 *******************
page_res_it.restart_page();
word_index = 0;
most_recently_used_ = this;
while (!tessedit_test_adaption && page_res_it.word() != NULL) {
word_index++;
if (monitor != NULL) {
monitor->ocr_alive = TRUE;
// Pass 2 reports progress in the 80..90 range.
monitor->progress = 80 + 10 * word_index / stats_.word_count;
if (monitor->deadline_exceeded() ||
(monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
stats_.dict_words)))
return false;
}
// changed by jetsoft
// specific to its needs to extract one word when need
if (target_word_box &&
*target_word_box, word_config, 2)) {
page_res_it.forward();
continue;
}
// end jetsoft
page_res_it.block()->block,
page_res_it.row()->row,
page_res_it.word());
if (page_res_it.word()->word->flag(W_REP_CHAR) &&
!page_res_it.word()->done) {
fix_rep_char(&page_res_it);
page_res_it.forward();
continue;
}
word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
tprintf("Pass2: %s [%s]\n",
page_res_it.word()->best_choice->unichar_string().string(),
page_res_it.word()->best_choice->debug_string().string());
}
page_res_it.forward();
}
// The next passes can only be run if tesseract has been used, as cube
// doesn't set all the necessary outputs in WERD_RES.
// ****************** Pass 3 *******************
// Fix fuzzy spaces.
fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
// ****************** Pass 4 *******************
// ****************** Pass 5,6 *******************
rejection_passes(page_res, monitor, target_word_box, word_config);
// ****************** Pass 7 *******************
// Cube combiner.
// If cube is loaded and its combiner is present, run it.
run_cube_combiner(page_res);
}
// ****************** Pass 8 *******************
// ****************** Pass 9 *******************
// Check the correctness of the final results.
blamer_pass(page_res);
}
// We aren't saving the blob choices so get rid of them now.
// set_blob_choices() does a deep clear.
page_res_it.restart_page();
while (page_res_it.word() != NULL) {
WERD_RES* word = page_res_it.word();
page_res_it.forward();
}
}
// Write results pass.
// This is now redundant, but retained commented so show how to obtain
// bounding boxes and style information.
// changed by jetsoft
// needed for dll to output memory structure
if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
output_pass(page_res_it, target_word_box);
// end jetsoft
PageSegMode pageseg_mode = static_cast<PageSegMode>(
static_cast<int>(tessedit_pageseg_mode));
textord_.CleanupSingleRowResult(pageseg_mode, page_res);
if (monitor != NULL) {
monitor->progress = 100;
}
return true;
}
BOOL8 tesseract::Tesseract::recog_interactive ( BLOCK block,
ROW row,
WERD_RES word_res 
)

recog_interactive

Recognize a single word in interactive mode.

Parameters:
block: block
row: row of word
word_res: word to recognise

Definition at line 97 of file control.cpp.

{
// Recognizes one word interactively, computes its character quality figures
// and reports them; always returns TRUE.
// NOTE(review): the recognition call (only its argument line
// "block, row, word_res);" remains), the print call name and a guarding
// condition are missing from this listing; see control.cpp.
inT16 char_qual;
inT16 good_char_qual;
block, row, word_res);
word_char_quality(word_res, row, &char_qual, &good_char_qual);
("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
word_res->reject_map.length(), word_blob_quality(word_res, row),
word_outline_errs(word_res), char_qual, good_char_qual);
}
return TRUE;
}
void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES page_res,
TBOX selection_box 
)

Definition at line 72 of file control.cpp.

{
  // Build a temporary word covering selection_box and recognize it
  // interactively. The pseudo block/row are passed to make_pseudo_word
  // uninitialized and are presumably filled in by it -- confirm against its
  // declaration.
  BLOCK *pseudo_block;  // block of word
  ROW *pseudo_row;      // row of word
  WERD *word = make_pseudo_word(page_res, selection_box,
                                pseudo_block, pseudo_row);
  if (word == NULL)
    return;  // Nothing to recognize.
  WERD_RES word_res(word);
  recog_interactive(pseudo_block, pseudo_row, &word_res);
  delete word;
}
void tesseract::Tesseract::recog_training_segmented ( const STRING fname,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 88 of file recogtraining.cpp.

{
// Walks the Tesseract-found word boxes and the boxes of the matching ".box"
// file in step, and for every pair whose corners agree to within
// kMaxBoxEdgeDiff calls ambigs_classify_and_output with the box-file label.
// Afterwards gives any word without a uch_set a fake result and reports how
// many words were examined.
// Derive the box file name: replace the extension of fname with ".box".
STRING box_fname = fname;
const char *lastdot = strrchr(box_fname.string(), '.');
if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
box_fname += ".box";
// read_next_box() will close box_file
// NOTE(review): the code below calls ReadNextBox, not read_next_box, and
// never closes box_file itself -- confirm who owns the handle, especially
// when the loop stops because read_t fails first.
FILE *box_file = open_file(box_fname.string(), "r");
PAGE_RES_IT page_res_it;
page_res_it.page_res = page_res;
page_res_it.restart_page();
STRING label;
// Process all the words on this page.
TBOX tbox; // tesseract-identified box
TBOX bbox; // box from the box file
bool keep_going;
int line_number = 0;
int examined_words = 0;
do {
keep_going = read_t(&page_res_it, &tbox);
// Non-short-circuit &=: the next box is read even if read_t failed.
keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
&bbox);
// Align bottom left points of the TBOXes.
while (keep_going &&
!NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
// Advance whichever stream has the higher bottom edge.
keep_going = (bbox.bottom() < tbox.bottom()) ?
read_t(&page_res_it, &tbox) :
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
}
while (keep_going &&
!NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
// Advance whichever stream has the smaller left edge.
keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) :
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
}
// OCR the word if top right points of the TBOXes are similar.
if (keep_going &&
NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
// The prev_* accessors are used here; presumably read_t has already
// advanced the iterator past the matched word -- verify in read_t.
ambigs_classify_and_output(page_res_it.prev_word(),
page_res_it.prev_row(),
page_res_it.prev_block(),
label.string(), output_file);
examined_words++;
}
} while (keep_going);
// Set up scripts on all of the words that did not get sent to
// ambigs_classify_and_output. They all should have, but if all the
// werd_res's don't get uch_sets, tesseract will crash when you try
// to iterate over them. :-(
int total_words = 0;
for (page_res_it.restart_page(); page_res_it.block() != NULL;
page_res_it.forward()) {
if (page_res_it.word()) {
if (page_res_it.word()->uch_set == NULL)
page_res_it.word()->SetupFake(unicharset);
total_words++;
}
}
// Warn when fewer than 85% of the words were matched to a box.
if (examined_words < 0.85 * total_words) {
tprintf("TODO(antonova): clean up recog_training_segmented; "
" It examined only a small fraction of the ambigs image.\n");
}
tprintf("recog_training_segmented: examined %d / %d words.\n",
examined_words, total_words);
}
void tesseract::Tesseract::recog_word ( WERD_RES word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 54 of file tfacepp.cpp.

{
// Recognizes a word via recog_word_recursive, checks that best_choice,
// box_word and blob_choices all have the same length, possibly overrides the
// permuter type from a dictionary check, and sets word->tess_failed.
// NOTE(review): lines are missing from this listing (the first tprintf has a
// %s with no string argument shown, and two conditions have lost their
// opening lines); see tfacepp.cpp.
recog_word_recursive(word, blob_choices);
word->SetupBoxWord();
if ((word->best_choice->length() != word->box_word->length()) ||
(word->best_choice->length() != blob_choices->length())) {
tprintf("recog_word ASSERT FAIL String:\"%s\"; "
"Strlen=%d; #Blobs=%d; #Choices=%d\n",
word->best_choice->length(), word->box_word->length(),
blob_choices->length());
}
ASSERT_HOST(word->best_choice->length() == word->box_word->length());
ASSERT_HOST(word->best_choice->length() == blob_choices->length());
/* Override the permuter type if a straight dictionary check disagrees. */
uinT8 perm_type = word->best_choice->permuter();
if ((perm_type != SYSTEM_DAWG_PERM) &&
(perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
uinT8 real_dict_perm_type = dict_word(*word->best_choice);
if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
(real_dict_perm_type == FREQ_DAWG_PERM) ||
(real_dict_perm_type == USER_DAWG_PERM)) &&
word->best_choice->unichar_lengths().string()) > 0)) {
word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
}
}
perm_type != word->best_choice->permuter()) {
tprintf("Permuter Type Flipped from %d to %d\n",
perm_type, word->best_choice->permuter());
}
}
// Factored out from control.cpp
ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
// A word counts as failed when there is no best choice, it is empty, or it
// consists solely of spaces.
if (word->best_choice == NULL || word->best_choice->length() == 0 ||
strspn(word->best_choice->unichar_string().string(), " ") ==
word->best_choice->length()) {
word->tess_failed = true;
} else {
word->tess_failed = false;
}
}
void tesseract::Tesseract::recog_word_recursive ( WERD_RES word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 109 of file tfacepp.cpp.

{
// Recognizes a word with cc_recog (or split_and_recog_word when it has more
// than MAX_UNDIVIDED_LENGTH blobs), appends the resulting choice lists to
// blob_choices, and then forces raw_choice, best_choice and the number of
// newly added choice lists to agree with the rebuilt word's blob count.
// NOTE(review): lines are missing from this listing (space_id is used but not
// declared here, one tprintf lacks its %s argument, and the final
// append_unichar_id call is cut off); see tfacepp.cpp.
int word_length = word->chopped_word->NumBlobs(); // no of blobs
if (word_length > MAX_UNDIVIDED_LENGTH) {
return split_and_recog_word(word, blob_choices);
}
// Remember how many lists were present so only the ones added here are
// counted below.
int initial_blob_choice_len = blob_choices->length();
BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word);
// Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
for (int i = 0; i < tess_ratings->length(); ++i) {
blob_choices_it.add_to_end(tess_ratings->get(i));
}
delete tess_ratings;
word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
// Pad raw_choice with spaces if needed.
if (word->raw_choice->length() < word_length) {
while (word->raw_choice->length() < word_length) {
word->raw_choice->append_unichar_id(space_id, 1, 0.0,
word->raw_choice->certainty());
}
}
// Do sanity checks and minor fixes on best_choice.
if (word->best_choice->length() > word_length) {
word->best_choice->make_bad(); // should never happen
tprintf("recog_word: Discarded long string \"%s\""
" (%d characters vs %d blobs)\n",
word->best_choice->length(), word_length);
tprintf("Word is at:");
word->word->bounding_box().print();
}
if (blob_choices->length() - initial_blob_choice_len != word_length) {
word->best_choice->make_bad(); // force rejection
tprintf("recog_word: Choices list len:%d; blob lists len:%d\n",
blob_choices->length(), word_length);
blob_choices_it.set_to_list(blob_choices); // list of lists
// Too few lists: pad with empty ones.
while (blob_choices->length() - initial_blob_choice_len < word_length) {
blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one
tprintf("recog_word: Added dummy choice list\n");
}
// Too many lists: drop from the end.
while (blob_choices->length() - initial_blob_choice_len > word_length) {
blob_choices_it.move_to_last(); // should never happen
delete blob_choices_it.extract();
tprintf("recog_word: Deleted choice list\n");
}
}
// Pad best_choice up to the blob count.
if (word->best_choice->length() < word_length) {
while (word->best_choice->length() < word_length) {
word->best_choice->append_unichar_id(space_id, 1, 0.0,
}
}
}
void tesseract::Tesseract::recognize_page ( STRING image_name)
void tesseract::Tesseract::reject_edge_blobs ( WERD_RES word)

Definition at line 427 of file reject.cpp.

{
  // Marks as edge characters all blobs of the word that lie within
  // tessedit_image_border of the image boundary. The per-blob scan only runs
  // when the word's own bounding box is that close to an edge.
  TBOX word_bounds = word->word->bounding_box();
  // box_word is used because it is already denormed back to image coordinates.
  int num_blobs = word->box_word->length();
  const bool word_near_edge =
      word_bounds.left() < tessedit_image_border ||
      word_bounds.bottom() < tessedit_image_border ||
      word_bounds.right() + tessedit_image_border > ImageWidth() - 1 ||
      word_bounds.top() + tessedit_image_border > ImageHeight() - 1;
  if (!word_near_edge)
    return;
  ASSERT_HOST(word->reject_map.length() == num_blobs);
  for (int b = 0; b < num_blobs; b++) {
    TBOX blob_bounds = word->box_word->BlobBox(b);
    const bool blob_near_edge =
        blob_bounds.left() < tessedit_image_border ||
        blob_bounds.bottom() < tessedit_image_border ||
        blob_bounds.right() + tessedit_image_border > ImageWidth() - 1 ||
        blob_bounds.top() + tessedit_image_border > ImageHeight() - 1;
    if (blob_near_edge) {
      // Close to edge
      word->reject_map[b].setrej_edge_char();
    }
  }
}
void tesseract::Tesseract::reject_I_1_L ( WERD_RES word)

Definition at line 303 of file reject.cpp.

{
// Walks best_choice one unichar at a time (offset advances by each unichar's
// byte length, i by one) and flags matching positions in reject_map as
// 1/I/l conflicts.
// NOTE(review): the start of the "if" condition is missing from this listing
// (only ".contains(...)" remains), so the set being tested is not visible;
// see reject.cpp.
inT16 i;
inT16 offset;
for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
offset += word->best_choice->unichar_lengths()[i], i += 1) {
contains (word->best_choice->unichar_string()[offset])) {
//rej 1Il conflict
word->reject_map[i].setrej_1Il_conflict ();
}
}
}
void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES word)

Definition at line 752 of file reject.cpp.

{
// NOTE(review): this listing is cut off mid-condition -- the threshold and
// the action taken are missing; see reject.cpp. Also confirm in the full
// source what happens when reject_map.length() is 0, since it is the divisor.
/* Reject the whole of the word if the fraction of rejects exceeds a limit */
if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
}
void tesseract::Tesseract::rejection_passes ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config 
)

Definition at line 588 of file control.cpp.

{
// Pass 5 accumulates per-document quality statistics (blob quality, outline
// errors, character quality, good character counts) over every word that was
// processed by tesseract; pass 6 then calls quality_based_rejection with a
// good_quality_doc flag derived from those statistics.
// NOTE(review): lines are missing from this listing (the thresholds in the
// good_quality_doc expression, the debug print call name, and at least one
// condition), and the braces do not balance; see control.cpp.
PAGE_RES_IT page_res_it(page_res);
// ****************** Pass 5 *******************
// Gather statistics on rejects.
int word_index = 0;
while (!tessedit_test_adaption && page_res_it.word() != NULL) {
WERD_RES* word = page_res_it.word();
word_index++;
if (monitor != NULL) {
monitor->ocr_alive = TRUE;
// This pass reports progress in the 95..100 range.
monitor->progress = 95 + 5 * word_index / stats_.word_count;
}
if (word->rebuild_word == NULL) {
// Word was not processed by tesseract.
page_res_it.forward();
continue;
}
check_debug_pt(word, 70);
// changed by jetsoft
// specific to its needs to extract one word when need
if (target_word_box &&
*target_word_box, word_config, 4)) {
page_res_it.forward();
continue;
}
// end jetsoft
page_res_it.rej_stat_word();
int chars_in_word = word->reject_map.length();
int rejects_in_word = word->reject_map.reject_count();
int blob_quality = word_blob_quality(word, page_res_it.row()->row);
stats_.doc_blob_quality += blob_quality;
int outline_errs = word_outline_errs(word);
stats_.doc_outline_errs += outline_errs;
inT16 all_char_quality;
inT16 accepted_all_char_quality;
word_char_quality(word, page_res_it.row()->row,
&all_char_quality, &accepted_all_char_quality);
stats_.doc_char_quality += all_char_quality;
// Only words whose permuter is one of the dawg types listed below
// contribute to the "good" counters.
uinT8 permuter_type = word->best_choice->permuter();
if ((permuter_type == SYSTEM_DAWG_PERM) ||
(permuter_type == FREQ_DAWG_PERM) ||
(permuter_type == USER_DAWG_PERM)) {
stats_.good_char_count += chars_in_word - rejects_in_word;
stats_.doc_good_char_quality += accepted_all_char_quality;
}
check_debug_pt(word, 80);
(blob_quality == 0) && (outline_errs >= chars_in_word))
check_debug_pt(word, 90);
page_res_it.forward();
}
("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
" outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
page_res->char_count, page_res->rej_count,
page_res->rej_count / static_cast<float>(page_res->char_count),
stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
stats_.doc_char_quality / static_cast<float>(page_res->char_count),
(stats_.good_char_count > 0) ?
static_cast<float>(stats_.good_char_count)) : 0.0);
}
// NOTE(review): every ratio here divides by page_res->char_count; confirm in
// the full source how an empty page (char_count == 0) is meant to behave.
BOOL8 good_quality_doc =
((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
(stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
(stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
(stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
// ****************** Pass 6 *******************
// Do whole document or whole block rejection pass
quality_based_rejection(page_res_it, good_quality_doc);
}
}
BOOL8 tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES word,
ROW row 
)

Definition at line 761 of file reject.cpp.

{
// Returns TRUE when the word is longer than one unichar, every unichar id is
// identical, and the character quality equals both the word length and the
// accepted character quality; FALSE otherwise.
// NOTE(review): the start of the second condition is missing from this
// listing (only ".contains(...)" remains), so the character-set test applied
// to the first character is not visible; see reject.cpp.
inT16 char_quality;
inT16 accepted_char_quality;
if (word->best_choice->unichar_lengths().length() <= 1)
return FALSE;
contains(word->best_choice->unichar_string()[0]))
return FALSE;
// All remaining unichars must match the first one.
UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
for (int i = 1; i < word->best_choice->length(); ++i) {
if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
}
word_char_quality(word, row, &char_quality, &accepted_char_quality);
if ((word->best_choice->unichar_lengths().length () == char_quality) &&
(char_quality == accepted_char_quality))
return TRUE;
else
return FALSE;
}
void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

Definition at line 756 of file applybox.cpp.

{
  // Log one failed box-file entry: its line number, its text, the box as
  // ((left,bottom),(right,top)) and the reason for the failure.
  const int left = box.left();
  const int bottom = box.bottom();
  const int right = box.right();
  const int top = box.top();
  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
          boxfile_lineno, box_ch, left, bottom, right, top, err_msg);
}
void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES word,
WERD_RES new_word 
)

Definition at line 955 of file control.cpp.

{
// Prints a one-line debug summary of an x-height fix attempt: the old and
// new words, whether each x-height was guessed or certain, whether doubt
// remains (new_x_ht > 0.1), and whether the new word was accepted.
// NOTE(review): the argument lines of the first two tprintf calls are partly
// missing from this listing; see control.cpp.
tprintf("New XHT Match:%s = %s ",
tprintf(" -> %s = %s ",
new_word->best_choice->debug_string().string());
tprintf(" %s->%s %s %s\n",
word->guessed_x_ht ? "GUESS" : "CERT",
new_word->guessed_x_ht ? "GUESS" : "CERT",
new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
accept_new_word ? "ACCEPTED" : "");
}
void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES page_res)

Definition at line 510 of file applybox.cpp.

{
// For every word that carries truth text, converts the text to unichar ids
// and searches for a segmentation that matches it. Words for which either
// step fails are deleted from the page results.
// NOTE(review): the declaration of target_text is missing from this listing;
// see applybox.cpp.
PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res;
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
WERD* word = word_res->word;
if (word->text() == NULL || word->text()[0] == '\0')
continue; // Ignore words that have no text.
// Convert the correct text to a vector of UNICHAR_ID
if (!ConvertStringToUnichars(word->text(), &target_text)) {
tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
word->text());
pr_it.DeleteCurrentWord();
continue;
}
if (!FindSegmentation(target_text, word_res)) {
tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
word->text());
pr_it.DeleteCurrentWord();
continue;
}
}
}
bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES page_res,
const TBOX prev_box,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

Definition at line 341 of file applybox.cpp.

{
// Finds, in the first word that majorly overlaps `box`, a run of consecutive
// unclaimed blobs that match `box` better than `next_box`, merges them into a
// single box_word entry and records correct_text for it. Returns true on
// success; returns false if no such run exists, or if the merged blobs do not
// closely match `box` while `box` overlaps a neighbouring box horizontally.
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
}
PAGE_RES_IT page_res_it(page_res);
WERD_RES* word_res;
for (word_res = page_res_it.word(); word_res != NULL;
word_res = page_res_it.forward()) {
if (!word_res->box_word->bounding_box().major_overlap(box))
continue;
if (applybox_debug > 1) {
tprintf("Checking word box:");
word_res->box_word->bounding_box().print();
}
int word_len = word_res->box_word->length();
// Try each blob index i as the start of a matching run.
for (int i = 0; i < word_len; ++i) {
TBOX char_box = TBOX();
int blob_count = 0;
// Extend the run while blobs overlap the box, are unclaimed, and fit
// this box at least as well as the next one.
for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
if (!blob_box.major_overlap(box))
break;
if (word_res->correct_text[i + blob_count].length() > 0)
break; // Blob is claimed already.
double current_box_miss_metric = BoxMissMetric(blob_box, box);
double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric)
break; // Blob is a better match for next box.
char_box += blob_box;
}
if (blob_count > 0) {
if (applybox_debug > 1) {
tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
}
// Give up when the union of the blobs is not within 3 of the box and
// the box overlaps its next or previous neighbour by more than 3.
if (!char_box.almost_equal(box, 3) &&
(box.x_gap(next_box) < -3 ||
(prev_box != NULL && prev_box->x_gap(box) < -3))) {
return false;
}
// We refine just the box_word, best_state and correct_text here.
// The rebuild_word is made in TidyUp.
// blob_count blobs are put together to match the box. Merge the
// box_word boxes, save the blob_count in the state and the text.
word_res->box_word->MergeBoxes(i, i + blob_count);
word_res->best_state[i] = blob_count;
word_res->correct_text[i] = correct_text;
if (applybox_debug > 2) {
tprintf("%d Blobs match: blob box:", blob_count);
word_res->box_word->BlobBox(i).print();
tprintf("Matches box:");
box.print();
tprintf("With next box:");
next_box.print();
}
// Eliminated best_state and correct_text entries for the consumed
// blobs.
// Index i + 1 is removed each time because the entries shift down
// after every removal.
for (int j = 1; j < blob_count; ++j) {
word_res->best_state.remove(i + 1);
word_res->correct_text.remove(i + 1);
}
// Assume that no box spans multiple source words, so we are done with
// this box.
if (applybox_debug > 1) {
tprintf("Best state = ");
for (int j = 0; j < word_res->best_state.size(); ++j) {
tprintf("%d ", word_res->best_state[j]);
}
tprintf("\n");
tprintf("Correct text = [[ ");
for (int j = 0; j < word_res->correct_text.size(); ++j) {
tprintf("%s ", word_res->correct_text[j].string());
}
tprintf("]]\n");
}
return true;
}
}
}
if (applybox_debug > 0) {
tprintf("FAIL!\n");
}
return false; // Failure.
}
bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

Definition at line 439 of file applybox.cpp.

{
// Collects, from every not-yet-labelled word that majorly overlaps `box`,
// the blobs that match `box` at least as well as `next_box`, and moves them
// into one new word labelled with correct_text. Returns true if at least one
// blob was moved (i.e. a new word was created).
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
}
WERD* new_word = NULL;
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
if (!box.major_overlap(block->bounding_box()))
continue;
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
ROW* row = r_it.data();
if (!box.major_overlap(row->bounding_box()))
continue;
WERD_IT w_it(row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
if (applybox_debug > 2) {
tprintf("Checking word:");
word->bounding_box().print();
}
if (word->text() != NULL && word->text()[0] != '\0')
continue; // Ignore words that are already done.
if (!box.major_overlap(word->bounding_box()))
continue;
C_BLOB_IT blob_it(word->cblob_list());
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
blob_it.forward()) {
C_BLOB* blob = blob_it.data();
TBOX blob_box = blob->bounding_box();
if (!blob_box.major_overlap(box))
continue;
double current_box_miss_metric = BoxMissMetric(blob_box, box);
double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric)
continue; // Blob is a better match for next box.
if (applybox_debug > 2) {
tprintf("Blob match: blob:");
blob_box.print();
tprintf("Matches box:");
box.print();
tprintf("With next box:");
next_box.print();
}
if (new_word == NULL) {
// Make a new word with a single blob.
// The new word is appended to the word list currently being
// iterated; it carries text, so the "already done" test above
// skips it if the iteration reaches it.
new_word = word->shallow_copy();
new_word->set_text(correct_text);
w_it.add_to_end(new_word);
}
// Move the blob out of its source word into the new word.
C_BLOB_IT new_blob_it(new_word->cblob_list());
new_blob_it.add_to_end(blob_it.extract());
}
}
}
}
if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
return new_word != NULL;
}
void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 433 of file tesseractclass.cpp.

{
// Calls ResetAdaptiveClassifierInternal() on every sub-language.
// NOTE(review): other listings on this page have lost lines in extraction;
// confirm in tesseractclass.cpp whether this instance's own classifier is
// also reset here.
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->ResetAdaptiveClassifierInternal();
}
}
void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 441 of file tesseractclass.cpp.

{
// Calls ResetDocumentDictionary() on the dictionary of every sub-language.
// NOTE(review): other listings on this page have lost lines in extraction;
// confirm in tesseractclass.cpp whether this instance's own dictionary is
// also reset here.
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->getDict().ResetDocumentDictionary();
}
}
const FCOORD& tesseract::Tesseract::reskew ( ) const
inline

Definition at line 156 of file tesseractclass.h.

{
// Read-only accessor for the reskew_ member.
return reskew_;
}
bool tesseract::Tesseract::RetryWithLanguage ( WERD_RES word,
BLOCK block,
ROW row,
WordRecognizer  recognizer 
)

Definition at line 756 of file control.cpp.

{
// Re-recognizes a copy of the word with the given recognizer and, if
// NewWordBetter judges the new result better, replaces the word's results
// with it. Returns whether the new result was adopted.
// NOTE(review): the debug conditions guarding the tprintf calls (and at
// least one tprintf argument line) are missing from this listing, leaving
// unbalanced braces; see control.cpp.
tprintf("Retrying word using lang %s, oem %d\n",
lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
}
// Setup a trial WERD_RES in which to classify.
WERD_RES lang_word;
lang_word.InitForRetryRecognition(*word);
// Run the recognizer on the word.
// Initial version is a bit of a hack based on better certainty and rating
// (to reduce false positives from cube) or a dictionary vs non-dictionary
// word.
(this->*recognizer)(block, row, &lang_word);
bool new_is_better = NewWordBetter(*word, lang_word);
if (lang_word.best_choice == NULL) {
tprintf("New result %s better:%s\n",
new_is_better ? "IS" : "NOT");
} else {
tprintf("New result %s better:%s, r=%g, c=%g\n",
new_is_better ? "IS" : "NOT",
lang_word.best_choice->rating(),
lang_word.best_choice->certainty());
}
}
if (new_is_better) {
// Move the trial results into the original word.
word->ConsumeWordResults(&lang_word);
}
return new_is_better;
}
// Read-only accessor for the right_to_left_ member.
bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 213 of file tesseractclass.h.

{
const bool is_right_to_left = right_to_left_;
return is_right_to_left;
}
// Walks every word of page_res and, for words whose best-choice certainty is
// below the combiner threshold, runs cube recognition on a trial copy and
// combines the result back into the word.
void tesseract::Tesseract::run_cube_combiner ( PAGE_RES page_res)

Definition at line 241 of file cube_control.cpp.

{
// Nothing to do without a page or without a combiner.
if (page_res == NULL || tess_cube_combiner_ == NULL) {
return;
}
PAGE_RES_IT word_it(page_res);
// Visit each word result in page order.
for (word_it.restart_page(); word_it.word() != NULL; word_it.forward()) {
WERD_RES* tess_word = word_it.word();
// Words whose certainty already reaches the threshold keep their result;
// only the remaining words are handed to cube.
int certainty_floor = convert_prob_to_tess_certainty(
cube_cntxt_->Params()->CombinerRunThresh());
if (!(tess_word->best_choice->certainty() >= certainty_floor)) {
// Classify with the same language object that produced the word.
Tesseract* word_lang = tess_word->tesseract;
// Scratch WERD_RES that receives the cube classification.
WERD_RES trial_word;
trial_word.InitForRetryRecognition(*tess_word);
CubeObject *recognized = word_lang->cube_recognize_word(
word_it.block()->block, &trial_word);
if (recognized != NULL) {
word_lang->cube_combine_word(recognized, &trial_word, tess_word);
}
delete recognized;
}
}
}
bool tesseract::Tesseract::RunOldFixXht ( WERD_RES word,
BLOCK block,
ROW row 
)
// Returns the dict_word() classification of the word's best choice, except
// that DOC_DAWG_PERM is reported as 0.
inT16 tesseract::Tesseract::safe_dict_word ( const WERD_RES werd_res)

Definition at line 786 of file reject.cpp.

{
const WERD_CHOICE &best = *werd_res->best_choice;
// The lookup goes through the Tesseract instance attached to the word.
int permuter_type = werd_res->tesseract->dict_word(best);
if (permuter_type == DOC_DAWG_PERM) {
return 0;
}
return permuter_type;
}
// Read-only accessor for the scaled_color_ member.
Pix* tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 196 of file tesseractclass.h.

{
Pix* const color_pix = scaled_color_;
return color_pix;
}
// Read-only accessor for the scaled_factor_ member.
int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 199 of file tesseractclass.h.

{
const int factor = scaled_factor_;
return factor;
}
// Recursive search for a way to cut the choices starting at choices_pos into
// pieces whose classifications spell target_text from text_index onward.
// Each candidate piece length is pushed on *segmentation for the duration of
// the attempt; a complete match with the lowest accumulated rating is copied
// to *best_segmentation and its rating to *best_rating.
// NOTE(review): `table` is used below without a visible declaration; the
// line that defines it appears to be missing from the extracted text.
void tesseract::Tesseract::SearchForText ( const GenericVector< BLOB_CHOICE_LIST * > *  choices,
int  choices_pos,
int  choices_length,
const GenericVector< UNICHAR_ID > &  target_text,
int  text_index,
float  rating,
GenericVector< int > *  segmentation,
float *  best_rating,
GenericVector< int > *  best_segmentation 
)

Definition at line 625 of file applybox.cpp.

{
for (int length = 1; length <= choices[choices_pos].size(); ++length) {
// Rating of matching choice or worst choice if no match.
float choice_rating = 0.0f;
// Find the corresponding best BLOB_CHOICE.
BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
choice_it.forward()) {
BLOB_CHOICE* choice = choice_it.data();
choice_rating = choice->rating();
UNICHAR_ID class_id = choice->unichar_id();
// Direct hit on the wanted unichar: stop scanning this choice list.
if (class_id == target_text[text_index]) {
break;
}
// Search ambigs table.
if (class_id < table.size() && table[class_id] != NULL) {
AmbigSpec_IT spec_it(table[class_id]);
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
spec_it.forward()) {
const AmbigSpec *ambig_spec = spec_it.data();
// We'll only do 1-1.
if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
ambig_spec->correct_ngram_id == target_text[text_index])
break;
}
if (!spec_it.cycled_list())
break; // Found an ambig.
}
}
// A full cycle through the list means neither a match nor an ambig.
if (choice_it.cycled_list())
continue; // No match.
segmentation->push_back(length);
if (choices_pos + length == choices_length &&
text_index + 1 == target_text.size()) {
// This is a complete match. If the rating is good record a new best.
if (applybox_debug > 2) {
tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
rating + choice_rating, *best_rating, segmentation->size(),
best_segmentation->size());
}
if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
*best_segmentation = *segmentation;
*best_rating = rating + choice_rating;
}
} else if (choices_pos + length < choices_length &&
text_index + 1 < target_text.size()) {
// Both choices and target text remain: continue with the next character.
if (applybox_debug > 3) {
tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
target_text[text_index],
unicharset.id_to_unichar(target_text[text_index]),
choice_it.data()->unichar_id() == target_text[text_index]
? "Match" : "Ambig",
choices_pos, length);
}
SearchForText(choices, choices_pos + length, choices_length, target_text,
text_index + 1, rating + choice_rating, segmentation,
best_rating, best_segmentation);
if (applybox_debug > 3) {
tprintf("End recursion for %d=%s\n", target_text[text_index],
unicharset.id_to_unichar(target_text[text_index]));
}
}
// Undo the push_back above before trying the next length.
segmentation->truncate(segmentation->size() - 1);
}
}
// Return value as visible in the body: -1 when AutoPageSeg reported a
// negative value, 0 for an empty page, otherwise the AutoPageSeg result
// (0 when AutoPageSeg was not run).
int tesseract::Tesseract::SegmentPage ( const STRING input_file,
BLOCK_LIST *  blocks,
Tesseract osd_tess,
OSResults osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be NULL. On return the blocks list owns all the constructed page layout.

Definition at line 107 of file pagesegmain.cpp.

{
ASSERT_HOST(pix_binary_ != NULL);
int width = pixGetWidth(pix_binary_);
int height = pixGetHeight(pix_binary_);
// Get page segmentation mode.
PageSegMode pageseg_mode = static_cast<PageSegMode>(
static_cast<int>(tessedit_pageseg_mode));
// If a UNLV zone file can be found, use that instead of segmentation.
if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
input_file != NULL && input_file->length() > 0) {
STRING name = *input_file;
// Cut the name at its last '.' before passing it to read_unlv_file.
const char* lastdot = strrchr(name.string(), '.');
if (lastdot != NULL)
name[lastdot - name.string()] = '\0';
read_unlv_file(name, width, height, blocks);
}
if (blocks->empty()) {
// No UNLV file present. Work according to the PageSegMode.
// First make a single block covering the whole image.
BLOCK_IT block_it(blocks);
BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
block_it.add_to_end(block);
} else {
// UNLV file present. Use PSM_SINGLE_BLOCK.
pageseg_mode = PSM_SINGLE_BLOCK;
}
bool single_column = !PSM_COL_FIND_ENABLED(pageseg_mode);
bool osd_enabled = PSM_OSD_ENABLED(pageseg_mode);
bool osd_only = pageseg_mode == PSM_OSD_ONLY;
int auto_page_seg_ret_val = 0;
TO_BLOCK_LIST to_blocks;
if (osd_enabled || PSM_BLOCK_FIND_ENABLED(pageseg_mode)) {
auto_page_seg_ret_val =
AutoPageSeg(single_column, osd_enabled, osd_only,
blocks, &to_blocks, osd_tess, osr);
// In OSD-only mode the AutoPageSeg result is returned as-is, before the
// negative-value check below.
if (osd_only)
return auto_page_seg_ret_val;
// To create blobs from the image region bounds uncomment this line:
// to_blocks.clear(); // Uncomment to go back to the old mode.
} else {
// No automatic segmentation: both skew vectors are set to (1, 0).
deskew_ = FCOORD(1.0f, 0.0f);
reskew_ = FCOORD(1.0f, 0.0f);
if (pageseg_mode == PSM_CIRCLE_WORD) {
// Swap pix_binary_ for the cleaned image only when one was produced.
Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
if (pixcleaned != NULL) {
pixDestroy(&pix_binary_);
pix_binary_ = pixcleaned;
}
}
}
if (auto_page_seg_ret_val < 0) {
return -1;
}
if (blocks->empty()) {
tprintf("Empty page\n");
return 0; // AutoPageSeg found an empty page.
}
textord_.TextordPage(pageseg_mode, width, height, pix_binary_,
blocks, &to_blocks);
return auto_page_seg_ret_val;
}
void tesseract::Tesseract::set_done ( WERD_RES word,
inT16  pass 
)
void tesseract::Tesseract::set_pix_grey ( Pix *  grey_pix)
inline

Definition at line 170 of file tesseractclass.h.

{
pixDestroy(&pix_grey_);
pix_grey_ = grey_pix;
}
void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 187 of file tesseractclass.h.

{
source_resolution_ = ppi;
}
// Selectively un-rejects characters in word_res->reject_map depending on
// suspect_level: level 0 accepts every rejected character, levels >= 3 leave
// the map untouched, and levels 1 and 2 apply the staged rules below.
// NOTE(review): several conditions in this listing are cut off mid-expression
// (dangling "&&" and stray "))" lines), so parts of the original logic appear
// to be missing from the extracted text. The signature names the parameter
// `word` while the body uses `word_res`.
void tesseract::Tesseract::set_unlv_suspects ( WERD_RES word)

Definition at line 371 of file output.cpp.

{
int len = word_res->reject_map.length();
const WERD_CHOICE &word = *(word_res->best_choice);
const UNICHARSET &uchset = *word.unicharset();
int i;
float rating_per_ch;
// Level 0: accept everything that was rejected and stop.
if (suspect_level == 0) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected())
word_res->reject_map[i].setrej_minimal_rej_accept();
}
return;
}
if (suspect_level >= 3)
return; //Use defaults
/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
if (safe_dict_word(word_res) &&
/* Unreject alphas in dictionary words */
for (i = 0; i < len; ++i) {
if (word_res->reject_map[i].rejected() &&
uchset.get_isalpha(word.unichar_id(i)))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
// Average rating per reject_map entry; poor averages end processing here.
rating_per_ch = word.rating() / word_res->reject_map.length();
if (rating_per_ch >= suspect_rating_per_ch)
return; //Dont touch bad ratings
if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
for (i = 0; i < len; ++i) {
if (word_res->reject_map[i].rejected() &&
(!uchset.eq(word.unichar_id(i), " ")))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
// Un-reject characters flagged with document, block or row rejection.
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
if (word_res->reject_map[i].flag(R_DOC_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
if (word_res->reject_map[i].flag(R_BLOCK_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
if (word_res->reject_map[i].flag(R_ROW_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
// Level 2 stops here; the remaining rules run for level 1 only.
if (suspect_level == 2)
return;
(word_res->reject_map.length() <= suspect_short_words)) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
word_res->reject_map[i].flag(R_POSTNN_1IL)))
word_res->reject_map[i].setrej_minimal_rej_accept();
word_res->reject_map[i].flag(R_MM_REJECT))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
if (acceptable_word_string(*word_res->uch_set,
word.unichar_lengths().string()) !=
word.unichar_lengths().string())) {
// Words longer than suspect_short_words: accept rejected characters that
// are not permanently rejected or that carry one of the listed flags.
if (word_res->reject_map.length() > suspect_short_words) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected() &&
(!word_res->reject_map[i].perm_rejected() ||
word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
word_res->reject_map[i].flag (R_POSTNN_1IL) ||
word_res->reject_map[i].flag (R_MM_REJECT))) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}
}
// Votes over the font ids attached to the blob choices that match the word's
// best choice, then stores the two modal fonts and italic/bold scores on the
// word.
// NOTE(review): this listing has unmatched braces, tprintf calls without
// their argument lines and an empty loop body, so guard conditions and some
// statements appear to be missing from the extracted text.
void tesseract::Tesseract::set_word_fonts ( WERD_RES word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

set_word_fonts

Get the fonts for the word.

Definition at line 1500 of file control.cpp.

{
if (blob_choices == NULL) return;
// Don't try to set the word fonts for a cube word, as the configs
// will be meaningless.
if (word->chopped_word == NULL) return;
inT32 index; // char id index
// character iterator
BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
BLOB_CHOICE_IT choice_it; // choice iterator
int fontinfo_size = get_fontinfo_table().size();
int fontset_size = get_fontset_table().size();
// Without font tables there is nothing to vote on.
if (fontinfo_size == 0 || fontset_size == 0) return;
STATS fonts(0, fontinfo_size); // font counters
word->italic = 0;
word->bold = 0;
}
// Compute the modal font for the word
for (char_it.mark_cycle_pt(), index = 0;
!char_it.cycled_list(); ++index, char_it.forward()) {
UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
choice_it.set_to_list(char_it.data());
tprintf("Examining fonts in %s\n",
}
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
choice_it.forward()) {
UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
// Only the choice that agrees with the word's best choice gets to vote.
if (blob_ch_id == word_ch_id) {
tprintf("%s font %s (%d) font2 %s (%d)\n",
word->uch_set->id_to_unichar(blob_ch_id),
choice_it.data()->fontinfo_id() < 0 ? "unknown" :
fontinfo_table_.get(choice_it.data()->fontinfo_id()).name,
choice_it.data()->fontinfo_id(),
choice_it.data()->fontinfo_id2() < 0 ? "unknown" :
fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name,
choice_it.data()->fontinfo_id2());
}
// 1st choice font gets 2 pts, 2nd choice 1 pt.
if (choice_it.data()->fontinfo_id() >= 0) {
fonts.add(choice_it.data()->fontinfo_id(), 2);
}
if (choice_it.data()->fontinfo_id2() >= 0) {
fonts.add(choice_it.data()->fontinfo_id2(), 1);
}
break;
}
}
}
// find_modal_font is called twice on the same STATS, first for the primary
// font and then for the second font.
inT16 font_id1, font_id2;
find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count);
find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count);
word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
// All the blobs get the word's best choice font.
for (int i = 0; i < word->best_choice->length(); ++i) {
}
if (word->fontinfo_id_count > 0) {
FontInfo fi = fontinfo_table_.get(font_id1);
if (word->fontinfo_id2_count > 0) {
tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
} else {
tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
fi.name, word->fontinfo_id_count);
}
}
// 1st choices got 2 pts, so we need to halve the score for the mode.
word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
}
}
// Calls set_black_and_whitelist() on the unicharset of every sub_langs_
// entry.
// NOTE(review): the call below is cut off after its opening parenthesis, so
// its arguments (and possibly other statements) are missing from the
// extracted text.
void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 448 of file tesseractclass.cpp.

{
// Set the white and blacklists (if any)
// Black and white lists should apply to all loaded classifiers.
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->unicharset.set_black_and_whitelist(
}
}
// Stores detector in equ_detect_ and registers this object with it through
// SetLangTesseract(). detector is dereferenced unconditionally.
void tesseract::Tesseract::SetEquationDetect ( EquationDetect detector)

Definition at line 427 of file tesseractclass.cpp.

{
this->equ_detect_ = detector;
// The stored pointer is used for the back-registration call.
this->equ_detect_->SetLangTesseract(this);
}
void tesseract::Tesseract::SetScaledColor ( int  factor,
Pix *  color 
)
inline

Definition at line 202 of file tesseractclass.h.

{
scaled_factor_ = factor;
scaled_color_ = color;
}
// Prepares block_list for box application: sets each row's x-height to the
// median x-height, deletes words without blobs, clears the fuzzy-space flags
// on the remaining words, then builds a PAGE_RES and runs MaximallyChopWord
// on every word. Returns the newly allocated PAGE_RES.
// NOTE(review): the brace structure of the row loop below does not balance
// (the word loop reads `row` after the row loop has apparently closed), so a
// guard line appears to be missing from the extracted text.
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const GenericVector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Definition at line 197 of file applybox.cpp.

{
double median_xheight = MedianXHeight(block_list);
// Largest accepted distance of a row x-height from the median.
double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
// Strip all fuzzy space markers to simplify the PAGE_RES.
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
ROW* row = r_it.data();
float diff = fabs(row->x_height() - median_xheight);
if (diff > max_deviation) {
tprintf("row xheight=%g, but median xheight = %g\n",
row->x_height(), median_xheight);
}
row->set_x_height(static_cast<float>(median_xheight));
}
WERD_IT w_it(row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
// Words with no blobs are removed from the list and freed.
if (word->cblob_list()->empty()) {
delete w_it.extract();
} else {
word->set_flag(W_FUZZY_SP, false);
word->set_flag(W_FUZZY_NON, false);
}
}
}
}
PAGE_RES* page_res = new PAGE_RES(block_list, NULL);
PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res;
// Chop every word of the freshly built PAGE_RES.
while ((word_res = pr_it.word()) != NULL) {
MaximallyChopWord(boxes, pr_it.block()->block,
pr_it.row()->row, word_res);
pr_it.forward();
}
return page_res;
}
// Returns the new ColumnFinder, or NULL when the block's line_size is below 2
// or when only_osd is set and OSD was run.
// NOTE(review): the listing has a stray closing brace after the first
// pixWrite and a FindAndRemoveLines call with fewer argument lines than its
// layout suggests, so guard/argument lines appear to be missing from the
// extracted text. They are reproduced here exactly as extracted.
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( bool  single_column,
bool  osd,
bool  only_osd,
BLOCK_LIST *  blocks,
Tesseract osd_tess,
OSResults osr,
TO_BLOCK_LIST *  to_blocks,
Pix **  photo_mask_pix,
Pix **  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a NULL pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a unlv zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 281 of file pagesegmain.cpp.

{
int vertical_x = 0;
int vertical_y = 1;
TabVector_LIST v_lines;
TabVector_LIST h_lines;
ICOORD bleft(0, 0);
ASSERT_HOST(pix_binary_ != NULL);
pixWrite("tessinput.png", pix_binary_, IFF_PNG);
}
// Leptonica is used to find the rule/separator lines in the input.
LineFinder::FindAndRemoveLines(source_resolution_,
&vertical_x, &vertical_y, music_mask_pix,
&v_lines, &h_lines);
pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
// Leptonica is used to find a mask of the photo regions in the input.
*photo_mask_pix = ImageFind::FindImages(pix_binary_);
pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
// In single-column mode the vertical lines found above are discarded.
if (single_column)
v_lines.clear();
// The rest of the algorithm uses the usual connected components.
textord_.find_components(pix_binary_, blocks, to_blocks);
TO_BLOCK_IT to_block_it(to_blocks);
// There must be exactly one input block.
// TODO(rays) handle new textline finding with a UNLV zone file.
ASSERT_HOST(to_blocks->singleton());
TO_BLOCK* to_block = to_block_it.data();
TBOX blkbox = to_block->block->bounding_box();
ColumnFinder* finder = NULL;
// A finder is only built when the block's line_size is at least 2.
if (to_block->line_size >= 2) {
finder = new ColumnFinder(static_cast<int>(to_block->line_size),
blkbox.botleft(), blkbox.topright(),
source_resolution_,
&v_lines, &h_lines, vertical_x, vertical_y);
finder->SetupAndFilterNoise(*photo_mask_pix, to_block);
if (equ_detect_) {
equ_detect_->LabelSpecialText(to_block);
}
BLOBNBOX_CLIST osd_blobs;
// osd_orientation is the number of 90 degree rotations to make the
// characters upright. (See osdetect.h for precise definition.)
// We want the text lines horizontal, (vertical text indicates vertical
// textlines) which may conflict (eg vertically written CJK).
int osd_orientation = 0;
bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
if (osd && osd_tess != NULL && osr != NULL) {
os_detect_blobs(&osd_blobs, osr, osd_tess);
// OSD-only callers need just the results in *osr, not a finder.
if (only_osd) {
delete finder;
return NULL;
}
osd_orientation = osr->best_result.orientation_id;
double osd_score = osr->orientations[osd_orientation];
// Start from twice the minimum margin and shrink it to the smallest gap
// between the best orientation score and any other orientation score.
double osd_margin = min_orientation_margin * 2;
for (int i = 0; i < 4; ++i) {
if (i != osd_orientation &&
osd_score - osr->orientations[i] < osd_margin) {
osd_margin = osd_score - osr->orientations[i];
}
}
if (osd_margin < min_orientation_margin) {
// The margin is weak.
int best_script_id = osr->best_result.script_id;
bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) ||
(best_script_id == osd_tess->unicharset.hiragana_sid()) ||
(best_script_id == osd_tess->unicharset.katakana_sid());
if (!cjk && !vertical_text && osd_orientation == 2) {
// upside down latin text is improbable with such a weak margin.
tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
"Don't rotate.\n", osd_margin);
osd_orientation = 0;
} else {
// Arguments follow the format order: margin (%.2f), blob count (%d),
// orientation (%d). Passing the blob count to %.2f and the margin to
// %d is a type mismatch with the format string.
tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
"but using orientation anyway: %d\n",
osd_margin, osd_blobs.length(), osd_orientation);
}
}
}
// shallow_clear() is the list's own call for emptying it; whether the blobs
// are owned elsewhere is not visible here.
osd_blobs.shallow_clear();
finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
}
return finder;
}
// Builds one combined font table from this object's fontinfo table and those
// of every sub_langs_ entry, assigns ids from it back to each table, and
// records the combined size in font_table_size_.
// NOTE(review): `all_fonts` is used without a visible declaration; the line
// that declares it appears to be missing from the extracted text.
void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 399 of file tessedit.cpp.

{
// Note that we can get away with bitwise copying FontInfo in
// all_fonts, as it is a temporary structure and we avoid setting the
// delete callback.
// Create the universal ID table.
CollectFonts(get_fontinfo_table(), &all_fonts);
for (int i = 0; i < sub_langs_.size(); ++i) {
CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
}
// Assign ids from the table to each font table.
AssignIds(all_fonts, &get_fontinfo_table());
for (int i = 0; i < sub_langs_.size(); ++i) {
AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
}
font_table_size_ = all_fonts.size();
}
void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)
// Read-only accessor for the source_resolution_ member.
int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 184 of file tesseractclass.h.

{
const int resolution = source_resolution_;
return resolution;
}
// Splits word->chopped_word at its widest gap between consecutive blobs,
// recognizes the two halves separately with recog_word_recursive, then
// splices blobs, seams, states, choices and blamer information of the second
// half back onto `word`.
// NOTE(review): this listing contains orphaned argument lines, calls without
// their closing arguments, unmatched braces and a variable `irr` with no
// visible declaration, so a number of lines appear to be missing from the
// extracted text.
void tesseract::Tesseract::split_and_recog_word ( WERD_RES word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 177 of file tfacepp.cpp.

{
// Find the biggest blob gap in the chopped_word.
int bestgap = -MAX_INT32;
TPOINT best_split_pt;
TBLOB* best_end = NULL;
TBLOB* prev_blob = NULL;
for (TBLOB* blob = word->chopped_word->blobs; blob != NULL;
blob = blob->next) {
if (prev_blob != NULL) {
TBOX prev_box = prev_blob->bounding_box();
TBOX blob_box = blob->bounding_box();
int gap = blob_box.left() - prev_box.right();
if (gap > bestgap) {
bestgap = gap;
best_end = prev_blob;
// Split point: x midway across the gap, y the mean of the top and bottom
// edges of both boxes.
best_split_pt.x = (prev_box.right() + blob_box.left()) / 2;
best_split_pt.y = (prev_box.top() + prev_box.bottom() +
blob_box.top() + blob_box.bottom()) / 4;
}
}
prev_blob = blob;
}
// A split needs at least two blobs.
ASSERT_HOST(best_end != NULL);
ASSERT_HOST(best_end->next != NULL);
// Make a copy of the word to put the 2nd half in.
WERD_RES* word2 = new WERD_RES(*word);
// Blow away the copied chopped_word, as we want to work with the blobs
// from the input chopped_word so the seam_arrays can be merged.
delete word2->chopped_word;
word2->chopped_word = new TWERD;
word2->chopped_word->blobs = best_end->next;
// Terminate the first half's blob list at the split.
best_end->next = NULL;
// Make a new seamarray on both words.
BlamerBundle *orig_bb = word->blamer_bundle;
STRING blamer_debug;
// Try to adjust truth information.
if (orig_bb != NULL) {
// Find truth boxes that correspond to the split in the blobs.
int b;
int begin2_truth_index = -1;
int end1_x = best_end->bounding_box().right();
int begin2_x = word2->chopped_word->blobs->bounding_box().left();
blamer_debug = "Looking for truth split at";
blamer_debug.add_str_int(" end1_x ", end1_x);
blamer_debug.add_str_int(" begin2_x ", begin2_x);
blamer_debug += "\nnorm_truth_word boxes:\n";
if (orig_bb->norm_truth_word.length() > 1) {
orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug);
for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) {
orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug);
// The truth split is between boxes b-1 and b when both edges lie within
// norm_box_tolerance of the blob split.
if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) <
orig_bb->norm_box_tolerance) &&
(abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) <
orig_bb->norm_box_tolerance)) {
begin2_truth_index = b;
blamer_debug += "Split found\n";
break;
}
}
}
}
// Populate truth information in word and word2 with the first and second
// part of the original truth.
word->blamer_bundle = new BlamerBundle();
word2->blamer_bundle = new BlamerBundle();
if (begin2_truth_index > 0) {
BlamerBundle *curr_bb = word->blamer_bundle;
for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) {
// From the split index onward the truth goes to the second word.
if (b == begin2_truth_index) curr_bb = word2->blamer_bundle;
b, orig_bb->norm_truth_word.BlobBox(b));
curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b));
curr_bb->truth_text.push_back(orig_bb->truth_text[b]);
}
} else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) {
} else {
blamer_debug += "Truth split not found";
blamer_debug += orig_bb->truth_has_char_boxes ?
"\n" : " (no truth char boxes)\n";
word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
}
}
// Recognize the first part of the word.
recog_word_recursive(word, blob_choices);
// Recognize the second part of the word.
recog_word_recursive(word2, blob_choices);
// Tack the word2 outputs onto the end of the word outputs.
// New blobs might have appeared on the end of word1.
for (best_end = word->chopped_word->blobs; best_end->next != NULL;
best_end = best_end->next);
best_end->next = word2->chopped_word->blobs;
TBLOB* blob;
for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next);
blob->next = word2->rebuild_word->blobs;
// The blobs are now reachable from `word`; clear word2's list heads.
word2->chopped_word->blobs = NULL;
word2->rebuild_word->blobs = NULL;
// Copy the seams onto the end of the word1 seam_array.
// Since the seam list is one element short, an empty seam marking the
// end of the last blob in the first word is needed first.
new_seam(0.0, best_split_pt, NULL, NULL, NULL));
for (int i = 0; i < array_count(word2->seam_array); ++i) {
SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i));
// The slot in word2's array is cleared once the seam has been read out.
array_value(word2->seam_array, i) = NULL;
word->seam_array = add_seam(word->seam_array, seam);
}
word->best_state += word2->best_state;
// Append the word choices.
*word->best_choice += *word2->best_choice;
*word->raw_choice += *word2->raw_choice;
// How many alt choices from each should we try to get?
const int kAltsPerPiece = 2;
// When do we start throwing away extra alt choices?
const int kTooManyAltChoices = 100;
if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) {
// Construct the cartesian product of the alt choices of word(1) and word2.
int num_first_alt_choices = word->alt_choices.size();
// Nota Bene: For the main loop here, we leave in place word1-only
// alt_choices in
// word->alt_choices[0] .. word_alt_choices[num_first_alt_choices - 1]
// These will get fused with the best choices for word2 below.
for (int j = 1; j < word2->alt_choices.size() &&
(j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices);
j++) {
for (int i = 0; i < num_first_alt_choices &&
(i <= kAltsPerPiece ||
word->alt_choices.size() < kTooManyAltChoices);
i++) {
WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]);
*wc += *word2->alt_choices[j];
GenericVector<int> &alt_state = word->alt_states.back();
alt_state += word->alt_states[i];
alt_state += word2->alt_states[j];
}
}
// Now that we've filled in as many alternates as we want, paste the best
// choice for word2 onto the original word alt_choices.
for (int i = 0; i < num_first_alt_choices; i++) {
*word->alt_choices[i] += *word2->alt_choices[0];
word->alt_states[i] += word2->alt_states[0];
}
}
// Restore the pointer to original blamer bundle and combine blamer
// information recorded in the splits.
if (orig_bb != NULL) {
if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = "";
blamer_debug += "Blame from part 1: ";
blamer_debug += word->blamer_bundle->debug;
}
blamer_debug += "Blame from part 2: ";
blamer_debug += word2->blamer_bundle->debug;
if (irr == IRR_CORRECT) {
} else if (irr != word2->blamer_bundle->incorrect_result_reason) {
irr = IRR_UNKNOWN;
}
}
// The bundle allocated for the first half is dropped and the original one
// put back on the word.
delete word->blamer_bundle;
word->blamer_bundle = orig_bb;
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
word->blamer_bundle->SetBlame(irr, blamer_debug, NULL,
}
}
delete word2;
}
// Decides whether `word` should be crunched as terrible. crunch_mode records
// which rule fired (0 = none); the function returns TRUE for any non-zero
// mode and FALSE otherwise.
// NOTE(review): the first condition and the crunch_mode = 4 branch are cut
// off in this listing, so parts of those expressions appear to be missing
// from the extracted text.
BOOL8 tesseract::Tesseract::terrible_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 510 of file docqual.cpp.

{
float rating_per_ch;
int adjusted_len;
int crunch_mode = 0;
if ((word->best_choice->unichar_string().length () == 0) ||
(strspn (word->best_choice->unichar_string().string(), " ") ==
crunch_mode = 1;
else {
// The length used for the per-character rating is capped at
// crunch_rating_max.
adjusted_len = word->reject_map.length ();
if (adjusted_len > crunch_rating_max)
adjusted_len = crunch_rating_max;
rating_per_ch = word->best_choice->rating () / adjusted_len;
if (rating_per_ch > crunch_terrible_rating)
crunch_mode = 2;
else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
crunch_mode = 3;
(garbage_level != G_OK))
crunch_mode = 4;
else if ((rating_per_ch > crunch_poor_garbage_rate) &&
(garbage_level != G_OK))
crunch_mode = 5;
}
if (crunch_mode > 0) {
if (crunch_debug > 2) {
tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
crunch_mode, word->best_choice->unichar_string().string());
}
return TRUE;
}
else
return FALSE;
}
// Forwards the word choice to the Dict's AcceptableResult() and returns its
// answer. raw_choice is not read by this body.
BOOL8 tesseract::Tesseract::tess_acceptable_word ( WERD_CHOICE word_choice,
WERD_CHOICE raw_choice 
)

Definition at line 102 of file tessbox.cpp.

{
// before context
WERD_CHOICE& candidate = *word_choice;
return getDict().AcceptableResult(candidate);
}
// Passes the word choice to the Dict's add_document_word().
void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE word_choice)

Definition at line 114 of file tessbox.cpp.

{
WERD_CHOICE& accepted_word = *word_choice;
getDict().add_document_word(accepted_word);
}
void tesseract::Tesseract::tess_segment_pass1 ( WERD_RES word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 42 of file tessbox.cpp.

{
int saved_enable_assoc = 0;
int saved_chop_enable = 0;
if (word->word->flag(W_DONT_CHOP)) {
saved_enable_assoc = wordrec_enable_assoc;
saved_chop_enable = chop_enable;
wordrec_enable_assoc.set_value(0);
chop_enable.set_value(0);
if (word->word->flag(W_REP_CHAR))
getDict().permute_only_top.set_value(true);
}
recog_word(word, blob_choices);
if (word->word->flag(W_DONT_CHOP)) {
wordrec_enable_assoc.set_value(saved_enable_assoc);
chop_enable.set_value(saved_chop_enable);
getDict().permute_only_top.set_value(false);
}
}
void tesseract::Tesseract::tess_segment_pass2 ( WERD_RES word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 73 of file tessbox.cpp.

{
int saved_enable_assoc = 0;
int saved_chop_enable = 0;
if (word->word->flag(W_DONT_CHOP)) {
saved_enable_assoc = wordrec_enable_assoc;
saved_chop_enable = chop_enable;
wordrec_enable_assoc.set_value(0);
chop_enable.set_value(0);
if (word->word->flag(W_REP_CHAR))
getDict().permute_only_top.set_value(true);
}
recog_word(word, blob_choices);
if (word->word->flag(W_DONT_CHOP)) {
wordrec_enable_assoc.set_value(saved_enable_assoc);
chop_enable.set_value(saved_chop_enable);
getDict().permute_only_top.set_value(false);
}
}
// Returns the `ambig` flag for the word. It starts out FALSE and is only
// considered for update when the best choice came from the system, frequent
// or user dawg permuter.
// NOTE(review): the statement inside the if is cut off in this listing (only
// its trailing argument line remains), so the call that would set `ambig` is
// missing from the extracted text.
BOOL8 tesseract::Tesseract::test_ambig_word ( WERD_RES word)

Definition at line 687 of file reject.cpp.

{
BOOL8 ambig = FALSE;
if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
(word->best_choice->permuter () == FREQ_DAWG_PERM) ||
(word->best_choice->permuter () == USER_DAWG_PERM)) {
word->best_choice, NULL, false, NULL, NULL);
}
return ambig;
}
// Read-only accessor: hands back a const reference to the textord_ member.
const Textord& tesseract::Tesseract::textord ( ) const
inline

Definition at line 206 of file tesseractclass.h.

{
const Textord& text_orderer = textord_;
return text_orderer;
}
// Finishes box application on page_res: words with at least one labelled
// blob (non-empty correct_text entry) get a fake best_choice and are kept,
// words with none are deleted. The kept words are then rebuilt and their
// BOL/EOL flags set, and a summary is printed when applybox_debug > 0.
// NOTE(review): `char_choices` is used without a visible declaration; the
// line that declares it appears to be missing from the extracted text and is
// not re-created here.
void tesseract::Tesseract::TidyUp ( PAGE_RES page_res)

Definition at line 702 of file applybox.cpp.

{
int ok_blob_count = 0;
int bad_blob_count = 0;
int ok_word_count = 0;
int unlabelled_words = 0;
PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res;
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
// Number of blobs in this word that carry a non-empty label.
int ok_in_word = 0;
for (int i = word_res->correct_text.size() - 1; i >= 0; i--) {
if (word_res->correct_text[i].length() > 0) {
++ok_in_word;
}
// Since we only need a fake word_res->best_choice, the actual
// unichar_ids do not matter. Which is fortunate, since TidyUp()
// can be called while training Tesseract, at the stage where
// unicharset is not meaningful yet.
char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0);
}
if (ok_in_word > 0) {
// Count the kept word so the summary below reports a real word total
// instead of the initial 0.
++ok_word_count;
ok_blob_count += ok_in_word;
bad_blob_count += word_res->correct_text.size() - ok_in_word;
MakeWordChoice(char_choices, unicharset, word_res->best_choice);
} else {
++unlabelled_words;
if (applybox_debug > 0) {
tprintf("APPLY_BOXES: Unlabelled word at :");
word_res->word->bounding_box().print();
}
pr_it.DeleteCurrentWord();
}
char_choices.delete_data_pointers();
}
// Second pass over the words that survived the first one.
pr_it.restart_page();
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
// Denormalize back to a BoxWord.
word_res->RebuildBestState();
word_res->SetupBoxWord();
// A word begins/ends a line when the neighbouring word is on another row.
word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
}
if (applybox_debug > 0) {
tprintf(" Found %d good blobs.\n", ok_blob_count);
if (bad_blob_count > 0) {
tprintf(" Leaving %d unlabelled blobs in %d words.\n",
bad_blob_count, ok_word_count);
}
if (unlabelled_words > 0)
tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
}
}
// Walks all words of text blocks and tracks runs of crunch candidates: a
// word with any accepted character resets the run state; otherwise the word
// is graded with garbage_word() and checked with terrible_word_crunch(), and
// `copy_it` remembers where a run of potential crunches started.
// NOTE(review): the tprintf calls and one else-if condition are cut off in
// this listing, and no statement that marks a word as crunched is visible,
// so several lines appear to be missing from the extracted text.
void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT page_res_it)

Definition at line 424 of file docqual.cpp.

{
WERD_RES *word;
GARBAGE_LEVEL garbage_level;
PAGE_RES_IT copy_it;
BOOL8 prev_potential_marked = FALSE;
BOOL8 found_terrible_word = FALSE;
BOOL8 ok_dict_word;
page_res_it.restart_page();
while (page_res_it.word() != NULL) {
// Words in blocks whose poly_block is not text are skipped.
POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
if (pb != NULL && !pb->IsText()) {
page_res_it.forward();
continue;
}
word = page_res_it.word();
if (word->reject_map.accept_count () != 0) {
found_terrible_word = FALSE;
//Forget earlier potential crunches
prev_potential_marked = FALSE;
}
else {
ok_dict_word = safe_dict_word(word);
garbage_level = garbage_word (word, ok_dict_word);
if ((garbage_level != G_NEVER_CRUNCH) &&
(terrible_word_crunch (word, garbage_level))) {
if (crunch_debug > 0) {
tprintf ("T CRUNCHING: \"%s\"\n",
}
// A terrible word also claims the potential crunches remembered before it:
// walk copy_it forward from the remembered start up to this word.
if (prev_potential_marked) {
while (copy_it.word () != word) {
if (crunch_debug > 0) {
tprintf ("P1 CRUNCHING: \"%s\"\n",
}
copy_it.forward ();
}
prev_potential_marked = FALSE;
}
found_terrible_word = TRUE;
}
else if ((garbage_level != G_NEVER_CRUNCH) &&
garbage_level, ok_dict_word))) {
if (found_terrible_word) {
if (crunch_debug > 0) {
tprintf ("P2 CRUNCHING: \"%s\"\n",
}
}
else if (!prev_potential_marked) {
// Remember where this run of potential crunches begins.
copy_it = page_res_it;
prev_potential_marked = TRUE;
if (crunch_debug > 1) {
tprintf ("P3 CRUNCHING: \"%s\"\n",
}
}
}
else {
found_terrible_word = FALSE;
//Forget earlier potential crunches
prev_potential_marked = FALSE;
if (crunch_debug > 2) {
tprintf ("NO CRUNCH: \"%s\"\n",
}
}
}
page_res_it.forward ();
}
}
void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT page_res_it)

Definition at line 596 of file docqual.cpp.

{
// Walks every word on the page and asks word_deletable() whether it may be
// deleted. Deletable words are handled by position on the line:
//   - at beginning-of-line (W_BOL), or directly following such a run:
//     unlv_crunch_mode is set to the returned mode and the run continues;
//   - at end-of-line (W_EOL): if an earlier delete point was marked, every
//     word from that point up to this one is re-queried and marked, then this
//     word is marked and both flags are cleared;
//   - otherwise: the first such word is remembered in copy_it.
// A word that is not deletable (CR_NONE) clears both flags.
// NOTE(review): the tprintf() calls in this listing are missing their final
// argument; see docqual.cpp for the complete statements.
WERD_RES *word;
PAGE_RES_IT copy_it;
BOOL8 deleting_from_bol = FALSE;
BOOL8 marked_delete_point = FALSE;
// debug_delete_mode / x_debug_delete_mode are passed to word_deletable() and
// then printed, so they presumably receive a reason code - confirm against the
// real signature (the listing shows the parameter as a plain inT16).
inT16 debug_delete_mode;
CRUNCH_MODE delete_mode;
inT16 x_debug_delete_mode;
CRUNCH_MODE x_delete_mode;
page_res_it.restart_page();
while (page_res_it.word() != NULL) {
word = page_res_it.word();
delete_mode = word_deletable (word, debug_delete_mode);
if (delete_mode != CR_NONE) {
if (word->word->flag (W_BOL) || deleting_from_bol) {
if (crunch_debug > 0) {
tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
debug_delete_mode,
}
word->unlv_crunch_mode = delete_mode;
deleting_from_bol = TRUE;
} else if (word->word->flag(W_EOL)) {
if (marked_delete_point) {
// Mark every word between the remembered delete point and this word, each
// with the mode word_deletable() returns for it.
while (copy_it.word() != word) {
x_delete_mode = word_deletable (copy_it.word (),
x_debug_delete_mode);
if (crunch_debug > 0) {
tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
x_debug_delete_mode,
}
copy_it.word ()->unlv_crunch_mode = x_delete_mode;
copy_it.forward ();
}
}
if (crunch_debug > 0) {
tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
debug_delete_mode,
}
word->unlv_crunch_mode = delete_mode;
deleting_from_bol = FALSE;
marked_delete_point = FALSE;
}
else {
// Mid-line deletable word: remember only the first one of the run.
if (!marked_delete_point) {
copy_it = page_res_it;
marked_delete_point = TRUE;
}
}
}
else {
deleting_from_bol = FALSE;
//Forget earlier potential crunches
marked_delete_point = FALSE;
}
/*
The following step has been left till now as the tess fails are used to
determine if the word is deletable.
*/
page_res_it.forward ();
}
}
bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES word,
BLOCK block,
ROW row 
)

Definition at line 976 of file control.cpp.

{
// Tries to improve a word by re-running match_word_pass2() on a copy that uses
// the x-height returned by ComputeCompatibleXheight().
// Returns true only when the new result was accepted and moved into `word`
// via ConsumeWordResults(); returns false when the word has no misfit tops,
// when no positive x-height is proposed, when recognition of the copy fails,
// or when the acceptance test below is not met.
// NOTE(review): this listing appears to be missing lines (see the tprintf
// below); confirm against control.cpp.
bool accept_new_x_ht = false;
int original_misfits = CountMisfitTops(word);
if (original_misfits == 0)
return false;
float new_x_ht = ComputeCompatibleXheight(word);
if (new_x_ht > 0.0f) {
WERD_RES new_x_ht_word(word->word);
// Carry the truth information over to the trial word when present.
if (word->blamer_bundle != NULL) {
new_x_ht_word.blamer_bundle = new BlamerBundle();
new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
}
new_x_ht_word.x_height = new_x_ht;
new_x_ht_word.caps_height = 0.0;
match_word_pass2(&new_x_ht_word, row, block);
if (!new_x_ht_word.tess_failed) {
int new_misfits = CountMisfitTops(&new_x_ht_word);
if (debug_x_ht_level >= 1) {
tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
original_misfits, word->x_height,
new_misfits, new_x_ht);
// NOTE(review): the format string has four conversions but only the two
// "new" values are visible; the "old" arguments seem to have been dropped
// from this listing.
tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
new_x_ht_word.best_choice->rating(),
new_x_ht_word.best_choice->certainty());
}
// The misfits must improve and either the rating or certainty.
accept_new_x_ht = new_misfits < original_misfits &&
(new_x_ht_word.best_choice->certainty() >
word->best_choice->certainty() ||
new_x_ht_word.best_choice->rating() <
word->best_choice->rating());
if (debug_x_ht_level >= 1) {
ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
}
}
if (accept_new_x_ht) {
word->ConsumeWordResults(&new_x_ht_word);
return true;
}
}
return false;
}
BOOL8 tesseract::Tesseract::uniformly_spaced ( WERD_RES word)

Definition at line 515 of file fixspace.cpp.

{
// Decides whether the gaps between the blobs of `word` are uniform enough.
// Gaps (left edge of a blob minus right edge of the previous blob) are
// gathered for blob pairs that pass the punctuation filter below. The largest
// gap is tracked separately in max_gap / max_gap_count; all smaller gaps go
// into gap_stats. The result is TRUE when any of these holds:
//   - gap_stats is empty (no gap smaller than the maximum was recorded);
//   - max_gap does not exceed the row-derived limit normalised_max_nonspace;
//   - more than 2 gaps recorded and max_gap <= 2 * median of them;
//   - at most 2 gaps recorded and max_gap <= 2 * mean of them.
// NOTE(review): the punctuation test and both debug tprintf() calls are
// truncated in this listing; confirm against fixspace.cpp.
TBOX box;
inT16 prev_right = -MAX_INT16;
inT16 gap;
inT16 max_gap = -MAX_INT16;
inT16 max_gap_count = 0;
STATS gap_stats(0, MAXSPACING);
BOOL8 result;
const ROW *row = word->denorm.row();
float max_non_space;
float normalised_max_nonspace;
// i counts blobs/unichars; offset is the byte position of unichar i within
// best_choice->unichar_string() (advanced by unichar_lengths()[i]).
inT16 i = 0;
inT16 offset = 0;
STRING punct_chars = "\"`',.:;";
for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL;
blob = blob->next) {
box = blob->bounding_box();
// Skip the first blob (no previous right edge yet) and pairs rejected by the
// punct_chars checks. The first check is cut off in this listing; from the
// index expression it presumably looks at the previous unichar - confirm.
if ((prev_right > -MAX_INT16) &&
(!punct_chars.contains(
[offset - word->best_choice->unichar_lengths()[i - 1]]) &&
!punct_chars.contains(
word->best_choice->unichar_string()[offset]))) {
gap = box.left() - prev_right;
if (gap < max_gap) {
gap_stats.add(gap, 1);
} else if (gap == max_gap) {
max_gap_count++;
} else {
// New maximum: the previous maximum becomes an ordinary gap.
if (max_gap_count > 0)
gap_stats.add(max_gap, max_gap_count);
max_gap = gap;
max_gap_count = 1;
}
}
prev_right = box.right();
offset += word->best_choice->unichar_lengths()[i++];
}
// Limit is a weighted mix of the row's space and kern sizes, rescaled by
// kBlnXHeight / row x-height.
max_non_space = (row->space() + 3 * row->kern()) / 4;
normalised_max_nonspace = max_non_space * kBlnXHeight / row->x_height();
result = (
gap_stats.get_total() == 0 ||
max_gap <= normalised_max_nonspace ||
(gap_stats.get_total() > 2 && max_gap <= 2 * gap_stats.median()) ||
(gap_stats.get_total() <= 2 && max_gap <= 2 * gap_stats.mean()));
#ifndef SECURE_NAMES
if ((debug_fix_space_level > 1)) {
if (result) {
// NOTE(review): the call that consumes these arguments is missing from the
// listing (presumably a tprintf).
"ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
"total=%d mean=%f median=%f\n",
word->best_choice->unichar_string().string(), normalised_max_nonspace,
max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
gap_stats.median());
} else {
"REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
"total=%d mean=%f median=%f\n",
word->best_choice->unichar_string().string(), normalised_max_nonspace,
max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
gap_stats.median());
}
}
#endif
return result;
}
void tesseract::Tesseract::unrej_good_chs ( WERD_RES word,
ROW row 
)

Definition at line 120 of file docqual.cpp.

{
// Does nothing unless the word has bln_boxes, a rebuild_word and at least one
// blob; otherwise builds a DocQualCallbacks helper for the word.
// NOTE(review): the call that uses `cb` is truncated in this listing (only the
// `*word->rebuild_word,` argument remains); see docqual.cpp for the full call.
if (word->bln_boxes == NULL ||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
return;
DocQualCallbacks cb(word);
*word->rebuild_word,
}
void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT page_res_it)

Definition at line 166 of file docqual.cpp.

{
// Two passes over the page.
// Pass 1: un-rejects characters. With bland_unrej set, every reject_map entry
//         for which accept_if_good_quality() holds gets
//         setrej_quality_accept(). Otherwise rows are judged by their
//         rej_count / char_count ratio; rows failing the test are skipped
//         wholesale.
// Pass 2: zeroes the char/rej counters on the page, on each block and on each
//         row as they are first met, then calls rej_stat_word() for each word.
// NOTE(review): the middle branch of pass 1 is truncated in this listing (the
// threshold of the ratio test and the start of the guarded block are
// missing); confirm against docqual.cpp.
WERD_RES *word;
ROW_RES *current_row;
BLOCK_RES *current_block;
int i;
page_res_it.restart_page ();
while (page_res_it.word () != NULL) {
check_debug_pt (page_res_it.word (), 100);
if (bland_unrej) {
word = page_res_it.word ();
for (i = 0; i < word->reject_map.length (); i++) {
if (word->reject_map[i].accept_if_good_quality ())
word->reject_map[i].setrej_quality_accept ();
}
page_res_it.forward ();
}
else if ((page_res_it.row ()->char_count > 0) &&
((page_res_it.row ()->rej_count /
(float) page_res_it.row ()->char_count) <=
word = page_res_it.word ();
unrej_good_chs(word, page_res_it.row ()->row);
}
page_res_it.forward ();
}
else {
/* Skip to end of dodgy row */
current_row = page_res_it.row ();
while ((page_res_it.word () != NULL) &&
(page_res_it.row () == current_row))
page_res_it.forward ();
}
check_debug_pt (page_res_it.word (), 110);
}
// Pass 2: rebuild the rejection statistics from scratch.
page_res_it.restart_page ();
page_res_it.page_res->char_count = 0;
page_res_it.page_res->rej_count = 0;
current_block = NULL;
current_row = NULL;
while (page_res_it.word () != NULL) {
// Reset a block's counters the first time one of its words is seen.
if (current_block != page_res_it.block ()) {
current_block = page_res_it.block ();
current_block->char_count = 0;
current_block->rej_count = 0;
}
// Likewise for each row.
if (current_row != page_res_it.row ()) {
current_row = page_res_it.row ();
current_row->char_count = 0;
current_row->rej_count = 0;
current_row->whole_word_rej_count = 0;
}
page_res_it.rej_stat_word ();
page_res_it.forward ();
}
}
BOOL8 tesseract::Tesseract::word_adaptable ( WERD_RES word,
uinT16  mode 
)

Definition at line 50 of file adaptions.cpp.

{
// Decides whether `word` may be used for adaption. `mode` is a bit set whose
// bit positions are the MODES enumerators declared below.
// Visible rules:
//   - mode == 0 disables adaption (returns FALSE);
//   - status is the OR of tess_would_adapt (if ADAPTABLE_WERD is set) and
//     tess_accepted (if ACCEPTABLE_WERD is set); FALSE status returns early;
//   - each remaining CHECK_* bit can veto the word (returns FALSE);
//   - a word with any fragment_lengths entry > 1 is rejected.
// NOTE(review): this listing has orphan closing braces and a truncated
// CHECK_AMBIG_WERD condition - the guarding `if` lines (presumably on
// tessedit_adaption_debug) were dropped. Confirm against adaptions.cpp.
tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
word->best_choice == NULL ? "" :
word->best_choice->rating(), word->best_choice->certainty());
}
BOOL8 status = FALSE;
BITS16 flags(mode);
// Bit indices within `mode`, in declaration order (0..5).
enum MODES
{
ADAPTABLE_WERD,
ACCEPTABLE_WERD,
CHECK_DAWGS,
CHECK_SPACES,
CHECK_ONE_ELL_CONFLICT,
CHECK_AMBIG_WERD
};
/*
0: NO adaption
*/
if (mode == 0) {
if (tessedit_adaption_debug) tprintf("adaption disabled\n");
return FALSE;
}
if (flags.bit (ADAPTABLE_WERD)) {
status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
if (tessedit_adaption_debug && !status) {
tprintf("tess_would_adapt bit is false\n");
}
}
if (flags.bit (ACCEPTABLE_WERD)) {
status |= word->tess_accepted;
if (tessedit_adaption_debug && !status) {
tprintf("tess_accepted bit is false\n");
}
}
if (!status) { // If not set then
return FALSE; // ignore other checks
}
// Veto: the best choice's permuter must be one of the listed DAWG/number
// permuters.
if (flags.bit (CHECK_DAWGS) &&
(word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
(word->best_choice->permuter () != USER_DAWG_PERM) &&
(word->best_choice->permuter () != NUMBER_PERM)) {
if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
return FALSE;
}
if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
return FALSE;
}
// Veto: the recognised text contains a space character.
if (flags.bit (CHECK_SPACES) &&
(strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
if (tessedit_adaption_debug) tprintf("word contains spaces\n");
return FALSE;
}
// if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
// NOTE(review): second operand of the next condition is missing here.
if (flags.bit (CHECK_AMBIG_WERD) &&
if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
return FALSE;
}
// Do not adapt to words that are composed from fragments if
// tessedit_adapt_to_char_fragments is false.
const char *fragment_lengths = word->best_choice->fragment_lengths();
if (fragment_lengths != NULL && *fragment_lengths != '\0') {
for (int i = 0; i < word->best_choice->length(); ++i) {
if (fragment_lengths[i] > 1) {
if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
return false; // found a character composed from fragments
}
}
}
}
tprintf("returning status %d\n", status);
}
return status;
}
BOOL8 tesseract::Tesseract::word_blank_and_set_display ( BLOCK block,
ROW row,
WERD_RES word_res 
)

Definition at line 711 of file pgedit.cpp.

{
// As listed, simply forwards to word_set_display() with the same arguments
// and returns its result.
// NOTE(review): the name suggests a "blank" step precedes this call; no such
// statement is visible in this listing - confirm against pgedit.cpp.
return word_set_display(block, row, word_res);
}
BOOL8 tesseract::Tesseract::word_bln_display ( BLOCK block,
ROW row,
WERD_RES word_res 
)

word_bln_display()

Normalize word and display in word window

Definition at line 724 of file pgedit.cpp.

{
// Uses the word's chopped_word, first calling SetupForTessRecognition() on
// word_res when chopped_word is still NULL. Always returns TRUE.
// NOTE(review): the statement that consumes bln_word is truncated in this
// listing (only its trailing numeric arguments remain); see pgedit.cpp.
TWERD *bln_word = word_res->chopped_word;
if (bln_word == NULL) {
word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
row, block);
bln_word = word_res->chopped_word;
}
1.0, 0.0f, -1000.0f, 1000.0f);
return TRUE;
}
inT16 tesseract::Tesseract::word_blob_quality ( WERD_RES word,
ROW row 
)

Definition at line 68 of file docqual.cpp.

{
// Returns 0 when the word has no bln_boxes, no rebuild_word or no blobs;
// otherwise returns the match_count held by a DocQualCallbacks helper.
// NOTE(review): the call that fills `cb` is truncated in this listing (only
// the `*word->rebuild_word,` argument remains); see docqual.cpp.
if (word->bln_boxes == NULL ||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
return 0;
DocQualCallbacks cb(word);
*word->rebuild_word,
return cb.match_count;
}
void tesseract::Tesseract::word_char_quality ( WERD_RES word,
ROW row,
inT16 match_count,
inT16 accepted_match_count 
)

Definition at line 100 of file docqual.cpp.

{
// Writes cb.match_count and cb.accepted_match_count through the two output
// pointers. Returns without touching them when the word has no bln_boxes, no
// rebuild_word or no blobs.
// NOTE(review): the call that fills `cb` is truncated in this listing (only
// the `*word->rebuild_word,` argument remains); see docqual.cpp.
if (word->bln_boxes == NULL ||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
return;
DocQualCallbacks cb(word);
*word->rebuild_word,
*match_count = cb.match_count;
*accepted_match_count = cb.accepted_match_count;
}
BOOL8 tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 673 of file reject.cpp.

{
  // Steps through `word` one unichar at a time: word_lengths[n] is the byte
  // length of the n-th unichar, so byte_pos always points at the start of
  // the current unichar. Returns TRUE as soon as a unichar is a digit
  // (per unicharset.get_isdigit) other than the single byte '1'; returns
  // FALSE when the terminating NUL is reached first.
  inT16 char_index = 0;
  inT16 byte_pos = 0;
  while (word[byte_pos] != '\0') {
    const char *current = word + byte_pos;
    if (unicharset.get_isdigit (current, word_lengths[char_index]) &&
        !(word_lengths[char_index] == 1 && *current == '1')) {
      return TRUE;
    }
    byte_pos += word_lengths[char_index++];
  }
  return FALSE;
}
CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES word,
inT16 delete_mode 
)

Definition at line 904 of file docqual.cpp.

{
// Returns the crunch mode a word should be deleted with (CR_NONE when it must
// be kept) and stores a numeric reason code in delete_mode.
// Reason codes visible here: 0 = not deletable, 1 = zero-length word,
// 2..11 = individual tests (see below).
// NOTE(review): this listing is heavily truncated - most `if` conditions and
// `return` statements for codes 2..11 are missing and the braces do not
// balance. tilde_delete() prints the value passed as delete_mode after the
// call, so the real parameter is presumably a reference. Confirm everything
// against docqual.cpp.
int word_len = word->reject_map.length ();
float rating_per_ch;
TBOX box; //BB of word
// A word that is not already marked for crunching is never deleted.
if (word->unlv_crunch_mode == CR_NONE) {
delete_mode = 0;
return CR_NONE;
}
if (word_len == 0) {
delete_mode = 1;
return CR_DELETE;
}
if (word->rebuild_word != NULL) {
// Cube leaves rebuild_word NULL.
box = word->rebuild_word->bounding_box();
delete_mode = 4;
return CR_DELETE;
}
delete_mode = 5;
return CR_DELETE;
}
}
if ((failure_count (word) * 1.5) > word_len) {
delete_mode = 2;
}
delete_mode = 7;
}
rating_per_ch = word->best_choice->rating () / word_len;
if (rating_per_ch > crunch_del_rating) {
delete_mode = 8;
}
delete_mode = 9;
}
if (box.bottom () >
delete_mode = 10;
}
delete_mode = 11;
}
delete_mode = 3;
}
delete_mode = 0;
return CR_NONE;
}
BOOL8 tesseract::Tesseract::word_display ( BLOCK block,
ROW row,
WERD_RES word_res 
)

word_display() Word Processor

Display a word according to its display modes

Definition at line 747 of file pgedit.cpp.

{
// Draws one word in image_win according to the current display settings.
// Two paths are visible:
//   1. color_mode != CM_RAINBOW and the word has a box_word: each blob box is
//      drawn as a rectangle, using ScrollView::RED when the attribute selected
//      by color_mode applies. Returns false early if the word has no fontinfo,
//      otherwise returns true after drawing.
//   2. Otherwise the word's display flags (DF_BOX, DF_EDGE_STEP, DF_POLYGONAL,
//      DF_TEXT, DF_BLAMER) select what is drawn; the function returns TRUE.
// NOTE(review): this listing is truncated - several `case` labels of the
// switch, the declaration/default of `color`, and the calls that plot the
// word bounding box are missing. Confirm against pgedit.cpp.
WERD* word = word_res->word;
TBOX word_bb; // word bounding box
int word_height; // ht of word BB
BOOL8 displayed_something = FALSE;
float shift; // from bot left
C_BLOB_IT c_it; // cblob iterator
if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
BoxWord* box_word = word_res->box_word;
int length = box_word->length();
if (word_res->fontinfo == NULL) return false;
const FontInfo& font_info = *word_res->fontinfo;
for (int i = 0; i < length; ++i) {
// Pick the pen colour for blob i from the attribute named by color_mode.
// NOTE(review): only the CM_ITALIC, CM_BOLD and CM_SERIF labels survive in
// this listing; the other branches have lost their `case` lines.
switch (color_mode) {
if (box_word->BlobPosition(i) == SP_SUBSCRIPT)
color = ScrollView::RED;
break;
if (box_word->BlobPosition(i) == SP_SUPERSCRIPT)
color = ScrollView::RED;
break;
case CM_ITALIC:
if (font_info.is_italic())
color = ScrollView::RED;
break;
case CM_BOLD:
if (font_info.is_bold())
color = ScrollView::RED;
break;
if (font_info.is_fixed_pitch())
color = ScrollView::RED;
break;
case CM_SERIF:
if (font_info.is_serif())
color = ScrollView::RED;
break;
if (word_res->small_caps)
color = ScrollView::RED;
break;
if (box_word->BlobPosition(i) == SP_DROPCAP)
color = ScrollView::RED;
break;
// TODO(rays) underline is currently completely unsupported.
default:
break;
}
image_win->Pen(color);
TBOX box = box_word->BlobBox(i);
image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
}
return true;
}
/*
Note the double coercions of(COLOUR)((inT32)editor_image_word_bb_color)
etc. are to keep the compiler happy.
*/
// display bounding box
if (word->display_flag(DF_BOX)) {
// NOTE(review): the start of the word-box plot call is missing here.
editor_image_word_bb_color));
// Plot the bounding box of every C_BLOB in the word.
c_it.set_to_list(word->cblob_list());
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
c_it.data()->bounding_box().plot(image_win);
displayed_something = TRUE;
}
// display edge steps
if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
word->plot(image_win); // rainbow colors
displayed_something = TRUE;
}
// display poly approx
if (word->display_flag(DF_POLYGONAL)) {
// need to convert
// The polygonal copy is a temporary: plotted, then deleted here.
TWERD* tword = TWERD::PolygonalCopy(word);
tword->plot(image_win);
delete tword;
displayed_something = TRUE;
}
// Display correct text and blamer information.
STRING text;
STRING blame;
if (word->display_flag(DF_TEXT) && word->text() != NULL) {
text = word->text();
}
// With DF_BLAMER, `text` is rebuilt as "<truth text> -> <best choice>", using
// "NULL" for whichever side is absent.
// NOTE(review): the tail of this condition and the expressions that fill
// `blame` between " [" and "]" are missing from the listing.
if (word->display_flag(DF_BLAMER) &&
!(word_res->blamer_bundle != NULL &&
text = "";
const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
if (blamer_bundle == NULL) {
text += "NULL";
} else {
for (int i = 0; i < blamer_bundle->truth_text.length(); ++i) {
text += blamer_bundle->truth_text[i];
}
}
text += " -> ";
STRING best_choice_str;
if (word_res->best_choice == NULL) {
best_choice_str = "NULL";
} else {
word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
}
text += best_choice_str;
IncorrectResultReason reason = (blamer_bundle == NULL) ?
blame += " [";
blame += "]";
}
if (text.length() > 0) {
word_bb = word->bounding_box();
word_height = word_bb.height();
// Text size is half the word height, capped at 20.
int text_height = 0.50 * word_height;
if (text_height > 20) text_height = 20;
image_win->TextAttributes("Arial", text_height, false, false, false);
// Indent by a quarter of the height only when the box is wider than tall.
shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
image_win->Text(word_bb.left() + shift,
word_bb.bottom() + 0.25 * word_height, text.string());
// The blame string, when present, goes one text line below.
if (blame.length() > 0) {
image_win->Text(word_bb.left() + shift,
word_bb.bottom() + 0.25 * word_height - text_height,
blame.string());
}
displayed_something = TRUE;
}
if (!displayed_something) // display BBox anyway
// NOTE(review): the start of the fallback plot call is missing here.
(ScrollView::Color)((inT32) editor_image_word_bb_color),
editor_image_word_bb_color));
return TRUE;
}
BOOL8 tesseract::Tesseract::word_dumper ( BLOCK block,
ROW row,
WERD_RES word_res 
)

word_dumper()

Dump members to the debug window

Definition at line 908 of file pgedit.cpp.

{
// Prints the block (when non-NULL), the row and the word via their print()
// methods, each preceded by a heading, then optionally the blamer debug
// string. Always returns TRUE.
// NOTE(review): `row` and `word_res` are dereferenced without a NULL check,
// unlike `block`. The last operand of the blamer condition is missing from
// this listing; confirm against pgedit.cpp.
if (block != NULL) {
tprintf("\nBlock data...\n");
block->print(NULL, FALSE);
}
tprintf("\nRow data...\n");
row->print(NULL);
tprintf("\nWord data...\n");
word_res->word->print();
if (word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
tprintf("Current blamer debug: %s\n",
word_res->blamer_bundle->debug.string());
}
return TRUE;
}
inT16 tesseract::Tesseract::word_outline_errs ( WERD_RES word)

Definition at line 80 of file docqual.cpp.

{
  // Sums count_outline_errs() over the blobs of word->rebuild_word. The
  // n-th blob is paired with the n-th char of best_choice's unichar_string().
  // A word without a rebuild_word contributes no errors.
  inT16 total_errs = 0;
  if (word->rebuild_word == NULL)
    return total_errs;

  inT16 char_pos = 0;
  for (TBLOB* cur_blob = word->rebuild_word->blobs; cur_blob != NULL;
       cur_blob = cur_blob->next, char_pos++) {
    total_errs += count_outline_errs(
        word->best_choice->unichar_string()[char_pos],
        cur_blob->NumOutlines());
  }
  return total_errs;
}
BOOL8 tesseract::Tesseract::word_set_display ( BLOCK block,
ROW row,
WERD_RES word_res 
)
inT16 tesseract::Tesseract::worst_noise_blob ( WERD_RES word_res,
float *  worst_noise_score 
)

Definition at line 764 of file fixspace.cpp.

{
// Looks for the blob in the word with the lowest noise score, restricted to
// blobs that have at least fixsp_non_noise_limit non-noise blobs on each side.
// The lowest score found is written to *worst_noise_score.
// Returns -1 when: rebuild_word is NULL, the word has fewer than 5 blobs,
// either end lacks enough non-noise blobs, or the candidate range is empty.
// NOTE(review): the opening tprintf is truncated and no `return` is visible
// after the final loop in this listing (presumably worst_noise_blob is
// returned); confirm against fixspace.cpp.
// Scores are kept in a fixed 512-entry array; ASSERT_HOST below enforces it.
float noise_score[512];
int i;
int min_noise_blob; // 1st contender
int max_noise_blob; // last contender
int non_noise_count;
int worst_noise_blob; // Worst blob
float small_limit = kBlnXHeight * fixsp_small_outlines_size;
float non_noise_limit = kBlnXHeight * 0.8;
if (word_res->rebuild_word == NULL)
return -1; // Can't handle cube words.
TBLOB* blob = word_res->rebuild_word->blobs;
// Normalised.
int blob_count = word_res->box_word->length();
ASSERT_HOST(blob_count <= 512);
if (blob_count < 5)
return -1; // too short to split
/* Get the noise scores for all blobs */
#ifndef SECURE_NAMES
tprintf("FP fixspace Noise metrics for \"%s\": ",
#endif
// Accepted characters are given the non-noise score outright; the rest are
// scored by blob_noise_score().
for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) {
if (word_res->reject_map[i].accepted())
noise_score[i] = non_noise_limit;
else
noise_score[i] = blob_noise_score(blob);
tprintf("%1.1f ", noise_score[i]);
}
tprintf("\n");
/* Now find the worst one which is far enough away from the end of the word */
// Scan from the left until enough non-noise blobs have been passed; the
// index reached becomes the first candidate.
non_noise_count = 0;
for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
if (noise_score[i] >= non_noise_limit) {
non_noise_count++;
}
}
if (non_noise_count < fixsp_non_noise_limit)
return -1;
min_noise_blob = i;
// Same from the right; the index reached becomes the last candidate.
non_noise_count = 0;
for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
i--) {
if (noise_score[i] >= non_noise_limit) {
non_noise_count++;
}
}
if (non_noise_count < fixsp_non_noise_limit)
return -1;
max_noise_blob = i;
if (min_noise_blob > max_noise_blob)
return -1;
// Only scores strictly below small_limit can be selected.
*worst_noise_score = small_limit;
worst_noise_blob = -1;
for (i = min_noise_blob; i <= max_noise_blob; i++) {
if (noise_score[i] < *worst_noise_score) {
worst_noise_blob = i;
*worst_noise_score = noise_score[i];
}
}
}
void tesseract::Tesseract::write_results ( PAGE_RES_IT page_res_it,
char  newline_type,
BOOL8  force_eol 
)

Definition at line 138 of file output.cpp.

{ // override tilde crunch?
// Prepares the output for the current word of page_res_it. Two parts are
// visible:
//   - a "tilde crunch" part that builds short text/map/ep strings in the
//     local 32-byte buffers and stores a new WERD_CHOICE in word->ep_choice;
//   - "normal processing" that updates the stats_ flags and adjusts the
//     word's reject_map.
// NOTE(review): this listing is heavily truncated - several conditions are cut
// mid-expression, braces do not balance, and the branch structure cannot be
// recovered from the text shown. The comments below describe individual
// statements only; confirm the control flow against output.cpp.
WERD_RES *word = page_res_it.word();
const UNICHARSET &uchset = *word->uch_set;
STRING repetition_code;
const STRING *wordstr;
STRING wordstr_lengths;
int i;
// First character of the unrecognised_char setting.
char unrecognised = STRING (unrecognised_char)[0];
char ep_chars[32]; //Only for unlv_tilde_crunch
int ep_chars_index = 0;
char txt_chs[32]; //Only for unlv_tilde_crunch
char map_chs[32]; //Only for unlv_tilde_crunch
int txt_index = 0;
BOOL8 need_reject = FALSE;
UNICHAR_ID space = uchset.unichar_to_id(" ");
if ((word->unlv_crunch_mode != CR_NONE ||
word->best_choice->length() == 0) &&
if ((word->unlv_crunch_mode != CR_DELETE) &&
(word->word->space () > 0) &&
!word->word->flag (W_FUZZY_NON) &&
!word->word->flag (W_FUZZY_SP)))) {
if (!word->word->flag (W_BOL) &&
(word->word->space () > 0) &&
!word->word->flag (W_FUZZY_NON) &&
!word->word->flag (W_FUZZY_SP)) {
// Write a space to separate from preceding good text.
txt_chs[txt_index] = ' ';
map_chs[txt_index++] = '1';
ep_chars[ep_chars_index++] = ' ';
stats_.last_char_was_tilde = false;
}
need_reject = TRUE;
}
if ((need_reject && !stats_.last_char_was_tilde) ||
(force_eol && stats_.write_results_empty_block)) {
/* Write a reject char - mark as rejected unless zero_rejection mode */
txt_chs[txt_index] = unrecognised;
map_chs[txt_index++] = '1';
ep_chars[ep_chars_index++] = unrecognised;
}
else {
map_chs[txt_index++] = '0';
/*
The ep_choice string is a faked reject to allow newdiff to sync the
.etx with the .txt and .map files.
*/
ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
//dummy reject
ep_chars[ep_chars_index++] = 1;
//dummy reject
ep_chars[ep_chars_index++] = 1;
//type
ep_chars[ep_chars_index++] = 2;
//dummy reject
ep_chars[ep_chars_index++] = 1;
//dummy reject
ep_chars[ep_chars_index++] = 1;
}
stats_.tilde_crunch_written = true;
stats_.last_char_was_newline = false;
stats_.write_results_empty_block = false;
}
if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
/* Add a new line output */
txt_chs[txt_index] = '\n';
map_chs[txt_index++] = '\n';
//end line
ep_chars[ep_chars_index++] = newline_type;
//Cos of the real newline
stats_.tilde_crunch_written = false;
stats_.last_char_was_newline = true;
stats_.last_char_was_tilde = false;
}
// NUL-terminate all three buffers before ep_chars is turned into a
// WERD_CHOICE.
txt_chs[txt_index] = '\0';
map_chs[txt_index] = '\0';
ep_chars[ep_chars_index] = '\0'; // terminate string
word->ep_choice = new WERD_CHOICE(ep_chars, uchset);
if (force_eol)
return;
}
/* NORMAL PROCESSING of non tilde crunched words */
stats_.tilde_crunch_written = false;
if (newline_type)
stats_.last_char_was_newline = true;
else
stats_.last_char_was_newline = false;
stats_.write_results_empty_block = force_eol; // about to write a real word
// NOTE(review): the opening of the next condition is missing here.
(word->word->space() == 0) &&
(word->best_choice->unichar_id(0) == space)) {
/* Prevent adjacent tilde across words - we know that adjacent tildes within
words have been removed */
// Drop the first blob choice (if any) and the first box of the word.
if (word->best_choice->blob_choices() != NULL) {
BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
if (!blob_choices_it.empty()) delete blob_choices_it.extract();
}
word->box_word->DeleteBox(0);
}
// NOTE(review): the remainder of the next condition is missing here.
if (newline_type ||
stats_.last_char_was_tilde = false;
else {
// last_char_was_tilde follows whether the unichar at index
// reject_map.length() - 1 is the space id.
if (word->reject_map.length () > 0) {
if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
stats_.last_char_was_tilde = true;
else
stats_.last_char_was_tilde = false;
}
else if (word->word->space () > 0)
stats_.last_char_was_tilde = false;
/* else it is unchanged as there are no output chars */
}
check_debug_pt (word, 120);
tprintf ("Dict word: \"%s\": %d\n",
dict_word(*(word->best_choice)));
}
// Builds the string "|^~R" followed by the unichar for get_rep_char(word),
// with matching per-unichar lengths, and points wordstr at it.
repetition_code = "|^~R";
wordstr_lengths = "\001\001\001\001";
repetition_code += uchset.id_to_unichar(get_rep_char(word));
wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
wordstr = &repetition_code;
} else {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
for (i = 0; i < word->best_choice->length(); ++i) {
if (word->reject_map[i].rejected())
word->reject_map[i].setrej_minimal_rej_accept();
}
}
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
// Same as the loop above, but entries whose unichar is the space id are left
// rejected.
for (i = 0; i < word->best_choice->length(); ++i) {
if ((word->best_choice->unichar_id(i) != space) &&
word->reject_map[i].rejected())
word->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}

Member Data Documentation

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 693 of file tesseractclass.h.

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image filename. The names of the image files are expected to be in the form [lang].[fontname].exp[num].tif"

Definition at line 698 of file tesseractclass.h.

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."

Definition at line 702 of file tesseractclass.h.

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."

Definition at line 705 of file tesseractclass.h.

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 694 of file tesseractclass.h.

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 692 of file tesseractclass.h.

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 787 of file tesseractclass.h.

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 725 of file tesseractclass.h.

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 726 of file tesseractclass.h.

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 727 of file tesseractclass.h.

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 878 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 814 of file tesseractclass.h.

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 823 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 803 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 808 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 809 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 805 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 804 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 806 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating lt this"

Definition at line 802 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 793 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 792 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 817 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Dont pot crunch sensible strings"

Definition at line 816 of file tesseractclass.h.

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Dont crunch words with long lower case strings"

Definition at line 819 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Dont touch sensible strings"

Definition at line 813 of file tesseractclass.h.

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Dont crunch words with long upper case strings"

Definition at line 821 of file tesseractclass.h.

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 822 of file tesseractclass.h.

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 797 of file tesseractclass.h.

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 798 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_pot_garbage = true

"POTENTIAL crunch garbage"

Definition at line 801 of file tesseractclass.h.

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 812 of file tesseractclass.h.

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 800 of file tesseractclass.h.

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 799 of file tesseractclass.h.

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 811 of file tesseractclass.h.

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 810 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 795 of file tesseractclass.h.

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 794 of file tesseractclass.h.

int tesseract::Tesseract::cube_debug_level = 1

"Print cube debug info."

Definition at line 751 of file tesseractclass.h.

bool tesseract::Tesseract::debug_acceptable_wds = false

"Dump word pass/fail chk"

Definition at line 724 of file tesseractclass.h.

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 829 of file tesseractclass.h.

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 723 of file tesseractclass.h.

bool tesseract::Tesseract::docqual_excuse_outline_errs = false

"Allow outline errs in unrejection?"

Definition at line 755 of file tesseractclass.h.

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 885 of file tesseractclass.h.

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 828 of file tesseractclass.h.

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blbs either side?"

Definition at line 825 of file tesseractclass.h.

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 826 of file tesseractclass.h.

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 884 of file tesseractclass.h.

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 894 of file tesseractclass.h.

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq than this"

Definition at line 879 of file tesseractclass.h.

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 831 of file tesseractclass.h.

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."

Definition at line 684 of file tesseractclass.h.

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 877 of file tesseractclass.h.

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 753 of file tesseractclass.h.

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 752 of file tesseractclass.h.

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."

Definition at line 680 of file tesseractclass.h.

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 750 of file tesseractclass.h.

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 729 of file tesseractclass.h.

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 732 of file tesseractclass.h.

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 733 of file tesseractclass.h.

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 731 of file tesseractclass.h.

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 728 of file tesseractclass.h.

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 789 of file tesseractclass.h.

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Don't double check"

Definition at line 868 of file tesseractclass.h.

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 867 of file tesseractclass.h.

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 873 of file tesseractclass.h.

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 1Il conf. detector"

Definition at line 866 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 871 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 872 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 869 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 870 of file tesseractclass.h.

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 874 of file tesseractclass.h.

bool tesseract::Tesseract::save_blob_choices = false

"Save the results of the recognition step (blob_choices) within the corresponding WERD_CHOICE"

Definition at line 746 of file tesseractclass.h.

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 850 of file tesseractclass.h.

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 848 of file tesseractclass.h.

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 843 of file tesseractclass.h.

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Don't touch bad rating limit"

Definition at line 849 of file tesseractclass.h.

int tesseract::Tesseract::suspect_short_words = 2

"Don't suspect dict wds longer than this"

Definition at line 847 of file tesseractclass.h.

int tesseract::Tesseract::suspect_space_level = 100

"Min suspect level for rejecting spaces"

Definition at line 845 of file tesseractclass.h.

int tesseract::Tesseract::tessdata_manager_debug_level = 0

"Debug level for TessdataManager functions."

Definition at line 888 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_adapt_to_char_fragments = true

"Adapt to words that contain a character composed from fragments"

Definition at line 689 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 691 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 676 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram correction."

Definition at line 722 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 672 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 674 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_consistent_reps = true

"Force all rep chars the same"

Definition at line 857 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 880 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 840 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 718 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 784 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 717 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 786 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 706 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 773 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 775 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 708 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 662 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_bigram_correction = false

"Enable correction based on the word bigram dictionary."

Definition at line 720 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 716 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 710 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 713 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 861 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 781 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 757 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 875 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is not going to be used for OCR but, say, only for layout analysis."

Definition at line 899 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 890 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 863 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 660 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_matcher_log = false

"Log matcher activity"

Definition at line 741 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 739 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 851 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY

"Which OCR engine(s) to run (Tesseract, Cube, both). Defaults to loading and running only Tesseract (no Cube, no combiner). (Values from OcrEngineMode enum in tesseractclass.h)"

Definition at line 670 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_ok_mode = 5

"Acceptance decision algorithm"

Definition at line 859 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 886 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 882 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block, 5=line, 6=word, 7=char (Values from PageSegMode enum in publictypes.h)"

Definition at line 666 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 827 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 769 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 777 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 771 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_redo_xheight = true

"Check/Correct x-height"

Definition at line 714 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 783 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 762 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 760 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 858 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 764 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Adaption debug"

Definition at line 860 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 654 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 656 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 779 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_tess_adapt_to_rejmap = false

"Use reject map to control Tesseract adaption"

Definition at line 735 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 737 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 740 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_test_adaption_mode = 3

"Adaptation decision algorithm for tess"

Definition at line 743 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 658 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_training_tess = false

"Call Tess to learn blobs"

Definition at line 707 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 712 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 865 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 758 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects which prevents whole row rejection"

Definition at line 767 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 854 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 836 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 883 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 686 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 838 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 839 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 856 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 852 of file tesseractclass.h.

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 747 of file tesseractclass.h.

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 748 of file tesseractclass.h.

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 749 of file tesseractclass.h.

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 900 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 895 of file tesseractclass.h.

bool tesseract::Tesseract::textord_use_cjk_fp_model = FALSE

"Use CJK fixed pitch model"

Definition at line 896 of file tesseractclass.h.

bool tesseract::Tesseract::unlv_tilde_crunching = true

"Mark v.bad words for tilde crunch"

Definition at line 791 of file tesseractclass.h.

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 842 of file tesseractclass.h.

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 833 of file tesseractclass.h.

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 834 of file tesseractclass.h.


The documentation for this class was generated from the following files: