Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
pageres.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.h (Formerly page_res.h)
3  * Description: Results classes used by control.c
4  * Author: Phil Cheatle
5  * Created: Tue Sep 22 08:42:49 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #ifndef PAGERES_H
20 #define PAGERES_H
21 
22 #include "blobs.h"
23 #include "boxword.h"
24 #include "elst.h"
25 #include "genericvector.h"
26 #include "normalis.h"
27 #include "ocrblock.h"
28 #include "ocrrow.h"
30 #include "ratngs.h"
31 #include "rejctmap.h"
32 #include "seam.h"
33 #include "werd.h"
34 
35 namespace tesseract {
36 struct FontInfo;
37 class Tesseract;
38 }
40 
41 static const inT16 kBlamerBoxTolerance = 5;
42 
43 // Enum for expressing the source of error.
44 // Note: Please update kIncorrectResultReasonNames when modifying this enum.
46  // The text recorded in best choice == truth text
48  // Either: Top choice is incorrect and is a dictionary word (language model
49  // is unlikely to help correct such errors, so blame the classifier).
50  // Or: the correct unichar was not included in shortlist produced by the
51  // classifier at all.
53  // Chopper has not found one or more splits that correspond to the correct
54  // character bounding boxes recorded in BlamerBundle::truth_word.
56  // Classifier did include correct unichars for each blob in the correct
57  // segmentation, however its rating could have been too bad to allow the
58  // language model to pull out the correct choice. On the other hand the
59  // strength of the language model might have been too weak to favor the
60  // correct answer, thus we call this case a classifier-language model
61  // tradeoff error.
63  // Page layout failed to produce the correct bounding box. Blame page layout
64  // if the truth was not found for the word, which implies that the bounding
65  // box of the word was incorrect (no truth word had a similar bounding box).
67  // SegSearch heuristic prevented one or more blobs from the correct
68  // segmentation state to be classified (e.g. the blob was too wide).
70  // The correct segmentation state was not explored because of poor SegSearch
71  // pain point prioritization. We blame SegSearch pain point prioritization
72  // if the best rating of a choice constructed from correct segmentation is
73  // better than that of the best choice (i.e. if we got to explore the correct
74  // segmentation state, language model would have picked the correct choice).
76  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
77 
78  // and thus use the old language model (permuters).
79  // TODO(antonova): integrate the new language model with chopper
81  // If there is an incorrect adaptive template match with a better score than
82  // a correct one (either pre-trained or adapted), mark this as adaption error.
84  // split_and_recog_word() failed to find a suitable split in truth.
86  // Truth is not available for this word (e.g. when words in corrected content
87  // file are turned into ~~~~ because an appropriate alignment was not found).
89  // The text recorded in best choice != truth text, but none of the above
90  // reasons are set.
92 
94 };
95 
96 // Blamer-related information to determine the source of errors.
97 struct BlamerBundle {
98  static const char *IncorrectReasonName(IncorrectResultReason irr);
102  ~BlamerBundle() { delete[] lattice_data; }
103  void ClearResults() {
105  norm_box_tolerance = 0;
107  debug = "";
113  delete[] lattice_data;
114  lattice_data = NULL;
115  lattice_size = 0;
116  }
117  void CopyTruth(const BlamerBundle &other) {
119  truth_word = other.truth_word;
120  truth_text = other.truth_text;
122  (other.NoTruth() ? other.incorrect_result_reason : IRR_CORRECT);
123  }
124  void CopyResults(const BlamerBundle &other) {
  // Copies result-side state from other into this bundle. The part visible
  // here duplicates the serialized lattice buffer.
  // NOTE(review): source lines 125-133 are absent from this listing, so any
  // other fields assigned by this method are not shown here.
  // NOTE(review): lattice_data is overwritten without a delete[]. That is
  // harmless when called from the copy constructor, but presumably leaks if
  // this bundle already owns a lattice buffer -- confirm against callers.
134  if (other.lattice_data != NULL) {
  // Deep copy: ~BlamerBundle() does delete[] lattice_data, so each bundle
  // needs its own buffer rather than a shared pointer.
135  lattice_data = new char[other.lattice_size];
136  memcpy(lattice_data, other.lattice_data, other.lattice_size);
137  lattice_size = other.lattice_size;
138  } else {
139  lattice_data = NULL;
  // NOTE(review): lattice_size is not assigned on this path, so it keeps
  // whatever value it had before the call (possibly indeterminate when
  // reached from the copy constructor) -- consider resetting it to 0.
140  }
141  }
142  BlamerBundle(const BlamerBundle &other) {
143  this->CopyTruth(other);
144  this->CopyResults(other);
145  }
146  const char *IncorrectReason() const;
147  bool NoTruth() const {
150  }
152  const STRING &msg, const WERD_CHOICE *choice, bool debug) {
153  this->incorrect_result_reason = irr;
154  this->debug = this->IncorrectReason();
155  this->debug += " to blame: ";
156  this->FillDebugString(msg, choice, &(this->debug));
157  if (debug) tprintf("SetBlame(): %s", this->debug.string());
158  }
159  // Appends choice and truth details to the given debug string.
160  void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
161  STRING *debug);
162 
163  // Set to true when bounding boxes for individual unichars are recorded.
165  // The truth_word (in the original image coordinate space) contains ground
166  // truth bounding boxes for this WERD_RES.
168  // Same as above, but in normalized coordinates
169  // (filled in by WERD_RES::SetupForRecognition()).
171  // Tolerance for bounding box comparisons in normalized space.
173  // Contains ground truth unichar for each of the bounding boxes in truth_word.
175  // The reason for incorrect OCR result.
177  // Debug text associated with the blame.
179  // Misadaption debug information (filled in if this word was misadapted to).
181  // Variables used by the segmentation search when looking for the blame.
182  // Set to true while segmentation search is continued after the usual
183  // termination condition in order to look for the blame.
185  // Best rating for correctly segmented path
186  // (set and used by SegSearch when looking for blame).
188  // Vectors populated by SegSearch to indicate column and row indices that
189  // correspond to blobs with correct bounding boxes.
192  // Set to true if best choice is a dictionary word and
193  // classifier's top choice.
195  // Serialized segmentation search lattice.
197  int lattice_size; // size of lattice_data in bytes
198  // Information about hypotheses (paths) explored by the segmentation search.
200 };
201 
202 /* Forward declarations */
203 
204 class BLOCK_RES;
205 
207 class
208 ROW_RES;
209 
210 ELISTIZEH (ROW_RES)
211 class WERD_RES;
212 
213 ELISTIZEH (WERD_RES)
214 
215 /*************************************************************************
216  * PAGE_RES - Page results
217  *************************************************************************/
218 class PAGE_RES { // page result
219  public:
222  BLOCK_RES_LIST block_res_list;
224  // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
225  // the next word. This pointer is not owned by PAGE_RES class.
227  // Sums of blame reasons computed by the blamer.
229  // Debug information about all the misadaptions on this page.
230  // Each BlamerBundle contains an index into this vector, so that words that
231  // caused misadaption could be marked. However, since words could be
232  // deleted/split/merged, the log is stored on the PAGE_RES level.
234 
235  inline void Init() {
236  char_count = 0;
237  rej_count = 0;
238  rejected = FALSE;
239  prev_word_best_choice = NULL;
240  blame_reasons.init_to_size(IRR_NUM_REASONS, 0);
241  }
242 
243  PAGE_RES() { Init(); } // empty constructor
244 
245  PAGE_RES(BLOCK_LIST *block_list, // real blocks
246  WERD_CHOICE **prev_word_best_choice_ptr);
247 
248  ~PAGE_RES () { // destructor
249  }
250 };
251 
252 /*************************************************************************
253  * BLOCK_RES - Block results
254  *************************************************************************/
255 
256 class BLOCK_RES:public ELIST_LINK {
257  public:
258  BLOCK * block; // real block
259  inT32 char_count; // chars in block
260  inT32 rej_count; // rejected chars
263  float x_height;
264  BOOL8 font_assigned; // block already
265  // processed
266  BOOL8 bold; // all bold
267  BOOL8 italic; // all italic
268 
269  ROW_RES_LIST row_res_list;
270 
272  } // empty constructor
273 
274  BLOCK_RES(BLOCK *the_block); // real block
275 
276  ~BLOCK_RES () { // destructor
277  }
278 };
279 
280 /*************************************************************************
281  * ROW_RES - Row results
282  *************************************************************************/
283 
284 class ROW_RES:public ELIST_LINK {
285  public:
286  ROW * row; // real row
287  inT32 char_count; // chars in block
288  inT32 rej_count; // rejected chars
289  inT32 whole_word_rej_count; // rejs in total rej wds
290  WERD_RES_LIST word_res_list;
291 
293  } // empty constructor
294 
295  ROW_RES(ROW *the_row); // real row
296 
297  ~ROW_RES() { // destructor
298  }
299 };
300 
301 /*************************************************************************
302  * WERD_RES - Word results
303  *************************************************************************/
305 {
310 };
311 
312 // WERD_RES is a collection of publicly accessible members that gathers
313 // information about a word result.
314 class WERD_RES : public ELIST_LINK {
315  public:
316  // Which word is which?
317  // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
318  // the original image coordinate space, and the BLN space in which the
319  // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
320  // and the x-middle of the word is at 0.
321  // In the rotated pixel space, coordinates correspond to the input image,
322  // but may be rotated about the origin by a multiple of 90 degrees,
323  // and may therefore be negative.
324  // In any case a rotation by denorm.block()->re_rotation() will take them
325  // back to the original image.
326  // The other differences between words all represent different stages of
327  // processing during recognition.
328 
329  // ---------------------------INPUT-------------------------------------
330 
331  // The word is the input C_BLOBs in the rotated pixel space.
332  // word is NOT owned by the WERD_RES unless combination is true.
333  // All the other word pointers ARE owned by the WERD_RES.
334  WERD* word; // Input C_BLOB word.
335 
336  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
337 
338  // The bln_boxes contains the bounding boxes (only) of the input word, in the
339  // BLN space. The lengths of word and bln_boxes
340  // match as they are both before any chopping.
341  // TODO(rays) determine if docqual does anything useful and delete bln_boxes
342  // if it doesn't.
343  tesseract::BoxWord* bln_boxes; // BLN input bounding boxes.
344  // The denorm provides the transformation to get back to the rotated image
345  // coords from the chopped_word/rebuild_word BLN coords.
346  DENORM denorm; // For use on chopped_word.
347  // Unicharset used by the classifier output in best_choice and raw_choice.
348  const UNICHARSET* uch_set; // For converting back to utf8.
349 
350  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
351  // ----Setup to a (different!) state expected by the various classifiers----
352  // TODO(rays) Tidy and make more consistent.
353 
354  // The chopped_word is also in BLN space, and represents the fully chopped
355  // character fragments that make up the word.
356  // The length of chopped_word matches length of seam_array + 1 (if set).
357  TWERD* chopped_word; // BLN chopped fragments output.
358  SEAMS seam_array; // Seams matching chopped_word.
359  WERD_CHOICE *best_choice; // tess output
360  WERD_CHOICE *raw_choice; // top choice permuter
361  // Alternative paths found during chopping/segmentation search stages
362  // (the first entry being a slim copy of best_choice).
365 
366  // Truth bounding boxes, text and incorrect choice reason.
368 
369  // --------------OUTPUT FROM RECOGNITION-------------------------------
370  // --------------Not all fields are necessarily set.-------------------
371  // ---best_choice, raw_choice *must* end up set, with a box_word-------
372  // ---In complete output, the number of blobs in rebuild_word matches---
373  // ---the number of boxes in box_word, the number of unichar_ids in---
374  // ---best_choice, the number of ints in best_state, and the number---
375  // ---of strings in correct_text--------------------------------------
376  // ---SetupFake Sets everything to appropriate values if the word is---
377  // ---known to be bad before recognition.------------------------------
378 
379  // The rebuild_word is also in BLN space, but represents the final best
380  // segmentation of the word. Its length is therefore the same as box_word.
381  TWERD* rebuild_word; // BLN best segmented word.
382  // The box_word is in the original image coordinate space. It is the
383  // bounding boxes of the rebuild_word, after denormalization.
384  // The length of box_word matches rebuild_word, best_state (if set) and
385  // correct_text (if set), as well as best_choice and represents the
386  // number of classified units in the output.
387  tesseract::BoxWord* box_word; // Denormalized output boxes.
388  // The best_state stores the relationship between chopped_word and
389  // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
390  // adjacent blobs in chopped_word. The seams in seam_array are hidden
391  // within a rebuild_word blob and revealed between them.
392  GenericVector<int> best_state; // Number of blobs in each best blob.
393  // The correct_text is used during training and adaption to carry the
394  // text to the training system without the need for a unicharset. There
395  // is one entry in the vector for each blob in rebuild_word and box_word.
397  // The Tesseract that was used to recognize this word. Just a borrowed
398  // pointer. Note: Tesseract's class definition is in a higher-level library.
399  // We avoid introducing a cyclic dependency by not using the Tesseract
400  // within WERD_RES. We are just storing it to provide access to it
401  // for the top-level multi-language controller, and maybe for output of
402  // the recognized language.
404 
405  // Less-well documented members.
406  // TODO(rays) Add more documentation here.
407  WERD_CHOICE *ep_choice; // ep text TODO(rays) delete this.
408  REJMAP reject_map; // best_choice rejects
410  /*
411  If tess_failed is TRUE, one of the following tests failed when Tess
412  returned:
413  - The outword blob list was not the same length as the best_choice string;
414  - The best_choice string contained ALL blanks;
415  - The best_choice string was zero length
416  */
417  BOOL8 tess_accepted; // Tess thinks its ok?
418  BOOL8 tess_would_adapt; // Tess would adapt?
419  BOOL8 done; // ready for output?
420  bool small_caps; // word appears to be small caps
423  // The fontinfos are pointers to data owned by the classifier.
426  inT8 fontinfo_id_count; // number of votes
427  inT8 fontinfo_id2_count; // number of votes
431  float x_height; // post match estimate
432  float caps_height; // post match estimate
433 
434  /*
435  To deal with fuzzy spaces we need to be able to combine "words" to form
436  combinations when we suspect that the gap is a non-space. The (new) text
437  ord code generates separate words for EVERY fuzzy gap - flags in the word
438  indicate whether the gap is below the threshold (fuzzy kern) and is thus
439  NOT a real word break by default, or above the threshold (fuzzy space) and
440  this is a real word break by default.
441 
442  The WERD_RES list contains all these words PLUS "combination" words built
443  out of (copies of) the words split by fuzzy kerns. The separate parts have
444  their "part_of_combo" flag set true and should be IGNORED on a default
445  reading of the list.
446 
447  Combination words are FOLLOWED by the sequence of part_of_combo words
448  which they combine.
449  */
450  BOOL8 combination; //of two fuzzy gap wds
451  BOOL8 part_of_combo; //part of a combo
452  BOOL8 reject_spaces; //Reject spacing?
453  // FontInfo ids for each unichar in best_choice.
455 
457  InitNonPointers();
458  InitPointers();
459  }
460  WERD_RES(WERD *the_word) {
461  InitNonPointers();
462  InitPointers();
463  word = the_word;
464  }
465  WERD_RES(const WERD_RES &source) {
  // Copy construction is implemented in terms of assignment: the pointer
  // members are initialized first, then operator= copies from source.
  // Unlike the other constructors, InitNonPointers() is not called here.
  // NOTE(review): presumably operator= inspects or releases the existing
  // pointers and therefore needs them in a defined state, and also assigns
  // every non-pointer field -- confirm against the operator= definition.
466  InitPointers();
467  *this = source; // see operator=
468  }
469 
470  ~WERD_RES();
471 
472  // Returns the UTF-8 string for the given blob index in the best_choice word,
473  // given that we know whether we are in a right-to-left reading context.
474  // This matters for mirrorable characters such as parentheses. We recognize
475  // characters purely based on their shape on the page, and by default produce
476  // the corresponding unicode for a left-to-right context.
477  const char* const BestUTF8(int blob_index, bool in_rtl_context) const {
478  if (blob_index < 0 || blob_index >= best_choice->length())
479  return NULL;
480  UNICHAR_ID id = best_choice->unichar_id(blob_index);
481  if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
482  return NULL;
483  UNICHAR_ID mirrored = uch_set->get_mirror(id);
484  if (in_rtl_context && mirrored > 0 && mirrored != INVALID_UNICHAR_ID)
485  id = mirrored;
486  return uch_set->id_to_unichar_ext(id);
487  }
488  // Returns the UTF-8 string for the given blob index in the raw_choice word.
489  const char* const RawUTF8(int blob_index) const {
490  if (blob_index < 0 || blob_index >= raw_choice->length())
491  return NULL;
492  UNICHAR_ID id = raw_choice->unichar_id(blob_index);
493  if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
494  return NULL;
495  return uch_set->id_to_unichar(id);
496  }
497 
498  UNICHARSET::Direction SymbolDirection(int blob_index) const {
499  if (best_choice == NULL ||
500  blob_index >= best_choice->length() ||
501  blob_index < 0)
503  return uch_set->get_direction(best_choice->unichar_id(blob_index));
504  }
505 
506  bool AnyRtlCharsInWord() const {
507  if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
508  return false;
509  for (int id = 0; id < best_choice->length(); id++) {
510  int unichar_id = best_choice->unichar_id(id);
511  if (unichar_id < 0 || unichar_id >= uch_set->size())
512  continue; // Ignore illegal chars.
514  uch_set->get_direction(unichar_id);
515  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
518  return true;
519  }
520  return false;
521  }
522 
523  bool AnyLtrCharsInWord() const {
524  if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
525  return false;
526  for (int id = 0; id < best_choice->length(); id++) {
527  int unichar_id = best_choice->unichar_id(id);
528  if (unichar_id < 0 || unichar_id >= uch_set->size())
529  continue; // Ignore illegal chars.
530  UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
531  if (dir == UNICHARSET::U_LEFT_TO_RIGHT)
532  return true;
533  }
534  return false;
535  }
536 
537  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
538  // that gave us the unichars in reading order (as opposed to strict left
539  // to right).
540  bool UnicharsInReadingOrder() const {
542  }
543 
544  void InitNonPointers();
545  void InitPointers();
546  void Clear();
547  void ClearResults();
548 
549  WERD_RES& operator=(const WERD_RES& source); //from this
550 
551  void CopySimpleFields(const WERD_RES& source);
552 
553  // Initializes a blank (default constructed) WERD_RES from one that has
554  // already been recognized.
555  // Use SetupFor*Recognition afterwards to complete the setup and make
556  // it ready for a retry recognition.
557  void InitForRetryRecognition(const WERD_RES& source);
558 
559  // Sets up the members used in recognition: bln_boxes, chopped_word,
560  // seam_array, denorm, best_choice, raw_choice. Returns false if
561  // the word is empty and sets up fake results. If use_body_size is
562  // true and row->body_size is set, then body_size will be used for
563  // blob normalization instead of xheight + ascrise. This flag is for
564  // those languages that are using CJK pitch model and thus it has to
565  // be true if and only if tesseract->textord_use_cjk_fp_model is
566  // true.
567  bool SetupForTessRecognition(const UNICHARSET& unicharset_in,
568  tesseract::Tesseract* tesseract, Pix* pix,
569  bool numeric_mode, bool use_body_size,
570  ROW *row, BLOCK* block);
571 
572  // Sets up the members used in recognition:
573  // bln_boxes, chopped_word, seam_array, denorm.
574  // Returns false if the word is empty and sets up fake results.
575  bool SetupForCubeRecognition(const UNICHARSET& unicharset_in,
577  const BLOCK* block);
578 
579  // Sets up the members used in recognition for an empty recognition result:
580  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
581  void SetupFake(const UNICHARSET& uch);
582 
583  // Set the word as having the script of the input unicharset.
584  void SetupWordScript(const UNICHARSET& unicharset_in);
585 
586  // Sets up the blamer_bundle if it is not null, using the initialized denorm.
587  void SetupBlamerBundle();
588 
589  // Moves the results fields from word to this. This takes ownership of all
590  // the data, so src can be destructed.
591  // word1.ConsumeWordResult(word);
592  // delete word;
593  // is simpler and faster than:
594  // word1 = *word;
595  // delete word;
596  // as it doesn't need to copy and reallocate anything.
598 
599  // Replace the best choice and rebuild box word.
600  void ReplaceBestChoice(const WERD_CHOICE& choice,
601  const GenericVector<int> &segmentation_state);
602 
603  // Builds the rebuild_word from the chopped_word and the best_state.
604  void RebuildBestState();
605 
606  // Copies the chopped_word to the rebuild_word, faking a best_state as well.
607  // Also sets up the output box_word.
608  void CloneChoppedToRebuild();
609 
610  // Sets/replaces the box_word with one made from the rebuild_word.
611  void SetupBoxWord();
612 
613  // Sets up the script positions in the output boxword using the best_choice
614  // to get the unichars, and the unicharset to get the target positions.
615  void SetScriptPositions();
616 
617  // Returns the indices [start, end) containing the core of the word, stripped
618  // of any superscript digits on either side.
619  // (i.e., the non-footnote part of the word).
620  // Assumes that BoxWord is all set up for best_choice.
621  void WithoutFootnoteSpan(int *start, int *end) const;
622 
623  // Given an alternate word choice and segmentation state, yield the indices
624  // [start, end) containing the core of the word, stripped of any superscript
625  // digits on either side. (i.e. stripping off the footnote parts).
626  void WithoutFootnoteSpan(
627  const WERD_CHOICE &choice, const GenericVector<int> &state,
628  int *start, int *end) const;
629 
630  // Classifies the word with some already-calculated BLOB_CHOICEs.
631  // The choices are an array of blob_count pointers to BLOB_CHOICE,
632  // providing a single classifier result for each blob.
633  // The BLOB_CHOICEs are consumed and the word takes ownership.
634  // The number of blobs in the outword must match blob_count.
635  void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
636 
637  // Copies the best_choice strings to the correct_text for adaption/training.
639 
640  // Merges 2 adjacent blobs in the result if the permanent callback
641  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
642  // callback box_cb is NULL or returns true, setting the merged blob
643  // result to the class returned from class_cb.
644  // Returns true if anything was merged.
648  BLOB_CHOICE_LIST_CLIST *blob_choices);
649 
650  // Callback helper for fix_quotes returns a double quote if both
651  // arguments are quote, otherwise INVALID_UNICHAR_ID.
653  void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices);
654 
655  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
656  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
658  // Callback helper for fix_hyphens returns true if box1 and box2 overlap
659  // (assuming both on the same textline, are in order and a chopped em dash.)
660  bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
661  void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices);
662 
663  // Callback helper for merge_tess_fails returns a space if both
664  // arguments are space, otherwise INVALID_UNICHAR_ID.
666  void merge_tess_fails();
667 
668  static WERD_RES* deep_copy(const WERD_RES* src) {
  // Returns a newly allocated WERD_RES built from *src with the copy
  // constructor. src is dereferenced unconditionally, so it must not be
  // NULL. The result comes from new, so the caller is responsible for
  // deleting it.
669  return new WERD_RES(*src);
670  }
671 
672  // Copy blobs from word_res onto this word (eliminating spaces between).
673  // Since this may be called bidirectionally OR both the BOL and EOL flags.
674  void copy_on(WERD_RES *word_res) { //from this word
675  word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
676  word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
677  word->copy_on(word_res->word);
678  }
679 
680  // Returns true if the collection of count pieces, starting at start, are all
681  // natural connected components, ie there are no real chops involved.
682  bool PiecesAllNatural(int start, int count) const;
683 };
684 
685 /*************************************************************************
686  * PAGE_RES_IT - Page results iterator
687  *************************************************************************/
688 
689 class PAGE_RES_IT {
690  public:
691  PAGE_RES * page_res; // page being iterated
692 
694  } // empty constructor
695 
696  PAGE_RES_IT(PAGE_RES *the_page_res) { // page result
697  page_res = the_page_res;
698  restart_page(); // ready to scan
699  }
700 
701  // Do two PAGE_RES_ITs point at the same word?
702  // This is much cheaper than cmp().
703  bool operator ==(const PAGE_RES_IT &other) const;
704 
705  bool operator !=(const PAGE_RES_IT &other) const {return !(*this == other); }
706 
707  // Given another PAGE_RES_IT to the same page,
708  // this before other: -1
709  // this equal to other: 0
710  // this later than other: 1
711  int cmp(const PAGE_RES_IT &other) const;
712 
714  return start_page(false); // Skip empty blocks.
715  }
717  return start_page(true); // Allow empty blocks.
718  }
719  WERD_RES *start_page(bool empty_ok);
720 
722 
723  // ============ Methods that mutate the underlying structures ===========
724  // Note that these methods will potentially invalidate other PAGE_RES_ITs
725  // and are intended to be used only while a single PAGE_RES_IT is active.
726  // This problem needs to be taken into account if these mutation operators
727  // are ever provided to PageIterator or its subclasses.
728 
729  // Inserts the new_word and a corresponding WERD_RES before the current
730  // position. The simple fields of the WERD_RES are copied from clone_res and
731  // the resulting WERD_RES is returned for further setup with best_choice etc.
732  WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
733 
734  // Deletes the current WERD_RES and its underlying WERD.
735  void DeleteCurrentWord();
736 
737  WERD_RES *forward() { // Get next word.
738  return internal_forward(false, false);
739  }
740  // Move forward, but allow empty blocks to show as single NULL words.
742  return internal_forward(false, true);
743  }
744 
745  WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
746  WERD_RES *forward_block(); // get first word in next non-empty block
747 
748  WERD_RES *prev_word() const { // previous word
749  return prev_word_res;
750  }
751  ROW_RES *prev_row() const { // row of prev word
752  return prev_row_res;
753  }
754  BLOCK_RES *prev_block() const { // block of prev word
755  return prev_block_res;
756  }
757  WERD_RES *word() const { // current word
758  return word_res;
759  }
760  ROW_RES *row() const { // row of current word
761  return row_res;
762  }
763  BLOCK_RES *block() const { // block of cur. word
764  return block_res;
765  }
766  WERD_RES *next_word() const { // next word
767  return next_word_res;
768  }
769  ROW_RES *next_row() const { // row of next word
770  return next_row_res;
771  }
772  BLOCK_RES *next_block() const { // block of next word
773  return next_block_res;
774  }
775  void rej_stat_word(); // for page/block/row
776 
777  private:
778  void ResetWordIterator();
779  WERD_RES *internal_forward(bool new_block, bool empty_ok);
780 
781  WERD_RES * prev_word_res; // previous word
782  ROW_RES *prev_row_res; // row of prev word
783  BLOCK_RES *prev_block_res; // block of prev word
784 
785  WERD_RES *word_res; // current word
786  ROW_RES *row_res; // row of current word
787  BLOCK_RES *block_res; // block of cur. word
788 
789  WERD_RES *next_word_res; // next word
790  ROW_RES *next_row_res; // row of next word
791  BLOCK_RES *next_block_res; // block of next word
792 
793  BLOCK_RES_IT block_res_it; // iterators
794  ROW_RES_IT row_res_it;
795  WERD_RES_IT word_res_it;
796 };
797 #endif