39 #include "config_auto.h"
42 #include "allheaders.h"
49 #define MAX_NEAREST_DIST 600 //for block skew stats
// NOTE(review): this listing omits lines (the enclosing function header and
// the declarations of `height` and `prev_pixel` are not shown), so the
// comments below describe only the statements that are visible.
//
// Clip the blob's bounding box out of pix. The box top is subtracted from
// the image height when the Leptonica Box is built.
58 int pix_height = pixGetHeight(pix);
60 int width = box.
width();
62 Box* blob_pix_box = boxCreate(box.
left(), pix_height - box.
top(),
64 Pix* pix_blob = pixClipRectangle(pix, blob_pix_box,
NULL);
65 boxDestroy(&blob_pix_box);
// Distance function of the clipped blob image; the clipped image is released
// straight afterwards. The result is read back below one byte per pixel via
// GET_DATA_BYTE.
66 Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
67 pixDestroy(&pix_blob);
// Raw access to the distance image: `data` is the first row, `wpl` the
// number of 32-bit words per row.
69 uinT32* data = pixGetData(dist_pix);
70 int wpl = pixGetWpl(dist_pix);
// Horizontal pass: walk each row of the distance image and accumulate a
// histogram in h_stats.
72 STATS h_stats(0, width + 1);
73 for (
int y = 0; y < height; ++y) {
74 uinT32* pixels = data + y*wpl;
76 int pixel = GET_DATA_BYTE(pixels, 0);
77 for (
int x = 1; x < width; ++x) {
78 int next_pixel = GET_DATA_BYTE(pixels, x);
// Only count where prev_pixel < pixel and pixel equals the values at column
// x - 1 in the rows above and below (the first/last row skips the neighbour
// that does not exist).
81 if (prev_pixel < pixel &&
82 (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
83 (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
// pixel strictly greater than next_pixel: record 2 * pixel - 1.
84 if (pixel > next_pixel) {
86 h_stats.
add(pixel * 2 - 1, 1);
87 }
// pixel equal to next_pixel and the value after that is lower:
// record 2 * pixel.
else if (pixel == next_pixel && x + 1 < width &&
88 pixel > GET_DATA_BYTE(pixels, x + 1)) {
90 h_stats.
add(pixel * 2, 1);
// Vertical pass: the same measurement taken down each column, accumulated
// in v_stats.
98 STATS v_stats(0, height + 1);
99 for (
int x = 0; x < width; ++x) {
101 int pixel = GET_DATA_BYTE(data, x);
102 for (
int y = 1; y < height; ++y) {
103 uinT32* pixels = data + y*wpl;
104 int next_pixel = GET_DATA_BYTE(pixels, x);
// Only count where prev_pixel < pixel and pixel equals the values at columns
// x - 1 and x + 1 of the row at pixels - wpl (the first/last column skips
// the neighbour that does not exist).
107 if (prev_pixel < pixel &&
108 (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
109 (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
// pixel strictly greater than next_pixel: record 2 * pixel - 1.
110 if (pixel > next_pixel) {
112 v_stats.
add(pixel * 2 - 1, 1);
113 }
// pixel equal to next_pixel and the value one row further down is lower:
// record 2 * pixel.
else if (pixel == next_pixel && y + 1 < height &&
114 pixel > GET_DATA_BYTE(pixels + wpl, x)) {
116 v_stats.
add(pixel * 2, 1);
// The distance image is no longer needed once both histograms are built.
123 pixDestroy(&dist_pix);
// Each histogram's sample count is tested against (width + height) / 4;
// what is done with the outcome is in lines not shown here.
130 if (h_stats.
get_total() >= (width + height) / 4) {
132 if (v_stats.
get_total() >= (width + height) / 4)
137 if (v_stats.
get_total() >= (width + height) / 4 ||
// NOTE(review): partial listing - the start of this signature and several
// body lines (including how port_block, blob_it and newblob are set up) are
// not shown. Comments describe the visible statements only.
158 TO_BLOCK_LIST *port_blocks) {
162 BLOCK_IT block_it = blocks;
164 BLOBNBOX_IT port_box_it;
// Output iterator used to append to port_blocks.
166 TO_BLOCK_IT port_block_it = port_blocks;
// Visit every block in `blocks`.
169 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
170 block = block_it.data();
// Blobs appended through port_box_it go into port_block->blobs.
174 port_box_it.set_to_list(&port_block->
blobs);
// First pass over blob_it: each element is extracted from its list and a
// `newblob` is appended to the port block's blob list.
176 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
177 blob = blob_it.extract();
180 port_box_it.add_after_then_move(newblob);
// Second pass of the same shape; presumably blob_it has been pointed at a
// different list in the lines not shown - confirm.
188 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
189 blob = blob_it.extract();
192 port_box_it.add_after_then_move(newblob);
// Append the populated port_block to the output list.
195 port_block_it.add_after_then_move(port_block);
// NOTE(review): partial listing - the start of this signature and several
// body lines are not shown.
209 TO_BLOCK_LIST *to_blocks) {
// Image dimensions taken from the input pix.
210 int width = pixGetWidth(pix);
211 int height = pixGetHeight(pix);
// Diagnostic for an oversized image; the size test guarding it is not
// visible in this listing.
213 tprintf(
"Input image too large! (%d, %d)\n", width, height);
// Visit every block in `blocks`; the per-block work is in lines not shown.
219 BLOCK_IT block_it(blocks);
220 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
221 block_it.forward()) {
222 BLOCK* block = block_it.data();
// Coordinate built from the full image width and height.
229 ICOORD page_tr(width, height);
// NOTE(review): partial listing - only part of the signature and the loop
// skeleton are shown; the per-block work is not visible.
240 TO_BLOCK_LIST *blocks,
242 TO_BLOCK_IT block_it = blocks;
// Section compiled only when graphics support is enabled (contents not
// shown).
245 #ifndef GRAPHICS_DISABLED
248 #endif // GRAPHICS_DISABLED
// Visit every TO_BLOCK in `blocks`.
250 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
251 block_it.forward()) {
252 block = block_it.data();
// Second graphics-only section (contents not shown).
265 #ifndef GRAPHICS_DISABLED
279 #endif // GRAPHICS_DISABLED
// Redistributes the blobs of src_list between src_list, noise_list,
// small_list and large_list according to their bounding-box sizes.
// NOTE(review): this listing omits lines - most of the conditions guarding
// the moves below, the local declarations and the return statement are not
// shown, so the exact thresholds cannot be stated from here.
289 float Textord::filter_noise_blobs(
290 BLOBNBOX_LIST *src_list,
291 BLOBNBOX_LIST *noise_list,
292 BLOBNBOX_LIST *small_list,
293 BLOBNBOX_LIST *large_list) {
// One iterator per list.
298 BLOBNBOX_IT src_it = src_list;
299 BLOBNBOX_IT noise_it = noise_list;
300 BLOBNBOX_IT small_it = small_list;
301 BLOBNBOX_IT large_it = large_list;
// Pass 1: blobs are extracted from the source list and appended to the noise
// list or the small list (the deciding tests are not shown).
309 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
310 blob = src_it.data ();
312 noise_it.add_after_then_move (src_it.extract ());
315 small_it.add_after_then_move (src_it.extract ());
// Pass 2: collect the heights of the blobs still in the source list.
317 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
318 size_stats.add (src_it.data ()->bounding_box ().height (), 1);
// Height limits derived from initial_x (the rest of the max_y expression is
// not shown); the lower limit is half of initial_x, rounded down.
321 max_y = ceil(initial_x *
326 min_y = floor (initial_x / 2);
// Pass 3: re-examine the small list. Blobs are moved to the large list or,
// when their height is at least min_y, back to the source list.
328 small_it.move_to_first ();
329 for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
330 small_it.forward ()) {
331 height = small_it.data()->bounding_box().height();
333 large_it.add_after_then_move(small_it.extract ());
334 else if (height >= min_y)
335 src_it.add_after_then_move(small_it.extract ());
// Pass 4: re-examine the source list. Blobs go to the small list (test not
// shown) or, if taller than max_y or wider than max_x, to the large list;
// a height is also added to size_stats (presumably for the blobs that
// stay - the guarding branch is not shown).
338 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
339 height = src_it.data ()->bounding_box ().height ();
340 width = src_it.data ()->bounding_box ().width ();
342 small_it.add_after_then_move (src_it.extract ());
343 else if (height > max_y || width > max_x)
344 large_it.add_after_then_move (src_it.extract ());
346 size_stats.add (height, 1);
// initial_x is raised to max_height when that is larger.
352 if (max_height > initial_x)
353 initial_x = max_height;
// Walks every block and its rows, cleaning noise from words and deleting
// rows and blocks that end up empty.
// NOTE(review): this listing omits lines (parameters, some conditions and
// the counter updates are not shown).
364 void Textord::cleanup_blocks(
367 BLOCK_IT block_it = blocks;
// Counters reported by the diagnostics at the end.
371 int num_rows_all = 0;
373 int num_blocks_all = 0;
374 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
375 block_it.forward ()) {
// Iterate over the rows of the current block.
378 row_it.set_to_list (block_it.data ()->row_list ());
379 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
381 clean_small_noise_from_words(row_it.data());
// The row is deleted when its word list is empty, or when it is non-empty
// and clean_noise_from_row() returns true (together with a leading condition
// that is not shown).
383 && !row_it.data ()->word_list ()->empty ()
384 && clean_noise_from_row (row_it.data ()))
385 || row_it.data ()->word_list ()->empty ())
386 delete row_it.extract ();
// Word-level noise cleaning; presumably applied to rows that were kept
// (the enclosing branch is not shown).
389 clean_noise_from_words (row_it.data ());
// A block with no rows is deleted if it has no poly_block or its poly_block
// reports IsText().
397 if (block_it.data()->row_list()->empty() &&
398 (block_it.data()->poly_block() ==
NULL ||
399 block_it.data()->poly_block()->IsText())) {
400 delete block_it.extract();
// Diagnostics: row and block counts (the guarding condition is not shown).
406 tprintf(
"cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
409 tprintf(
"cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
// Examines every blob outline of every word in a row and gathers counts
// (dot_count, norm_count, super_norm_count, trans_count) used to judge the
// row; cleanup_blocks deletes the row when this returns true.
// NOTE(review): this listing omits lines - the parameter list, most of the
// classification conditions and the return statement are not shown.
419 BOOL8 Textord::clean_noise_from_row(
428 inT32 trans_count = 0;
429 inT32 trans_threshold;
432 inT32 super_norm_count;
446 super_norm_count = 0;
// Nested walk: words -> blobs -> outlines.
447 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
448 word = word_it.data ();
451 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
452 blob_it.forward ()) {
453 blob = blob_it.data ();
456 out_it.set_to_list (blob->
out_list ());
457 for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
459 outline = out_it.data ();
// Size test relative to the row's x-height, scaled by
// textord_noise_sizelimit.
465 if (blob_size < textord_noise_sizelimit * row->x_height ())
// Outlines that have child outlines are treated separately.
467 if (!outline->
child ()->empty ()
// Fragments of the blob-width and blob-size tests; the other operands are
// in lines not shown.
472 && blob_box.
width () <
474 && blob_box.
width () >
486 && blob_size < row->
x_height () * 2) {
// Excludes the first blob of the first word.
493 && (!word_it.at_first () || !blob_it.at_first ()))
// Per-blob debug output (the call name and guard are not shown).
498 (
"Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
500 blob_box.
top (), blob->
out_list ()->length (), trans_count,
// Per-row debug summary: dot/normal ratio (9999 when norm_count is zero),
// the raw counts, and the REJECTED/ACCEPTED verdict.
508 tprintf (
"Row ending at (%d,%g):",
510 tprintf (
" R=%g, dc=%d, nc=%d, %s\n",
511 norm_count > 0 ? (
float) dot_count / norm_count : 9999,
512 dot_count, norm_count,
514 && dot_count > 2 ?
"REJECTED" :
"ACCEPTED");
// Classifies each word of a row into word_dud[] (0, 1 or 2) from per-blob
// outline measurements, then makes a second pass acting on the flagged
// words.
// NOTE(review): this listing omits lines - the parameter list, most
// conditions and the action taken on flagged words are not shown.
527 void Textord::clean_noise_from_words(
537 inT32 trans_threshold;
// Starts from the total number of words in the row.
548 ok_words = word_it.length ();
// Pass 1: nested walk words -> blobs -> outlines to classify each word.
555 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
556 word = word_it.data ();
561 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
562 blob_it.forward ()) {
563 blob = blob_it.data ();
566 out_it.set_to_list (blob->
out_list ());
567 for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
569 outline = out_it.data ();
// Size test relative to the row's x-height, scaled by
// textord_noise_sizelimit.
575 if (blob_size < textord_noise_sizelimit * row->x_height ())
// Outlines that have child outlines are treated separately.
577 if (!outline->
child ()->empty ()
// Fragments of the blob-width and blob-size tests; the other operands are
// in lines not shown.
582 && blob_box.
width () <
584 && blob_box.
width () >
596 && blob_size < row->
x_height () * 2) {
// Excludes the first blob of the first word.
603 && (!word_it.at_first () || !blob_it.at_first ()))
// Word verdict: 2, 1 (dot_count exceeds norm_count scaled by
// textord_noise_normratio) or 0. The tests selecting 2 and the plain 0 cases
// are not shown.
608 word_dud[word_index] = 2;
609 else if (dot_count > norm_count * textord_noise_normratio)
610 word_dud[word_index] = 1;
612 word_dud[word_index] = 0;
615 word_dud[word_index] = 0;
616 if (word_dud[word_index] == 2)
// Pass 2: act on words flagged 2, and on words flagged 1 when dud_words
// exceeds ok_words.
624 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
625 if (word_dud[word_index] == 2
626 || (word_dud[word_index] == 1 && dud_words > ok_words)) {
627 word = word_it.data ();
// Walks the words of a row, deleting blobs and then words.
// NOTE(review): this listing omits lines - the min_size expression and the
// conditions guarding both deletions are not shown.
640 void Textord::clean_small_noise_from_words(
ROW *row) {
642 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
643 WERD* word = word_it.data();
// Integer size threshold for this word (the expression is not shown).
644 int min_size =
static_cast<int>(
// Inspect each blob's outlines; blobs failing the (unshown) test are
// removed from the word and freed.
647 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
648 C_BLOB* blob = blob_it.data();
649 C_OUTLINE_IT out_it(blob->
out_list());
650 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
655 delete blob_it.extract();
// When this is not the last word, the following word is looked up before
// the current one is deleted (what is done with next_word is not shown).
659 if (!word_it.at_last()) {
662 WERD* next_word = word_it.data_relative(1);
667 delete word_it.extract();
// Rebuilds row->baseline as a new QSPLINE, mixing flat segments placed at
// blob bottoms with segments copied from the existing baseline.
// NOTE(review): partial listing - the start of the signature, the local
// declarations, the computation of ydiff and the index increments are not
// shown. Comments describe the visible statements only.
681 double blshift_maxshift,
682 double blshift_xfraction) {
// First walk over the words (body not shown); blob_count is used to size
// the allocation below.
698 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
699 word = word_it.data ();
// Working storage sized from blob_count plus the number of existing
// baseline segments, times 3.
709 (
double *)
alloc_mem ((blob_count + row->baseline.segments) * 3 *
// The new spline starts at the same x as the existing baseline.
714 xstarts[0] = row->baseline.xcoords[0];
715 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
716 word = word_it.data ();
719 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
720 blob_it.forward ()) {
721 blob = blob_it.data ();
// Horizontal midpoint of the blob's bounding box.
723 x_centre = (blob_box.
left () + blob_box.
right ()) / 2.0;
// Blobs passing this test (the remaining conditions are not shown) get
// their own flat segment: a = b = 0 and c = blob bottom, starting at the
// blob's left edge if the current start is at or beyond the centre, with
// the following segment starting just past the blob's right edge.
729 if (ydiff < blshift_maxshift
731 if (xstarts[dest_index] >= x_centre)
732 xstarts[dest_index] = blob_box.
left ();
733 coeffs[dest_index * 3] = 0;
734 coeffs[dest_index * 3 + 1] = 0;
735 coeffs[dest_index * 3 + 2] = blob_box.
bottom ();
738 xstarts[dest_index] = blob_box.
right () + 1;
// Otherwise copy across the existing baseline segments that end at or
// before x_centre.
741 if (xstarts[dest_index] <= x_centre) {
// NOTE(review): xcoords[src_index + 1] is read before the src_index bound is
// tested in this condition - confirm it cannot index past the array.
742 while (row->baseline.xcoords[src_index + 1] <= x_centre
743 && src_index < row->
baseline.segments - 1) {
// Only segments ending after the current destination start are copied.
744 if (row->baseline.xcoords[src_index + 1] >
745 xstarts[dest_index]) {
746 coeffs[dest_index * 3] =
747 row->baseline.quadratics[src_index].
a;
748 coeffs[dest_index * 3 + 1] =
749 row->baseline.quadratics[src_index].
b;
750 coeffs[dest_index * 3 + 2] =
751 row->baseline.quadratics[src_index].
c;
753 xstarts[dest_index] =
754 row->baseline.xcoords[src_index + 1];
// Copy the segment at src_index after the loop above.
758 coeffs[dest_index * 3] =
759 row->baseline.quadratics[src_index].
a;
760 coeffs[dest_index * 3 + 1] =
761 row->baseline.quadratics[src_index].
b;
762 coeffs[dest_index * 3 + 2] =
763 row->baseline.quadratics[src_index].
c;
765 xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
// After the last blob: loop over source segments ending at or before the
// current start (loop body not shown), then copy the remaining segments.
770 while (src_index < row->
baseline.segments
771 && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
773 while (src_index < row->
baseline.segments) {
774 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].
a;
775 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].
b;
776 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].
c;
779 xstarts[dest_index] = row->baseline.xcoords[src_index];
// Replace the row's baseline with a spline built from dest_index, xstarts
// and coeffs.
782 row->baseline =
QSPLINE (dest_index, xstarts, coeffs);