17 #include "allheaders.h"
39 TrainingSampleSet::FontClassInfo::FontClassInfo()
40 : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0
f) {
44 bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp)
const {
45 if (fwrite(&num_raw_samples,
sizeof(num_raw_samples), 1, fp) != 1)
47 if (fwrite(&canonical_sample,
sizeof(canonical_sample), 1, fp) != 1)
49 if (fwrite(&canonical_dist,
sizeof(canonical_dist), 1, fp) != 1)
return false;
50 if (!samples.Serialize(fp))
return false;
55 bool TrainingSampleSet::FontClassInfo::DeSerialize(
bool swap, FILE* fp) {
56 if (fread(&num_raw_samples,
sizeof(num_raw_samples), 1, fp) != 1)
58 if (fread(&canonical_sample,
sizeof(canonical_sample), 1, fp) != 1)
60 if (fread(&canonical_dist,
sizeof(canonical_dist), 1, fp) != 1)
return false;
61 if (!samples.DeSerialize(swap, fp))
return false;
63 ReverseN(&num_raw_samples,
sizeof(num_raw_samples));
64 ReverseN(&canonical_sample,
sizeof(canonical_sample));
65 ReverseN(&canonical_dist,
sizeof(canonical_dist));
71 : num_raw_samples_(0), unicharset_size_(0),
72 font_class_array_(
NULL), fontinfo_table_(font_table) {
76 delete font_class_array_;
81 if (!samples_.
Serialize(fp))
return false;
83 if (!font_id_map_.
Serialize(fp))
return false;
84 inT8 not_null = font_class_array_ !=
NULL;
85 if (fwrite(¬_null,
sizeof(not_null), 1, fp) != 1)
return false;
96 num_raw_samples_ = samples_.
size();
98 if (!font_id_map_.
DeSerialize(swap, fp))
return false;
99 if (font_class_array_ !=
NULL) {
100 delete font_class_array_;
101 font_class_array_ =
NULL;
104 if (fread(¬_null,
sizeof(not_null), 1, fp) != 1)
return false;
110 unicharset_size_ = unicharset_.
size();
117 tprintf(
"Failed to load unicharset from file %s\n"
118 "Building unicharset for boosting from scratch...\n",
124 unicharset_size_ = unicharset_.
size();
134 tprintf(
"Error: Size of unicharset in TrainingSampleSet::AddSample is "
135 "greater than MAX_NUM_CLASSES\n");
149 num_raw_samples_ = samples_.
size();
150 unicharset_size_ = unicharset_.
size();
158 bool randomize)
const {
160 if (font_id < 0 || class_id < 0 ||
161 font_id >= font_id_map_.
SparseSize() || class_id >= unicharset_size_) {
169 return (*font_class_array_)(font_index, class_id).samples.size();
176 return samples_[index];
185 if (font_index < 0)
return NULL;
186 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
187 return samples_[sample_index];
196 if (font_index < 0)
return NULL;
197 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
198 return samples_[sample_index];
207 return STRING(fontinfo_table_.
get(sample.
font_id()).name) +
" " + boxfile_str;
213 int font_id,
int class_id)
const {
216 return (*font_class_array_)(font_index, class_id).cloud_features;
221 int font_id,
int class_id)
const {
224 return (*font_class_array_)(font_index, class_id).canonical_features;
239 double dist_sum = 0.0;
244 for (
int i = 0; i < num_fonts1; ++i) {
246 for (
int j = 0; j < num_fonts2; ++j) {
256 for (
int i = 0; i < num_fonts1; ++i) {
258 for (
int j = 0; j < num_fonts2; ++j) {
262 tprintf(
"Cluster dist %d %d %d %d = %g\n",
275 for (
int i = 0; i <
num_samples; ++i, index += increment) {
276 int f1 = uf1.
font_ids[i % num_fonts1];
277 int f2 = uf2.
font_ids[index % num_fonts2];
279 tprintf(
"Cluster dist %d %d %d %d = %g\n",
286 if (dist_count == 0) {
291 return dist_sum / dist_count;
298 int font_id2,
int class_id2,
303 if (font_index1 < 0 || font_index2 < 0)
305 FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
306 if (font_id1 == font_id2) {
308 if (fc_info.unichar_distance_cache.size() == 0)
309 fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
310 if (fc_info.unichar_distance_cache[class_id2] < 0) {
315 fc_info.unichar_distance_cache[class_id2] = result;
317 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
318 if (fc_info2.unichar_distance_cache.size() == 0)
319 fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
320 fc_info2.unichar_distance_cache[class_id1] = result;
322 return fc_info.unichar_distance_cache[class_id2];
323 }
else if (class_id1 == class_id2) {
325 if (fc_info.font_distance_cache.size() == 0)
326 fc_info.font_distance_cache.init_to_size(font_id_map_.
CompactSize(),
328 if (fc_info.font_distance_cache[font_index2] < 0) {
333 fc_info.font_distance_cache[font_index2] = result;
335 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
336 if (fc_info2.font_distance_cache.size() == 0)
337 fc_info2.font_distance_cache.init_to_size(font_id_map_.
CompactSize(),
339 fc_info2.font_distance_cache[font_index1] = result;
341 return fc_info.font_distance_cache[font_index2];
346 while (cache_index < fc_info.distance_cache.size() &&
347 (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
348 fc_info.distance_cache[cache_index].font_id != font_id2))
350 if (cache_index == fc_info.distance_cache.size()) {
355 FontClassDistance fc_dist = { class_id2, font_id2, result };
356 fc_info.distance_cache.push_back(fc_dist);
359 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
360 fc_dist.unichar_id = class_id1;
361 fc_dist.font_id = font_id1;
362 fc_info2.distance_cache.push_back(fc_dist);
364 return fc_info.distance_cache[cache_index].distance;
369 int font_id1,
int class_id1,
int font_id2,
int class_id2,
377 return static_cast<float>(dist) / denominator;
383 static void AddNearFeatures(
const IntFeatureMap& feature_map,
int f,
int levels,
385 int prev_num_features = 0;
387 int num_features = 1;
388 for (
int level = 0; level < levels; ++level) {
389 for (
int i = prev_num_features; i < num_features; ++i) {
390 int feature = (*good_features)[i];
391 for (
int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
392 if (dir == 0)
continue;
399 prev_num_features = num_features;
400 num_features = good_features->
size();
415 int font_id2,
int class_id2,
417 bool thorough)
const {
425 if (cloud1.
size() == 0)
426 return canonical2.
size();
429 for (
int f = 0; f < canonical2.
size(); ++
f) {
430 int feature = canonical2[
f];
435 AddNearFeatures(feature_map, feature, 1, &good_features);
438 for (i = 0; i < good_features.
size(); ++i) {
439 int good_f = good_features[i];
440 if (cloud1[good_f]) {
444 if (i < good_features.
size())
457 if (font_index < 0)
return -1;
458 return (*font_class_array_)(font_index, class_id).samples[index];
464 int font_id,
int class_id)
const {
467 if (font_index < 0)
return NULL;
468 int sample_index = (*font_class_array_)(font_index,
469 class_id).canonical_sample;
470 return sample_index >= 0 ? samples_[sample_index] :
NULL;
478 if (font_index < 0)
return 0.0f;
479 if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
480 return (*font_class_array_)(font_index, class_id).canonical_dist;
487 for (
int s = 0; s < samples_.
size(); ++s)
495 if (font_class_array_ ==
NULL)
499 pixa = pixaCreate(0);
501 int fs_size = feature_space.
Size();
503 for (
int font_index = 0; font_index < font_size; ++font_index) {
504 for (
int c = 0; c < unicharset_size_; ++c) {
508 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
509 int sample_count = fcinfo.samples.size();
512 for (
int i = 0; i < sample_count; ++i) {
513 int s = fcinfo.samples[i];
515 for (
int f = 0; f < features.
size(); ++
f) {
516 ++feature_counts[features[
f]];
519 for (
int i = 0; i < sample_count; ++i) {
520 int s = fcinfo.samples[i];
525 int good_features = 0;
526 int bad_features = 0;
527 for (
int f = 0; f < features.
size(); ++
f) {
528 if (feature_counts[features[f]] > 1)
534 if (bad_features * 2 > good_features) {
535 tprintf(
"Deleting outlier sample of %s, %d good, %d bad\n",
537 good_features, bad_features);
539 pixaAddPix(pixa, sample.
RenderToPix(&unicharset_), L_INSERT);
544 t = fcinfo.samples[1];
546 t = fcinfo.samples[i - 1];
548 pixaAddPix(pixa, csample.
RenderToPix(&unicharset_), L_INSERT);
559 Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
561 pixWrite(
"outliers.png", pix, IFF_PNG);
576 num_raw_samples_ = samples_.
size();
586 static Pix* DebugSample(
const UNICHARSET& unicharset,
588 tprintf(
"\nOriginal features:\n");
593 tprintf(
"\nMapped features:\n");
607 int compact_font_size = font_id_map_.
CompactSize();
609 if (font_class_array_ !=
NULL)
610 delete font_class_array_;
613 compact_font_size, unicharset_size_, empty);
614 for (
int s = 0; s < samples_.
size(); ++s) {
615 int font_id = samples_[s]->font_id();
616 int class_id = samples_[s]->class_id();
617 if (font_id < 0 || font_id >= font_id_map_.
SparseSize()) {
618 tprintf(
"Font id = %d/%d, class id = %d/%d on sample %d\n",
619 font_id, font_id_map_.
SparseSize(), class_id, unicharset_size_,
623 ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
625 (*font_class_array_)(font_index, class_id).samples.push_back(s);
629 for (
int f = 0; f < compact_font_size; ++
f) {
630 for (
int c = 0; c < unicharset_size_; ++c)
632 (*font_class_array_)(
f, c).samples.size();
636 num_raw_samples_ = samples_.
size();
644 for (
int s = 0; s < samples_.
size(); ++s) {
645 int font_id = samples_[s]->font_id();
646 while (font_id >= font_counts.
size())
648 ++font_counts[font_id];
650 font_id_map_.
Init(font_counts.
size(),
false);
651 for (
int f = 0; f < font_counts.
size(); ++
f) {
652 font_id_map_.
SetMap(f, font_counts[f] > 0);
654 font_id_map_.
Setup();
669 double global_worst_dist = 0.0;
672 for (
int font_index = 0; font_index < font_size; ++font_index) {
674 for (
int c = 0; c < unicharset_size_; ++c) {
675 int samples_found = 0;
676 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
677 if (fcinfo.samples.size() == 0 ||
679 fcinfo.canonical_sample = -1;
680 fcinfo.canonical_dist = 0.0f;
681 if (debug)
tprintf(
"Skipping class %d\n", c);
686 double min_max_dist = 2.0;
689 double max_max_dist = 0.0;
692 fcinfo.canonical_sample = fcinfo.samples[0];
693 fcinfo.canonical_dist = 0.0f;
694 for (
int i = 0; i < fcinfo.samples.size(); ++i) {
695 int s1 = fcinfo.samples[i];
697 f_table.
Set(features1, features1.
size(),
true);
698 double max_dist = 0.0;
703 for (
int j = 0; j < fcinfo.samples.size(); ++j) {
704 int s2 = fcinfo.samples[j];
705 if (samples_[s2]->class_id() != c ||
706 samples_[s2]->font_id() != font_id ||
711 int height = samples_[s2]->geo_feature(
GeoTop) -
713 if (dist == 1.0 && height > 64) {
719 if (dist > max_dist) {
721 if (dist > max_max_dist) {
729 f_table.
Set(features1, features1.
size(),
false);
730 samples_[s1]->set_max_dist(max_dist);
732 if (max_dist < min_max_dist) {
733 fcinfo.canonical_sample = s1;
734 fcinfo.canonical_dist = max_dist;
736 UpdateRange(max_dist, &min_max_dist, &max_max_dist);
738 if (max_max_dist > global_worst_dist) {
740 global_worst_dist = max_max_dist;
745 tprintf(
"Found %d samples of class %d=%s, font %d, "
746 "dist range [%g, %g], worst pair= %s, %s\n",
748 font_index, min_max_dist, max_max_dist,
755 tprintf(
"Global worst dist = %g, between sample %d and %d\n",
756 global_worst_dist, worst_s1, worst_s2);
757 Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
758 Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
759 pixOr(pix1, pix1, pix2);
760 pixWrite(
"worstpair.png", pix1, IFF_PNG);
774 for (
int font_index = 0; font_index < font_size; ++font_index) {
775 for (
int c = 0; c < unicharset_size_; ++c) {
776 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
777 int sample_count = fcinfo.samples.size();
778 int min_samples = 2 *
MAX(kSampleRandomSize, sample_count);
779 if (sample_count > 0 && sample_count < min_samples) {
780 int base_count = sample_count;
781 for (
int base_index = 0; sample_count < min_samples; ++sample_count) {
782 int src_index = fcinfo.samples[base_index++];
783 if (base_index >= base_count) base_index = 0;
785 sample_count % kSampleRandomSize);
786 int sample_index = samples_.
size();
789 fcinfo.samples.push_back(sample_index);
803 for (
int font_index = 0; font_index < font_size; ++font_index) {
805 for (
int c = 0; c < unicharset_size_; ++c) {
807 if (num_samples == 0)
810 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
821 for (
int font_index = 0; font_index < font_size; ++font_index) {
823 for (
int c = 0; c < unicharset_size_; ++c) {
825 if (num_samples == 0)
827 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
828 fcinfo.cloud_features.Init(feature_space_size);
832 for (
int i = 0; i < sample_features.
size(); ++i)
833 fcinfo.cloud_features.SetBit(sample_features[i]);
860 for (
int f = 0; f < indexed_features.
size(); ++
f) {
861 if (indexed_features[f] == f_index) {