Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::TrainingSampleSet Class Reference

#include <trainingsampleset.h>

List of all members.

Classes

struct  FontClassDistance
struct  FontClassInfo

Public Member Functions

 TrainingSampleSet (const UnicityTable< FontInfo > &fontinfo_table)
 ~TrainingSampleSet ()
bool Serialize (FILE *fp) const
bool DeSerialize (bool swap, FILE *fp)
int num_samples () const
int num_raw_samples () const
int NumFonts () const
const UNICHARSET & unicharset () const
int charsetsize () const
void LoadUnicharset (const char *filename)
int AddSample (const char *unichar, TrainingSample *sample)
void AddSample (int unichar_id, TrainingSample *sample)
int NumClassSamples (int font_id, int class_id, bool randomize) const
const TrainingSample * GetSample (int index) const
const TrainingSample * GetSample (int font_id, int class_id, int index) const
TrainingSample * MutableSample (int font_id, int class_id, int index)
STRING SampleToString (const TrainingSample &sample) const
const BitVector & GetCloudFeatures (int font_id, int class_id) const
const GenericVector< int > & GetCanonicalFeatures (int font_id, int class_id) const
float UnicharDistance (const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
float ClusterDistance (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
float ComputeClusterDistance (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
int ReliablySeparable (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
int GlobalSampleIndex (int font_id, int class_id, int index) const
const TrainingSample * GetCanonicalSample (int font_id, int class_id) const
float GetCanonicalDist (int font_id, int class_id) const
TrainingSample * mutable_sample (int index)
TrainingSample * extract_sample (int index)
void IndexFeatures (const IntFeatureSpace &feature_space)
void DeleteOutliers (const IntFeatureSpace &feature_space, bool debug)
void KillSample (TrainingSample *sample)
void DeleteDeadSamples ()
bool DeleteableSample (const TrainingSample *sample)
void OrganizeByFontAndClass ()
void SetupFontIdMap ()
void ComputeCanonicalSamples (const IntFeatureMap &map, bool debug)
void ReplicateAndRandomizeSamples ()
void ComputeCanonicalFeatures ()
void ComputeCloudFeatures (int feature_space_size)
void AddAllFontsForClass (int class_id, Shape *shape) const
void DisplaySamplesWithFeature (int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const

Detailed Description

Definition at line 43 of file trainingsampleset.h.


Constructor & Destructor Documentation

tesseract::TrainingSampleSet::TrainingSampleSet ( const UnicityTable< FontInfo > &  fontinfo_table)
explicit

Definition at line 70 of file trainingsampleset.cpp.

// Member initializer list: zeroes the raw-sample and unicharset-size counters,
// leaves font_class_array_ unallocated (NULL) and initialises fontinfo_table_
// from the font table argument.
// NOTE(review): the initializer reads `font_table`, while the signature shown
// for this constructor names the parameter `fontinfo_table` -- presumably the
// .cpp definition uses a different parameter name than the header; confirm.
: num_raw_samples_(0), unicharset_size_(0),
font_class_array_(NULL), fontinfo_table_(font_table) {
}
tesseract::TrainingSampleSet::~TrainingSampleSet ( )

Definition at line 75 of file trainingsampleset.cpp.

{
// Releases the font/class lookup array. Deleting a NULL pointer is a no-op,
// so this is safe even when the array was never built.
delete font_class_array_;
}

Member Function Documentation

void tesseract::TrainingSampleSet::AddAllFontsForClass ( int  class_id,
Shape *  shape
) const

Definition at line 840 of file trainingsampleset.cpp.

{
  // Add class_id to the shape once for every font in the compact font map,
  // translating each compact index back to its sparse font id.
  const int num_compact_fonts = font_id_map_.CompactSize();
  for (int compact = 0; compact < num_compact_fonts; ++compact) {
    shape->AddToShape(class_id, font_id_map_.CompactToSparse(compact));
  }
}
int tesseract::TrainingSampleSet::AddSample ( const char *  unichar,
TrainingSample *  sample
)

Definition at line 130 of file trainingsampleset.cpp.

{
  // Grow the local unicharset the first time this unichar is seen, refusing
  // to go past MAX_NUM_CLASSES (in which case -1 is returned and the sample
  // is not added).
  const bool already_known = unicharset_.contains_unichar(unichar);
  if (!already_known) {
    unicharset_.unichar_insert(unichar);
    if (unicharset_.size() > MAX_NUM_CLASSES) {
      tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
              "greater than MAX_NUM_CLASSES\n");
      return -1;
    }
  }
  // Delegate to the id-based overload and report the id that was used.
  const UNICHAR_ID id = unicharset_.unichar_to_id(unichar);
  AddSample(id, sample);
  return id;
}
void tesseract::TrainingSampleSet::AddSample ( int  unichar_id,
TrainingSample *  sample
)

Definition at line 146 of file trainingsampleset.cpp.

{
  // Tag the sample with its class id and append it to the sample list.
  sample->set_class_id(unichar_id);
  samples_.push_back(sample);
  // Refresh both cached sizes so they reflect the current unicharset and the
  // sample that was just appended.
  unicharset_size_ = unicharset_.size();
  num_raw_samples_ = samples_.size();
}
int tesseract::TrainingSampleSet::charsetsize ( ) const
inline

Definition at line 67 of file trainingsampleset.h.

{
// Returns the cached unicharset size (unicharset_size_), which other members
// refresh from unicharset_.size() whenever they change or load the unicharset.
return unicharset_size_;
}
float tesseract::TrainingSampleSet::ClusterDistance ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap &  feature_map
)

Definition at line 297 of file trainingsampleset.cpp.

{
// Returns the distance between the (font_id1, class_id1) cluster and the
// (font_id2, class_id2) cluster. The value is computed by
// ComputeClusterDistance on first request and then cached in the
// FontClassInfo entries of BOTH clusters, so later lookups in either
// direction hit the cache. Returns 0 if either font has no compact index.
// Three caches are used: one per class for same-font pairs, one per font for
// same-class pairs, and a linearly searched list for everything else.
ASSERT_HOST(font_class_array_ != NULL);
int font_index1 = font_id_map_.SparseToCompact(font_id1);
int font_index2 = font_id_map_.SparseToCompact(font_id2);
if (font_index1 < 0 || font_index2 < 0)
return 0.0f;
FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
if (font_id1 == font_id2) {
// Special case cache for speed.
// The cache is allocated lazily; -1 marks an entry not yet computed.
if (fc_info.unichar_distance_cache.size() == 0)
fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
if (fc_info.unichar_distance_cache[class_id2] < 0) {
// Distance has to be calculated.
float result = ComputeClusterDistance(font_id1, class_id1,
font_id2, class_id2,
feature_map);
fc_info.unichar_distance_cache[class_id2] = result;
// Copy to the symmetric cache entry.
FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
if (fc_info2.unichar_distance_cache.size() == 0)
fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
fc_info2.unichar_distance_cache[class_id1] = result;
}
return fc_info.unichar_distance_cache[class_id2];
} else if (class_id1 == class_id2) {
// Another special-case cache for equal class-id.
// This cache is indexed by compact font index, not by sparse font id.
if (fc_info.font_distance_cache.size() == 0)
fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
-1.0f);
if (fc_info.font_distance_cache[font_index2] < 0) {
// Distance has to be calculated.
float result = ComputeClusterDistance(font_id1, class_id1,
font_id2, class_id2,
feature_map);
fc_info.font_distance_cache[font_index2] = result;
// Copy to the symmetric cache entry.
FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
if (fc_info2.font_distance_cache.size() == 0)
fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
-1.0f);
fc_info2.font_distance_cache[font_index1] = result;
}
return fc_info.font_distance_cache[font_index2];
}
// Both font and class are different. Linear search for class_id2/font_id2
// in what is a hopefully short list of distances.
int cache_index = 0;
while (cache_index < fc_info.distance_cache.size() &&
(fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
fc_info.distance_cache[cache_index].font_id != font_id2))
++cache_index;
if (cache_index == fc_info.distance_cache.size()) {
// Distance has to be calculated.
float result = ComputeClusterDistance(font_id1, class_id1,
font_id2, class_id2,
feature_map);
FontClassDistance fc_dist = { class_id2, font_id2, result };
// After this push_back, cache_index addresses the entry just appended.
fc_info.distance_cache.push_back(fc_dist);
// Copy to the symmetric cache entry. We know it isn't there already, as
// we always copy to the symmetric entry.
FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
fc_dist.unichar_id = class_id1;
fc_dist.font_id = font_id1;
fc_info2.distance_cache.push_back(fc_dist);
}
return fc_info.distance_cache[cache_index].distance;
}
void tesseract::TrainingSampleSet::ComputeCanonicalFeatures ( )

Definition at line 800 of file trainingsampleset.cpp.

{
  ASSERT_HOST(font_class_array_ != NULL);
  const int num_compact_fonts = font_id_map_.CompactSize();
  for (int compact = 0; compact < num_compact_fonts; ++compact) {
    const int font_id = font_id_map_.CompactToSparse(compact);
    for (int class_id = 0; class_id < unicharset_size_; ++class_id) {
      // Only font/class pairs that own raw samples get canonical features:
      // a copy of the indexed features of the pair's canonical sample.
      if (NumClassSamples(font_id, class_id, false) != 0) {
        const TrainingSample* canonical = GetCanonicalSample(font_id, class_id);
        (*font_class_array_)(compact, class_id).canonical_features =
            canonical->indexed_features();
      }
    }
  }
}
void tesseract::TrainingSampleSet::ComputeCanonicalSamples ( const IntFeatureMap &  map,
bool  debug 
)

Definition at line 661 of file trainingsampleset.cpp.

{
// For every font/class pair with samples, selects a canonical sample and
// records it (canonical_sample) together with its distance (canonical_dist)
// in the pair's FontClassInfo. Each sample's own maximum distance to its
// peers is stored on the sample via set_max_dist. Pairs with no samples, or
// excluded by kTestChar, get canonical_sample = -1.
// With debug set, progress is printed and an image combining the globally
// worst pair is written to "worstpair.png".
ASSERT_HOST(font_class_array_ != NULL);
IntFeatureDist f_table;
if (debug) tprintf("feature table size %d\n", map.sparse_size());
f_table.Init(&map);
int worst_s1 = 0;
int worst_s2 = 0;
double global_worst_dist = 0.0;
// Compute distances independently for each font and char index.
int font_size = font_id_map_.CompactSize();
for (int font_index = 0; font_index < font_size; ++font_index) {
int font_id = font_id_map_.CompactToSparse(font_index);
for (int c = 0; c < unicharset_size_; ++c) {
int samples_found = 0;
FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
if (fcinfo.samples.size() == 0 ||
(kTestChar >= 0 && c != kTestChar)) {
fcinfo.canonical_sample = -1;
fcinfo.canonical_dist = 0.0f;
if (debug) tprintf("Skipping class %d\n", c);
continue;
}
// The canonical sample will be the one with the min_max_dist, which
// is the sample with the lowest maximum distance to all other samples.
double min_max_dist = 2.0;
// We keep track of the farthest apart pair (max_s1, max_s2) which
// are max_max_dist apart, so we can see how bad the variability is.
double max_max_dist = 0.0;
int max_s1 = 0;
int max_s2 = 0;
// Default to the first sample in case no sample improves on min_max_dist.
fcinfo.canonical_sample = fcinfo.samples[0];
fcinfo.canonical_dist = 0.0f;
for (int i = 0; i < fcinfo.samples.size(); ++i) {
int s1 = fcinfo.samples[i];
const GenericVector<int>& features1 = samples_[s1]->indexed_features();
f_table.Set(features1, features1.size(), true);
double max_dist = 0.0;
// Run the full squared-order search for similar samples. It is still
// reasonably fast because f_table.FeatureDistance is fast, but we
// may have to reconsider if we start playing with too many samples
// of a single char/font.
for (int j = 0; j < fcinfo.samples.size(); ++j) {
int s2 = fcinfo.samples[j];
if (samples_[s2]->class_id() != c ||
samples_[s2]->font_id() != font_id ||
s2 == s1)
continue;
// NOTE(review): this copies the feature vector for every pair; a const
// reference (as used for features1) would presumably suffice -- confirm
// that FeatureDistance/DebugFeatureDistance take their argument by const&.
GenericVector<int> features2 = samples_[s2]->indexed_features();
double dist = f_table.FeatureDistance(features2);
int height = samples_[s2]->geo_feature(GeoTop) -
samples_[s2]->geo_feature(GeoBottom);
if (dist == 1.0 && height > 64) {
// TODO(rays) rethink this when the polygonal approximation goes.
// Currently it is possible for dots and other small characters
// to be completely different, even within the same class.
f_table.DebugFeatureDistance(features2);
}
if (dist > max_dist) {
max_dist = dist;
// max_max_dist itself is not assigned here; it is only passed to
// UpdateRange() once the inner loop has finished.
if (dist > max_max_dist) {
max_s1 = s1;
max_s2 = s2;
}
}
}
// Using Set(..., false) is far faster than re initializing, due to
// the sparseness of the feature space.
f_table.Set(features1, features1.size(), false);
samples_[s1]->set_max_dist(max_dist);
++samples_found;
if (max_dist < min_max_dist) {
fcinfo.canonical_sample = s1;
fcinfo.canonical_dist = max_dist;
}
// Presumably widens [min_max_dist, max_max_dist] to include max_dist --
// UpdateRange is defined elsewhere; confirm.
UpdateRange(max_dist, &min_max_dist, &max_max_dist);
}
if (max_max_dist > global_worst_dist) {
// Keep a record of the worst pair over all characters/fonts too.
global_worst_dist = max_max_dist;
worst_s1 = max_s1;
worst_s2 = max_s2;
}
if (debug) {
tprintf("Found %d samples of class %d=%s, font %d, "
"dist range [%g, %g], worst pair= %s, %s\n",
samples_found, c, unicharset_.debug_str(c).string(),
font_index, min_max_dist, max_max_dist,
SampleToString(*samples_[max_s1]).string(),
SampleToString(*samples_[max_s2]).string());
}
}
}
if (debug) {
tprintf("Global worst dist = %g, between sample %d and %d\n",
global_worst_dist, worst_s1, worst_s2);
// OR the two debug images together and write the result to disk.
Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
pixOr(pix1, pix1, pix2);
pixWrite("worstpair.png", pix1, IFF_PNG);
pixDestroy(&pix1);
pixDestroy(&pix2);
}
}
void tesseract::TrainingSampleSet::ComputeCloudFeatures ( int  feature_space_size)

Definition at line 818 of file trainingsampleset.cpp.

{
  ASSERT_HOST(font_class_array_ != NULL);
  const int num_compact_fonts = font_id_map_.CompactSize();
  for (int compact = 0; compact < num_compact_fonts; ++compact) {
    const int font_id = font_id_map_.CompactToSparse(compact);
    for (int class_id = 0; class_id < unicharset_size_; ++class_id) {
      const int raw_count = NumClassSamples(font_id, class_id, false);
      if (raw_count == 0)
        continue;
      // The cloud is a bit vector with one bit set for every indexed feature
      // that occurs in any raw sample of this font/class pair.
      BitVector& cloud = (*font_class_array_)(compact, class_id).cloud_features;
      cloud.Init(feature_space_size);
      for (int s = 0; s < raw_count; ++s) {
        const GenericVector<int>& features =
            GetSample(font_id, class_id, s)->indexed_features();
        for (int f = 0; f < features.size(); ++f)
          cloud.SetBit(features[f]);
      }
    }
  }
}
float tesseract::TrainingSampleSet::ComputeClusterDistance ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap &  feature_map
) const

Definition at line 368 of file trainingsampleset.cpp.

{
  // Count, in each direction, the canonical features of one cluster that
  // ReliablySeparable reports as lying outside the other cluster's cloud.
  const int forward = ReliablySeparable(font_id1, class_id1,
                                        font_id2, class_id2,
                                        feature_map, false);
  const int backward = ReliablySeparable(font_id2, class_id2,
                                         font_id1, class_id1,
                                         feature_map, false);
  // Normalise by the combined number of canonical features of both clusters.
  const int total_features =
      GetCanonicalFeatures(font_id1, class_id1).size() +
      GetCanonicalFeatures(font_id2, class_id2).size();
  return static_cast<float>(forward + backward) / total_features;
}
bool tesseract::TrainingSampleSet::DeleteableSample ( const TrainingSample *  sample)

Definition at line 582 of file trainingsampleset.cpp.

{
// Predicate: true for a NULL entry (e.g. one emptied by extract_sample) or a
// sample whose class id is negative.
return sample == NULL || sample->class_id() < 0;
}
void tesseract::TrainingSampleSet::DeleteDeadSamples ( )

Definition at line 573 of file trainingsampleset.cpp.

{
  // Compact the sample list, dropping every entry for which DeleteableSample
  // returns true, then refresh the raw-sample count to match.
  samples_.compact(
      NewPermanentTessCallback(this, &TrainingSampleSet::DeleteableSample));
  num_raw_samples_ = samples_.size();
  // Samples must be re-organized now we have deleted a few.
}
void tesseract::TrainingSampleSet::DeleteOutliers ( const IntFeatureSpace &  feature_space,
bool  debug 
)

Definition at line 493 of file trainingsampleset.cpp.

{
  // Marks and deletes samples whose features are mostly unique to themselves
  // within their font/class group. With debug set, the outliers (each next to
  // a neighbouring sample for comparison) are written to "outliers.png".
  //
  // The per-font/class array is needed below, so build it if it is missing.
  if (font_class_array_ == NULL)
    OrganizeByFontAndClass();
  Pixa* pixa = NULL;
  if (debug)
    pixa = pixaCreate(0);
  GenericVector<int> feature_counts;
  int fs_size = feature_space.Size();
  int font_size = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < font_size; ++font_index) {
    for (int c = 0; c < unicharset_size_; ++c) {
      // Create a histogram of the features used by all samples of this
      // font/class combination.
      feature_counts.init_to_size(fs_size, 0);
      FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
      int sample_count = fcinfo.samples.size();
      // Groups smaller than kMinOutlierSamples are left untouched.
      if (sample_count < kMinOutlierSamples)
        continue;
      for (int i = 0; i < sample_count; ++i) {
        int s = fcinfo.samples[i];
        const GenericVector<int>& features = samples_[s]->indexed_features();
        for (int f = 0; f < features.size(); ++f) {
          ++feature_counts[features[f]];
        }
      }
      for (int i = 0; i < sample_count; ++i) {
        int s = fcinfo.samples[i];
        const TrainingSample& sample = *samples_[s];
        const GenericVector<int>& features = sample.indexed_features();
        // A feature that has a histogram count of 1 is only used by this
        // sample, making it 'bad'. All others are 'good'.
        int good_features = 0;
        int bad_features = 0;
        for (int f = 0; f < features.size(); ++f) {
          if (feature_counts[features[f]] > 1)
            ++good_features;
          else
            ++bad_features;
        }
        // If more than 1/3 features are bad, then this is an outlier.
        if (bad_features * 2 > good_features) {
          tprintf("Deleting outlier sample of %s, %d good, %d bad\n",
                  SampleToString(sample).string(),
                  good_features, bad_features);
          if (debug) {
            pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT);
            // Add the previous sample as well, so it is easier to see in
            // the output what is wrong with this sample.
            int t;
            if (i == 0)
              t = fcinfo.samples[1];
            else
              t = fcinfo.samples[i - 1];
            const TrainingSample &csample = *samples_[t];
            pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT);
          }
          // Mark the sample for deletion.
          KillSample(samples_[s]);
        }
      }
    }
  }
  // Truly delete all bad samples and renumber everything.
  DeleteDeadSamples();
  if (pixa != NULL) {
    Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
    pixaDestroy(&pixa);
    pixWrite("outliers.png", pix, IFF_PNG);
    pixDestroy(&pix);
  }
}
bool tesseract::TrainingSampleSet::DeSerialize ( bool  swap,
FILE *  fp 
)

Definition at line 94 of file trainingsampleset.cpp.

{
  // Read order: samples, unicharset, font id map, presence flag, then the
  // font/class array if the flag is set. Any failed read returns false.
  if (!samples_.DeSerialize(swap, fp)) return false;
  num_raw_samples_ = samples_.size();
  if (!unicharset_.load_from_file(fp) || !font_id_map_.DeSerialize(swap, fp))
    return false;
  // Discard any previously built array (deleting NULL is a no-op).
  delete font_class_array_;
  font_class_array_ = NULL;
  inT8 has_array;
  if (fread(&has_array, sizeof(has_array), 1, fp) != 1) return false;
  if (has_array) {
    // Start from a 1x1 placeholder and let DeSerializeClasses load the data.
    FontClassInfo empty;
    font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
    if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
  }
  unicharset_size_ = unicharset_.size();
  return true;
}
void tesseract::TrainingSampleSet::DisplaySamplesWithFeature ( int  f_index,
const Shape &  shape,
const IntFeatureSpace &  feature_space,
ScrollView::Color  color,
ScrollView *  window
) const

Definition at line 849 of file trainingsampleset.cpp.

{
  for (int index = 0; index < num_raw_samples(); ++index) {
    const TrainingSample* sample = GetSample(index);
    // Samples whose class is not part of the shape are ignored.
    if (!shape.ContainsUnichar(sample->class_id()))
      continue;
    GenericVector<int> indexed;
    space.IndexAndSortFeatures(sample->features(), sample->num_features(),
                               &indexed);
    // Draw the sample once for each occurrence of the requested feature.
    for (int f = 0; f < indexed.size(); ++f) {
      if (indexed[f] == f_index)
        sample->DisplayFeatures(color, window);
    }
  }
}
TrainingSample* tesseract::TrainingSampleSet::extract_sample ( int  index)
inline

Definition at line 162 of file trainingsampleset.h.

{
// Hands the sample at index to the caller and leaves a NULL in its slot;
// DeleteableSample treats NULL entries as deletable.
TrainingSample* sample = samples_[index];
samples_[index] = NULL;
return sample;
}
float tesseract::TrainingSampleSet::GetCanonicalDist ( int  font_id,
int  class_id 
) const

Definition at line 475 of file trainingsampleset.cpp.

{
  ASSERT_HOST(font_class_array_ != NULL);
  const int font_index = font_id_map_.SparseToCompact(font_id);
  // A font without a compact index, or a pair without a canonical sample,
  // reports a distance of zero.
  if (font_index < 0) return 0.0f;
  const FontClassInfo& info = (*font_class_array_)(font_index, class_id);
  return info.canonical_sample >= 0 ? info.canonical_dist : 0.0f;
}
const GenericVector< int > & tesseract::TrainingSampleSet::GetCanonicalFeatures ( int  font_id,
int  class_id 
) const

Definition at line 220 of file trainingsampleset.cpp.

{
// Returns a reference to the canonical feature list stored for the
// font/class pair. Unlike the sample getters, an unmapped font is a fatal
// assertion failure here rather than a soft "not found" result.
int font_index = font_id_map_.SparseToCompact(font_id);
ASSERT_HOST(font_index >= 0);
return (*font_class_array_)(font_index, class_id).canonical_features;
}
const TrainingSample * tesseract::TrainingSampleSet::GetCanonicalSample ( int  font_id,
int  class_id 
) const

Definition at line 463 of file trainingsampleset.cpp.

{
  ASSERT_HOST(font_class_array_ != NULL);
  const int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0) return NULL;
  // A negative canonical_sample means no canonical sample was chosen.
  const int canonical =
      (*font_class_array_)(font_index, class_id).canonical_sample;
  if (canonical < 0) return NULL;
  return samples_[canonical];
}
const BitVector & tesseract::TrainingSampleSet::GetCloudFeatures ( int  font_id,
int  class_id 
) const

Definition at line 212 of file trainingsampleset.cpp.

{
// Returns a reference to the cloud feature bit vector stored for the
// font/class pair; asserts that the font has a compact index.
int font_index = font_id_map_.SparseToCompact(font_id);
ASSERT_HOST(font_index >= 0);
return (*font_class_array_)(font_index, class_id).cloud_features;
}
const TrainingSample * tesseract::TrainingSampleSet::GetSample ( int  index) const

Definition at line 175 of file trainingsampleset.cpp.

{
// Direct lookup by global sample index; no bounds check is made here.
return samples_[index];
}
const TrainingSample * tesseract::TrainingSampleSet::GetSample ( int  font_id,
int  class_id,
int  index 
) const

Definition at line 181 of file trainingsampleset.cpp.

{
  ASSERT_HOST(font_class_array_ != NULL);
  const int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0) return NULL;
  // Translate the per-font/class index into a global index into samples_.
  const FontClassInfo& info = (*font_class_array_)(font_index, class_id);
  return samples_[info.samples[index]];
}
int tesseract::TrainingSampleSet::GlobalSampleIndex ( int  font_id,
int  class_id,
int  index 
) const

Definition at line 453 of file trainingsampleset.cpp.

{
  ASSERT_HOST(font_class_array_ != NULL);
  const int font_index = font_id_map_.SparseToCompact(font_id);
  // -1 signals a font with no compact index; otherwise return the global
  // sample index stored at the given position for this font/class pair.
  return font_index < 0
      ? -1
      : (*font_class_array_)(font_index, class_id).samples[index];
}
void tesseract::TrainingSampleSet::IndexFeatures ( const IntFeatureSpace &  feature_space)

Definition at line 486 of file trainingsampleset.cpp.

{
  // Ask every sample in samples_ to index its features in feature_space.
  const int sample_count = samples_.size();
  for (int i = 0; i < sample_count; ++i) {
    samples_[i]->IndexFeatures(feature_space);
  }
}
void tesseract::TrainingSampleSet::KillSample ( TrainingSample *  sample)

Definition at line 568 of file trainingsampleset.cpp.

{
// Marks the sample by setting its sample index to -1.
// NOTE(review): DeleteableSample() as shown tests class_id() < 0, not the
// sample index set here -- confirm that samples marked this way are really
// removed by DeleteDeadSamples().
sample->set_sample_index(-1);
}
void tesseract::TrainingSampleSet::LoadUnicharset ( const char *  filename)

Definition at line 115 of file trainingsampleset.cpp.

{
  // Load the unicharset from filename. On failure, report the file that could
  // not be read and fall back to a fresh unicharset holding only a space.
  if (!unicharset_.load_from_file(filename)) {
    tprintf("Failed to load unicharset from file %s\n"
            "Building unicharset for boosting from scratch...\n",
            filename);
    unicharset_.clear();
    // Space character needed to represent NIL_LIST classification.
    unicharset_.unichar_insert(" ");
  }
  // Keep the cached size in step with whichever unicharset is now in use.
  unicharset_size_ = unicharset_.size();
}
TrainingSample* tesseract::TrainingSampleSet::mutable_sample ( int  index)
inline

Definition at line 158 of file trainingsampleset.h.

{
// Direct, writable lookup by global sample index; no bounds check is made
// here.
return samples_[index];
}
TrainingSample * tesseract::TrainingSampleSet::MutableSample ( int  font_id,
int  class_id,
int  index 
)

Definition at line 192 of file trainingsampleset.cpp.

{
  ASSERT_HOST(font_class_array_ != NULL);
  const int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0) return NULL;
  // Translate the per-font/class index into a global index into samples_.
  const int global_index =
      (*font_class_array_)(font_index, class_id).samples[index];
  return samples_[global_index];
}
int tesseract::TrainingSampleSet::num_raw_samples ( ) const
inline

Definition at line 58 of file trainingsampleset.h.

{
// Returns the cached raw-sample count. It can be smaller than num_samples()
// after ReplicateAndRandomizeSamples() has appended copies to samples_.
return num_raw_samples_;
}
int tesseract::TrainingSampleSet::num_samples ( ) const
inline

Definition at line 55 of file trainingsampleset.h.

{
// Returns the total number of entries currently held in samples_.
return samples_.size();
}
int tesseract::TrainingSampleSet::NumClassSamples ( int  font_id,
int  class_id,
bool  randomize 
) const

Definition at line 157 of file trainingsampleset.cpp.

{
  ASSERT_HOST(font_class_array_ != NULL);
  // There are no samples if the font or class id is outside the known range.
  const bool font_in_range =
      font_id >= 0 && font_id < font_id_map_.SparseSize();
  const bool class_in_range = class_id >= 0 && class_id < unicharset_size_;
  if (!font_in_range || !class_in_range)
    return 0;
  const int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0)
    return 0;  // The font has no samples.
  // randomize selects the full list (including replicated samples); otherwise
  // only the raw samples are counted.
  const FontClassInfo& info = (*font_class_array_)(font_index, class_id);
  return randomize ? info.samples.size() : info.num_raw_samples;
}
int tesseract::TrainingSampleSet::NumFonts ( ) const
inline

Definition at line 61 of file trainingsampleset.h.

{
// Returns the sparse size of the font id map, i.e. the size of the font id
// range rather than the number of fonts that actually have samples.
return font_id_map_.SparseSize();
}
void tesseract::TrainingSampleSet::OrganizeByFontAndClass ( )

Definition at line 603 of file trainingsampleset.cpp.

{
  // (Re)builds font_class_array_, the 2-d array that lists, for each compact
  // font index and class id, the indices of the matching entries in samples_.
  //
  // Font indexes are sparse, so we used a map to compact them, so we can
  // have an efficient 2-d array of fonts and character classes.
  // The map must be rebuilt from the current samples before its size is read.
  SetupFontIdMap();
  int compact_font_size = font_id_map_.CompactSize();
  // Get a 2-d array of generic vectors.
  if (font_class_array_ != NULL)
    delete font_class_array_;
  FontClassInfo empty;
  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
      compact_font_size, unicharset_size_, empty);
  for (int s = 0; s < samples_.size(); ++s) {
    int font_id = samples_[s]->font_id();
    int class_id = samples_[s]->class_id();
    // Print the offending ids before the assertions below abort.
    if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
      tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
              font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
              s);
    }
    ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
    ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
    int font_index = font_id_map_.SparseToCompact(font_id);
    (*font_class_array_)(font_index, class_id).samples.push_back(s);
  }
  // Set the num_raw_samples member of the FontClassInfo, to set the boundary
  // between the raw samples and the replicated ones.
  for (int f = 0; f < compact_font_size; ++f) {
    for (int c = 0; c < unicharset_size_; ++c)
      (*font_class_array_)(f, c).num_raw_samples =
          (*font_class_array_)(f, c).samples.size();
  }
  // This is the global number of samples and also marks the boundary between
  // real and replicated samples.
  num_raw_samples_ = samples_.size();
}
int tesseract::TrainingSampleSet::ReliablySeparable ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap &  feature_map,
bool  thorough 
) const

Definition at line 414 of file trainingsampleset.cpp.

{
// Returns the number of canonical features of (font_id2, class_id2) that are
// not in the cloud of (font_id1, class_id1) and have no near neighbour (as
// collected by AddNearFeatures) in that cloud either.
// Returns 0 if cluster 2 has no canonical sample, and the full canonical
// feature count if cluster 1 has an empty cloud.
// The `thorough` parameter is not read anywhere in this body.
int result = 0;
// sample2 is fetched only to test that a canonical sample exists.
const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
if (sample2 == NULL)
return 0; // There are no canonical features.
const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
class_id2);
const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
if (cloud1.size() == 0)
return canonical2.size(); // There are no cloud features.
// Find a canonical2 feature that is not in cloud1.
for (int f = 0; f < canonical2.size(); ++f) {
int feature = canonical2[f];
if (cloud1[feature])
continue;
// Gather the near neighbours of f.
GenericVector<int> good_features;
AddNearFeatures(feature_map, feature, 1, &good_features);
// Check that none of the good_features are in the cloud.
int i;
for (i = 0; i < good_features.size(); ++i) {
int good_f = good_features[i];
if (cloud1[good_f]) {
break;
}
}
// An early break leaves i short of the list size.
if (i < good_features.size())
continue; // Found one in the cloud.
++result;
}
return result;
}
void tesseract::TrainingSampleSet::ReplicateAndRandomizeSamples ( )

Definition at line 771 of file trainingsampleset.cpp.

{
// For every non-empty font/class pair, appends randomized copies of its
// samples to samples_ (and to the pair's sample list) until the pair holds
// 2 * MAX(kSampleRandomSize, original count) samples. The copies are created
// by cycling round-robin through the original samples.
ASSERT_HOST(font_class_array_ != NULL);
int font_size = font_id_map_.CompactSize();
for (int font_index = 0; font_index < font_size; ++font_index) {
for (int c = 0; c < unicharset_size_; ++c) {
FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
int sample_count = fcinfo.samples.size();
int min_samples = 2 * MAX(kSampleRandomSize, sample_count);
if (sample_count > 0 && sample_count < min_samples) {
// base_count is fixed at the original size so that only original samples
// are used as sources, never the copies appended below.
int base_count = sample_count;
for (int base_index = 0; sample_count < min_samples; ++sample_count) {
int src_index = fcinfo.samples[base_index++];
if (base_index >= base_count) base_index = 0;
TrainingSample* sample = samples_[src_index]->RandomizedCopy(
sample_count % kSampleRandomSize);
// The copy's sample index is its position at the end of samples_.
int sample_index = samples_.size();
sample->set_sample_index(sample_index);
samples_.push_back(sample);
fcinfo.samples.push_back(sample_index);
}
}
}
}
}
STRING tesseract::TrainingSampleSet::SampleToString ( const TrainingSample &  sample) const

Definition at line 203 of file trainingsampleset.cpp.

{
// Builds a description of the sample: the name of its font from
// fontinfo_table_, a space, then the string produced by MakeBoxFileStr from
// the sample's unichar, bounding box and page number.
STRING boxfile_str;
MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
sample.bounding_box(), sample.page_num(), &boxfile_str);
return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
}
bool tesseract::TrainingSampleSet::Serialize ( FILE *  fp) const

Definition at line 80 of file trainingsampleset.cpp.

{
  // Write order: samples, unicharset, font id map, a one-byte flag saying
  // whether the font/class array exists, then the array itself if it does.
  if (!samples_.Serialize(fp) ||
      !unicharset_.save_to_file(fp) ||
      !font_id_map_.Serialize(fp))
    return false;
  const inT8 has_array = font_class_array_ != NULL;
  if (fwrite(&has_array, sizeof(has_array), 1, fp) != 1) return false;
  if (!has_array) return true;
  if (!font_class_array_->SerializeClasses(fp)) return false;
  return true;
}
void tesseract::TrainingSampleSet::SetupFontIdMap ( )

Definition at line 641 of file trainingsampleset.cpp.

{
  // Histogram of sample counts indexed by font id, grown on demand so that
  // it always reaches the largest font id seen so far.
  GenericVector<int> samples_per_font;
  for (int s = 0; s < samples_.size(); ++s) {
    const int font_id = samples_[s]->font_id();
    for (int size = samples_per_font.size(); size <= font_id; ++size)
      samples_per_font.push_back(0);
    ++samples_per_font[font_id];
  }
  // Only fonts that own at least one sample are switched on in the map.
  const int num_fonts = samples_per_font.size();
  font_id_map_.Init(num_fonts, false);
  for (int f = 0; f < num_fonts; ++f)
    font_id_map_.SetMap(f, samples_per_font[f] > 0);
  font_id_map_.Setup();
}
float tesseract::TrainingSampleSet::UnicharDistance ( const UnicharAndFonts &  uf1,
const UnicharAndFonts &  uf2,
bool  matched_fonts,
const IntFeatureMap &  feature_map
)

Definition at line 231 of file trainingsampleset.cpp.

{
// Returns the mean ClusterDistance between the unichar of uf1 and the unichar
// of uf2, averaged over font pairs chosen in one of three ways:
//  - matched_fonts: only pairs where both sides use the same font id;
//  - otherwise, all pairs if their number is at most kSquareLimit;
//  - otherwise, a subsample of MAX(num_fonts1, num_fonts2) pairs.
// If matched_fonts yields no pairs, the unmatched computation is used
// instead; if there are still no pairs, 0 is returned.
int num_fonts1 = uf1.font_ids.size();
int c1 = uf1.unichar_id;
int num_fonts2 = uf2.font_ids.size();
int c2 = uf2.unichar_id;
double dist_sum = 0.0;
int dist_count = 0;
// Hard-coded off; the tprintf calls guarded by it never run as written.
bool debug = false;
if (matched_fonts) {
// Compute distances only where fonts match.
for (int i = 0; i < num_fonts1; ++i) {
int f1 = uf1.font_ids[i];
for (int j = 0; j < num_fonts2; ++j) {
int f2 = uf2.font_ids[j];
if (f1 == f2) {
dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
++dist_count;
}
}
}
} else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
// Small enough sets to compute all the distances.
for (int i = 0; i < num_fonts1; ++i) {
int f1 = uf1.font_ids[i];
for (int j = 0; j < num_fonts2; ++j) {
int f2 = uf2.font_ids[j];
dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
if (debug) {
tprintf("Cluster dist %d %d %d %d = %g\n",
f1, c1, f2, c2,
ClusterDistance(f1, c1, f2, c2, feature_map));
}
++dist_count;
}
}
} else {
// Subsample distances, using the largest set once, and stepping through
// the smaller set so as to ensure that all the pairs are different.
// The step is switched to kPrime2 when kPrime1 equals num_fonts2, since
// the stepped index is taken modulo num_fonts2 below.
int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
int index = 0;
int num_samples = MAX(num_fonts1, num_fonts2);
for (int i = 0; i < num_samples; ++i, index += increment) {
int f1 = uf1.font_ids[i % num_fonts1];
int f2 = uf2.font_ids[index % num_fonts2];
if (debug) {
tprintf("Cluster dist %d %d %d %d = %g\n",
f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
}
dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
++dist_count;
}
}
if (dist_count == 0) {
// No font pairs were compared: retry without the matching constraint, or
// give up with a distance of zero.
if (matched_fonts)
return UnicharDistance(uf1, uf2, false, feature_map);
return 0.0f;
}
return dist_sum / dist_count;
}
const UNICHARSET& tesseract::TrainingSampleSet::unicharset ( ) const
inline

Definition at line 64 of file trainingsampleset.h.

{
// Read-only access to the unicharset owned by this sample set.
return unicharset_;
}

The documentation for this class was generated from the following files: