libeblearn
ebl::datasource< Tnet, Tdata > Class Template Reference

#include <datasource.h>

Inheritance diagram for ebl::datasource< Tnet, Tdata >:
ebl::labeled_datasource< Tnet, Tdata, Tlabel > ebl::class_datasource< Tnet, Tdata, Tlabel > ebl::labeled_pair_datasource< Tnet, Tdata, Tlabel > ebl::hierarchy_datasource< Tnet, Tdata, Tlabel > ebl::mnist_datasource< Tnet, Tdata, Tlabel >

List of all members.

Public Types

typedef map< uint, idx< Tdata > > t_pick_map

Public Member Functions

 datasource ()
 CAUTION: This empty constructor requires a subsequent call to init().
 datasource (midx< Tdata > &data, const char *name=NULL)
 datasource (idx< Tdata > &data, const char *name=NULL)
 datasource (const char *data_fname, const char *name=NULL)
virtual ~datasource ()
 destructor
void init (midx< Tdata > &data, const char *name)
 Initialize from a multi-matrix data.
void init (idx< Tdata > &data, const char *name)
 Initialize.
template<class Tstate >
void fprop_data (mstate< Tstate > &s)
 Copies current sample's data into s.
virtual void fprop_data (fstate_idx< Tnet > &s)
 Copies current sample's data into s.
virtual void fprop_data (bbstate_idx< Tnet > &s)
 Copies current sample's data into s.
virtual void fprop (bbstate_idx< Tnet > &s)
 Copies current sample's data into s.
virtual idx< Tdata > get_sample (intg index)
 Return original sample's idx at this index.
virtual idx< Tnet > get_raw_output (intg index=-1)
virtual void select_sample (intg index)
virtual void shuffle ()
virtual bool next ()
virtual bool next_train ()
virtual void set_data_bias (Tnet bias)
 Set the bias to add to the data.
virtual void set_data_coeff (Tnet coeff)
 Set the coefficient to multiply the data with.
virtual unsigned int size ()
 Returns the number of data instances contained in this data source.
virtual idxdim sample_dims ()
virtual mfidxdim sample_mfdims ()
virtual void set_sample_energy (double e, bool correct, idx< Tnet > &raw_outputs, idx< Tnet > &answers, idx< Tnet > &target)
virtual void keep_outputs (bool keep=true)
virtual void normalize_all_probas ()
 Normalize picking probabilities globally with maximum probability.
virtual void normalize_probas (vector< intg > *cindinces=NULL)
 Normalize picking probabilities globally with maximum probability.
virtual void seek_begin ()
virtual void seek_begin_train ()
virtual void set_shuffle_passes (bool activate)
virtual void set_weigh_samples (bool activate, bool hardest_focus=false, bool perclass_norm=true, double min_proba=0.0)
virtual void set_test ()
virtual bool is_test ()
 Returns true if this datasource is a test datasource only.
virtual intg get_epoch_size ()
virtual intg get_epoch_count ()
 Return the number of samples this epoch has processed.
virtual void set_epoch_size (intg sz)
virtual void set_epoch_mode (uint mode)
virtual bool epoch_done ()
virtual void init_epoch ()
virtual void save_pickings (const char *name=NULL)
virtual bool get_count_pickings ()
 Return true if counting of pickings is enabled.
virtual void set_count_pickings (bool count=true)
 Enable or disable the counting of pickings.
virtual string & name ()
 Return name of dataset.
virtual void set_epoch_show (uint modulo)
 Print training count every 'modulo' samples.
virtual void ignore_correct (bool ignore=true)
 Do not train on correctly classified examples if ignore is true.
virtual bool mstate_samples ()
 Each sample contains multiple states or not.
virtual void save_state ()
virtual void restore_state ()
 Restore previously saved internal iterators.
virtual void pretty ()
 Print info about the datasource on the standard output.
virtual void pretty_progress (bool newline=true)

Public Attributes

Tnet bias
Tnet coeff
idx< Tdata > data
midx< Tdata > datas
idx< double > probas
 sample probabilities
idx< double > energies
 sample energies
idx< ubyte > correct
 sample correctness
idx< Tnet > raw_outputs
 The raw outputs for each sample.
idx< Tnet > answers
 all answers
idx< Tnet > targets
 all targets
idx< uint > pick_count
 count pickings.
bool count_pickings
 Count which samples are picked.
bool count_pickings_save
unsigned int height
unsigned int width
string _name

Protected Member Functions

void init2 (const char *name)
 Initialize. Should be called only after data matrix has been assigned.
virtual bool pick_current ()
virtual map< uint, intg > & get_pickings ()
 Return a vector of sample indices, sorted by their picking counts.

Protected Attributes

vector< intg > counts
map< uint, intg > picksmap
bool multimat
 True if data is a multi-matrix matrix.
bool bkeep_outputs
 Keep model outputs for each sample.
intg it
 Index of current sample in data matrix.
intg it_test
 Current test index in data matrix.
intg it_train
 Current train index in vector 'indices'.
idx< intg > indices
 Vector of indices to the data matrix.
bool state_saved
 State has been saved or not.
intg it_saved
 Saving current iterator it.
intg it_test_saved
 Saving current test iterator.
intg it_train_saved
 Saving current train iterator.
idx< intg > indices_saved
 Saving sample indices.
intg epoch_cnt_saved
intg epoch_pick_cnt_saved
 # pickings
vector< intg > epoch_done_counters_saved
bool shuffle_passes
 Shuffle at end of each pass.
bool test_set
 This set is a test set or not.
vector< intg > epoch_done_counters
intg epoch_sz
intg epoch_cnt
intg epoch_pick_cnt
 # pickings
uint epoch_show
intg epoch_show_printed
uint epoch_mode
 0: fixed number, 1: all at least once
timer epoch_timer
timer test_timer
uint not_picked
bool hardest_focus
 Focus training on hardest samples.
bool _ignore_correct
 Do not train on correct samples.
bool weigh_samples
 Use probas to pick samples.
bool perclass_norm
 Normalize probas per class.
double sample_min_proba
 Minimum proba of each sample.
idxdim sampledims
 Dimensions of a data sample.
mfidxdim samplemfdims
 Dimensions of a data sample.

Detailed Description

template<typename Tnet, typename Tdata>
class ebl::datasource< Tnet, Tdata >

A class handling a data source. This datasource does not contain labels; see 'labeled_datasource' or 'class_datasource' for regression and classification tasks.


Constructor & Destructor Documentation

template<typename Tnet , typename Tdata >
ebl::datasource< Tnet, Tdata >::datasource ( midx< Tdata > &  data,
const char *  name = NULL 
)

Construct a datasource from 'data', where data is a multi-matrix matrix (midx type). This allows for dynamic loading of data and avoids the need to fit all data in memory. The first dimension of each data sample is expected to be the samples dimension, and the second one the feature dimension, or the channel dimension in case of images.

Parameters:
name: An optional name for this dataset.
template<typename Tnet , typename Tdata >
ebl::datasource< Tnet, Tdata >::datasource ( idx< Tdata > &  data,
const char *  name = NULL 
)

Construct a datasource from 'data'. The first dimension of 'data' is expected to be the samples dimension, and the second one the feature dimension, or the channel dimension in case of images.

Parameters:
name: An optional name for this dataset.
template<typename Tnet , typename Tdata >
ebl::datasource< Tnet, Tdata >::datasource ( const char *  data_fname,
const char *  name = NULL 
)

Construct a datasource from the matrix found in 'data_fname'. The first dimension of the data is expected to be the samples dimension, and the second one the feature dimension, or the channel dimension in case of images.

Parameters:
name: An optional name for this dataset.

Member Function Documentation

template<typename Tnet , typename Tdata >
bool ebl::datasource< Tnet, Tdata >::epoch_done ( ) [virtual]

Return true if current epoch is finished. Call init_epoch() to restart a new epoch.

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
intg ebl::datasource< Tnet, Tdata >::get_epoch_size ( ) [virtual]

Returns the number of samples to train on for one epoch. By default, it returns the size of the smallest class times the number of classes (see get_lowest_common_size()). Default behavior can be overridden with set_epoch_size().

template<typename Tnet , typename Tdata >
idx< Tnet > ebl::datasource< Tnet, Tdata >::get_raw_output ( intg  index = -1) [virtual]

Returns an idx of the last raw output of current sample. This supposes raw outputs have been previously initialized via set_sample_energy().

Parameters:
index: If -1, return the current sample's raw output, otherwise that of the sample at position 'index'.
template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::init_epoch ( ) [virtual]

Restarts a new epoch, i.e. resets counters but does not reset iterator positions.

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::keep_outputs ( bool  keep = true) [virtual]

If 'keep' is true, then we keep for each sample the 'raw_outputs', the 'answers' and the 'target' of the model (see set_sample_energy()). This may be expensive in memory.

template<typename Tnet , typename Tdata >
bool ebl::datasource< Tnet, Tdata >::next ( ) [virtual]

Move to the next datum (in the original order of the dataset). Returns false if we reached the end. This should be used during testing. It will always return the data in the same order with the same probability of 1. See next_train() for data returned with variable probability, balance, etc. (used for training only).

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >, and ebl::labeled_pair_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
bool ebl::datasource< Tnet, Tdata >::next_train ( ) [virtual]

Move to the next datum, in a way suited for training (_not_ for testing; for testing see next()): depending on the configuration, this will return samples in a class-balanced way, i.e. showing each class sequentially, with different probabilities based on each sample's difficulty, and/or in a random order after each pass. When all samples of a class have been shown, it loops back to the first sample of that class. This should be used during training only. If a sample was not selected because of a low probability, this will return false; if it was selected, it returns true. In any case, internal iterators will always be set to the next sample, regardless of whether it was selected or not. It is up to the caller to train on the sample if selected, or only test and update its energy if not selected.

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >, and ebl::hierarchy_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
bool ebl::datasource< Tnet, Tdata >::pick_current ( ) [protected, virtual]

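Draw a random number between 0 and 1 and return true if it is lower than or equal to the current sample's probability (so that a sample with probability 0 is not picked and a sample with probability 1 is always picked, consistent with set_sample_energy()).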

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::pretty_progress ( bool  newline = true) [virtual]

Pretty-print the progress of the current epoch.

Parameters:
newline: If true, end the printed output with a new line.

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
idxdim ebl::datasource< Tnet, Tdata >::sample_dims ( ) [virtual]

Returns an idxdim object describing the order (number of dimensions) and the size of each dimension of a single sample output by fprop.

template<typename Tnet , typename Tdata >
mfidxdim ebl::datasource< Tnet, Tdata >::sample_mfdims ( ) [virtual]

Returns an mfidxdim object describing the order (number of dimensions) and the size of each dimension of a single sample output by fprop.

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::save_pickings ( const char *  name = NULL) [virtual]

Output statistics of samples picking, i.e. the number of times each sample has been picked for training.

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::save_state ( ) [virtual]

Save internal iterators. Calling restore_state() will return to the current sample.

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::seek_begin ( ) [virtual]

Move to the beginning of the data, for the test iterators only, i.e. only next() is affected, next_train() is unaffected.

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >, and ebl::labeled_pair_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::seek_begin_train ( ) [virtual]

Move to the beginning of the data, for the train iterators only, i.e. only next_train() is affected, next() is unaffected.

Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::select_sample ( intg  index) [virtual]

Set the internal iterators such that a call to fprop() will return the data associated with this index.

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::set_epoch_mode ( uint  mode) [virtual]

Set the epoch mode, i.e. how samples are presented for training. 0: show a fixed number of samples (set by set_epoch_size()). 1: show all samples at least once (samples may be shown multiple times if a class is unbalanced and the balance mode is activated).

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::set_epoch_size ( intg  sz) [virtual]

Set the number of samples to train on for one epoch. If not called, default number used is the one returned by get_lowest_common_size().

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::set_sample_energy ( double  e,
bool  correct,
idx< Tnet > &  raw_outputs,
idx< Tnet > &  answers,
idx< Tnet > &  target 
) [virtual]

Set the distance (or energy) between the answer of the model to train and the true answer. This is used to give more or less probability for a sample to be used for training. At the beginning of training, all samples start with a probability of 1, thus all samples are used, but as training goes on, easy samples are given lower probability while harder samples are given higher probability. The absolute distance is directly mapped into a probability, i.e. a distance of 0 will give probability 0 to be used, and a distance of 1 or higher gives probability 1 to be used. Therefore the distance should be normalized so that a distance of 1 represents an offending answer. This is used only by next_train(), not by next().

Parameters:
correct: True if the answer was correct, false otherwise. This is used to ignore samples for which we already get the correct answer.
raw_outputs: The raw outputs of the network for current sample.
answers: All the estimated answers for that sample (class, confidence, etc).
target: The target answer, i.e. the groundtruth equivalent of 'estimates'.
template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::set_shuffle_passes ( bool  activate) [virtual]

Activate or deactivate shuffling of list of samples for each class after reaching the end of the sample list. This has an effect only when set_balanced() is set. This is activated by default. This is used only by next_train(), not by next().

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::set_test ( ) [virtual]

Set this datasource to be a test datasource. This is optional but useful for reporting and to verify that no training-only methods are called on this datasource (e.g. next_train()).

template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::set_weigh_samples ( bool  activate,
bool  hardest_focus = false,
bool  perclass_norm = true,
double  min_proba = 0.0 
) [virtual]

Activate or deactivate weighing of samples based on classification results. Wrong answers give a higher probability for a sample to be used for training, correct answers a lower probability. This is activated by default. This is used only by next_train(), not by next().

Parameters:
hardest_focus: If true, focus on hardest samples, otherwise focus on easiest ones.
perclass_norm: Set the normalization of the sample probabilities to be per class or global. If perclass is true, the probabilities for each sample of a same class are normalized from [0..max_class] to [0..1] otherwise from [0..max_global]. Perclass can be used (or not) for discrete classification problems, but should be global for continuous labels. This normalization avoids looping over samples while rarely picking any, for example if all probabilities tend to zero. This is used only by next_train(), not by next().
min_proba: Set the minimum probability of a sample to be picked by next_train(). By default, this is zero. Acceptable range is [0 .. 1].
template<typename Tnet , typename Tdata >
void ebl::datasource< Tnet, Tdata >::shuffle ( ) [virtual]

Shuffle the dataset, based on the number of classes. Assumes the same number of samples in each class.


The documentation for this class was generated from the following files: