libeblearn
|
#include <datasource.h>
Public Types | |
typedef map< uint, idx< Tdata > > | t_pick_map |
Public Member Functions | |
datasource () | |
CAUTION: This empty constructor requires a subsequent call to init(). | |
datasource (midx< Tdata > &data, const char *name=NULL) | |
datasource (idx< Tdata > &data, const char *name=NULL) | |
datasource (const char *data_fname, const char *name=NULL) | |
virtual | ~datasource () |
destructor | |
void | init (midx< Tdata > &data, const char *name) |
Initialize from a multi-matrix data. | |
void | init (idx< Tdata > &data, const char *name) |
Initialize. | |
template<class Tstate > | |
void | fprop_data (mstate< Tstate > &s) |
Copies current sample's data into s. | |
virtual void | fprop_data (fstate_idx< Tnet > &s) |
Copies current sample's data into s. | |
virtual void | fprop_data (bbstate_idx< Tnet > &s) |
Copies current sample's data into s. | |
virtual void | fprop (bbstate_idx< Tnet > &s) |
Copies current sample's data into s. | |
virtual idx< Tdata > | get_sample (intg index) |
Return original sample's idx at this index. | |
virtual idx< Tnet > | get_raw_output (intg index=-1) |
virtual void | select_sample (intg index) |
virtual void | shuffle () |
virtual bool | next () |
virtual bool | next_train () |
virtual void | set_data_bias (Tnet bias) |
Set the bias to add to the data. | |
virtual void | set_data_coeff (Tnet coeff) |
Set the coefficient to multiply the data with. | |
virtual unsigned int | size () |
Returns the number of data instances contained in this data source. | |
virtual idxdim | sample_dims () |
virtual mfidxdim | sample_mfdims () |
virtual void | set_sample_energy (double e, bool correct, idx< Tnet > &raw_outputs, idx< Tnet > &answers, idx< Tnet > &target) |
virtual void | keep_outputs (bool keep=true) |
virtual void | normalize_all_probas () |
Normalize picking probabilities globally with maximum probability. | |
virtual void | normalize_probas (vector< intg > *cindinces=NULL) |
Normalize picking probabilities globally with maximum probability. | |
virtual void | seek_begin () |
virtual void | seek_begin_train () |
virtual void | set_shuffle_passes (bool activate) |
virtual void | set_weigh_samples (bool activate, bool hardest_focus=false, bool perclass_norm=true, double min_proba=0.0) |
virtual void | set_test () |
virtual bool | is_test () |
Returns true if this datasource is a test datasource only. | |
virtual intg | get_epoch_size () |
virtual intg | get_epoch_count () |
Return the number of samples this epoch has processed. | |
virtual void | set_epoch_size (intg sz) |
virtual void | set_epoch_mode (uint mode) |
virtual bool | epoch_done () |
virtual void | init_epoch () |
virtual void | save_pickings (const char *name=NULL) |
virtual bool | get_count_pickings () |
Return true if counting of pickings is enabled. | |
virtual void | set_count_pickings (bool count=true) |
Enable or disable the counting of pickings. | |
virtual string & | name () |
Return name of dataset. | |
virtual void | set_epoch_show (uint modulo) |
Print training count every 'modulo' samples. | |
virtual void | ignore_correct (bool ignore=true) |
Do not train on correctly classified examples if ignore is true. | |
virtual bool | mstate_samples () |
Each sample contains multiple states or not. | |
virtual void | save_state () |
virtual void | restore_state () |
Restore previously saved internal iterators. | |
virtual void | pretty () |
Print info about the datasource on the standard output. | |
virtual void | pretty_progress (bool newline=true) |
Public Attributes | |
Tnet | bias |
Tnet | coeff |
idx< Tdata > | data |
midx< Tdata > | datas |
idx< double > | probas |
sample probabilities | |
idx< double > | energies |
sample energies | |
idx< ubyte > | correct |
sample correctness | |
idx< Tnet > | raw_outputs |
The raw outputs for each sample. | |
idx< Tnet > | answers |
all answers | |
idx< Tnet > | targets |
all targets | |
idx< uint > | pick_count |
count pickings. | |
bool | count_pickings |
Count which samples are picked. | |
bool | count_pickings_save |
unsigned int | height |
unsigned int | width |
string | _name |
Protected Member Functions | |
void | init2 (const char *name) |
Initialize. Should be called only after data matrix has been assigned. | |
virtual bool | pick_current () |
virtual map< uint, intg > & | get_pickings () |
Return a vector of sample indices, sorted by their picking counts. | |
Protected Attributes | |
vector< intg > | counts |
map< uint, intg > | picksmap |
bool | multimat |
True if data is a multi-matrix matrix. | |
bool | bkeep_outputs |
Keep model outputs for each sample. | |
intg | it |
Index of current sample in data matrix. | |
intg | it_test |
Current test index in data matrix. | |
intg | it_train |
Current train index in vector 'indices'. | |
idx< intg > | indices |
Vector of indices to the data matrix. | |
bool | state_saved |
State has been saved or not. | |
intg | it_saved |
Saving current iterator it. | |
intg | it_test_saved |
Saving current test iterator. | |
intg | it_train_saved |
Saving current train iterator. | |
idx< intg > | indices_saved |
Saving sample indices. | |
intg | epoch_cnt_saved |
intg | epoch_pick_cnt_saved |
# pickings | |
vector< intg > | epoch_done_counters_saved |
bool | shuffle_passes |
Shuffle at end of each pass. | |
bool | test_set |
This set is a test set or not. | |
vector< intg > | epoch_done_counters |
intg | epoch_sz |
intg | epoch_cnt |
intg | epoch_pick_cnt |
# pickings | |
uint | epoch_show |
intg | epoch_show_printed |
uint | epoch_mode |
0: fixed number, 1: all at least once | |
timer | epoch_timer |
timer | test_timer |
uint | not_picked |
bool | hardest_focus |
Focus training on hardest samples. | |
bool | _ignore_correct |
Do not train on correct samples. | |
bool | weigh_samples |
Use probas to pick samples. | |
bool | perclass_norm |
Normalize probas per class. | |
double | sample_min_proba |
Minimum proba of each sample. | |
idxdim | sampledims |
Dimensions of a data sample. | |
mfidxdim | samplemfdims |
Dimensions of a data sample. |
A class handling a data source. This datasource does not contain labels; see 'labeled_datasource' or 'class_datasource' for regression and classification tasks.
ebl::datasource< Tnet, Tdata >::datasource | ( | midx< Tdata > & | data, |
const char * | name = NULL |
||
) |
Construct a datasource from 'data', where data is a multi-matrix matrix (midx type). This allows for dynamic loading of data and avoids the need to fit all data in memory. The first dimension of each data sample is expected to be the samples dimension, and the second one the feature dimension, or the channel dimension in case of images.
name | An optional name for this dataset. |
ebl::datasource< Tnet, Tdata >::datasource | ( | idx< Tdata > & | data, |
const char * | name = NULL |
||
) |
Construct a datasource from 'data'. The first dimension of 'data' is expected to be the samples dimension, and the second one the feature dimension, or the channel dimension in case of images.
name | An optional name for this dataset. |
ebl::datasource< Tnet, Tdata >::datasource | ( | const char * | data_fname, |
const char * | name = NULL |
||
) |
Construct a datasource from the matrix found in 'data_fname'. The first dimension of the data is expected to be the samples dimension, and the second one the feature dimension, or the channel dimension in case of images.
name | An optional name for this dataset. |
bool ebl::datasource< Tnet, Tdata >::epoch_done | ( | ) | [virtual] |
Return true if current epoch is finished. Call init_epoch() to restart a new epoch.
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.
intg ebl::datasource< Tnet, Tdata >::get_epoch_size | ( | ) | [virtual] |
Returns the number of samples to train on for one epoch. By default, it returns the size of the smallest class times the number of classes (see get_lowest_common_size()). Default behavior can be overridden with set_epoch_size().
idx< Tnet > ebl::datasource< Tnet, Tdata >::get_raw_output | ( | intg | index = -1 | ) | [virtual] |
Returns an idx of the last raw output of current sample. This supposes raw outputs have been previously initialized via set_sample_energy().
index | If -1, return current sample's, otherwise sample's at 'index' position. |
void ebl::datasource< Tnet, Tdata >::init_epoch | ( | ) | [virtual] |
Restarts a new epoch, i.e. resets counters but does not reset iterator positions.
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.
void ebl::datasource< Tnet, Tdata >::keep_outputs | ( | bool | keep = true | ) | [virtual] |
If 'keep' is true, then we keep for each sample the 'raw_outputs', the 'answers' and the 'target' of the model (see set_sample_energy()). This may be expensive in memory.
bool ebl::datasource< Tnet, Tdata >::next | ( | ) | [virtual] |
Move to the next datum (in the original order of the dataset). Returns false if we reached the end. This should be used during testing. It will always return the data in the same order with the same probability of 1. See next_train() for data returned with variable probability, balance, etc. (used for training only).
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >, and ebl::labeled_pair_datasource< Tnet, Tdata, Tlabel >.
bool ebl::datasource< Tnet, Tdata >::next_train | ( | ) | [virtual] |
Move to the next datum, in a way suited for training (_not_ for testing, for testing see next()): depending on the configuration, this will return samples in a class-balanced way, i.e. showing each class sequentially, with different probabilities based on sample's difficulty, and/or in a random order after each pass. When all samples of a class have been shown, it loops back to the first sample of that class. This should be used during training only. If a sample was not selected because of a low probability, this will return false; if it was selected, it returns true. In any case, internal iterators will always be set to the next sample, regardless of whether it was selected or not. It is up to the caller to train on the sample if selected, or only test and update its energy if not selected.
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >, and ebl::hierarchy_datasource< Tnet, Tdata, Tlabel >.
bool ebl::datasource< Tnet, Tdata >::pick_current | ( | ) | [protected, virtual] |
Draw a random number between 0 and 1 and return true if higher than current sample's probability.
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.
void ebl::datasource< Tnet, Tdata >::pretty_progress | ( | bool | newline = true | ) | [virtual] |
Pretty-print the progress of the current epoch.
newline | If true, end pretty with a new line. |
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.
idxdim ebl::datasource< Tnet, Tdata >::sample_dims | ( | ) | [virtual] |
Returns an idxdim object describing the order (number of dimensions) and the size of each dimension of a single sample output by fprop.
mfidxdim ebl::datasource< Tnet, Tdata >::sample_mfdims | ( | ) | [virtual] |
Returns an mfidxdim object describing the order (number of dimensions) and the size of each dimension of a single sample output by fprop.
void ebl::datasource< Tnet, Tdata >::save_pickings | ( | const char * | name = NULL | ) | [virtual] |
Output statistics of samples picking, i.e. the number of times each sample has been picked for training.
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.
void ebl::datasource< Tnet, Tdata >::save_state | ( | ) | [virtual] |
Save internal iterators. Calling restore_state() will return to the current sample.
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.
void ebl::datasource< Tnet, Tdata >::seek_begin | ( | ) | [virtual] |
Move to the beginning of the data, for the test iterators only, i.e. only next() is affected, next_train() is unaffected.
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >, and ebl::labeled_pair_datasource< Tnet, Tdata, Tlabel >.
void ebl::datasource< Tnet, Tdata >::seek_begin_train | ( | ) | [virtual] |
Move to the beginning of the data, for the train iterators only, i.e. only next_train() is affected, next() is unaffected.
Reimplemented in ebl::class_datasource< Tnet, Tdata, Tlabel >.
void ebl::datasource< Tnet, Tdata >::select_sample | ( | intg | index | ) | [virtual] |
Set the internal iterators such that a call to fprop() will return the data associated with this index.
void ebl::datasource< Tnet, Tdata >::set_epoch_mode | ( | uint | mode | ) | [virtual] |
Set the epoch mode, i.e. how samples are presented for training. 0: show a fixed number of samples (set by set_epoch_size()). 1: show all samples at least once (samples may be shown multiple times if a class is unbalanced and the balance mode is activated).
void ebl::datasource< Tnet, Tdata >::set_epoch_size | ( | intg | sz | ) | [virtual] |
Set the number of samples to train on for one epoch. If not called, default number used is the one returned by get_lowest_common_size().
void ebl::datasource< Tnet, Tdata >::set_sample_energy | ( | double | e, |
bool | correct, | ||
idx< Tnet > & | raw_outputs, | ||
idx< Tnet > & | answers, | ||
idx< Tnet > & | target | ||
) | [virtual] |
Set the distance (or energy) between the answer of the model to train and the true answer. This is used to give more or less probability for a sample to be used for training. At the beginning of training, all samples start with a probability of 1, thus all samples are used, but as training progresses, easy samples are given lower probability while harder samples are given higher probability. The absolute distance is directly mapped into a probability, i.e. a distance of 0 will give probability 0 to be used, and a distance of 1 or higher gives probability 1 to be used. Therefore distance should be normalized so that a distance of 1 represents an offending answer. This is used only by next_train(), not by next().
correct | True if the answer was correct, false otherwise. This is used to ignore samples for which we already get the correct answer. |
raw_outputs | The raw outputs of the network for current sample. |
answers | All the estimated answers for that sample (class, confidence, etc). |
target | The target answer, i.e. the ground-truth equivalent of 'answers'. |
void ebl::datasource< Tnet, Tdata >::set_shuffle_passes | ( | bool | activate | ) | [virtual] |
Activate or deactivate shuffling of list of samples for each class after reaching the end of the sample list. This has an effect only when set_balanced() is set. This is activated by default. This is used only by next_train(), not by next().
void ebl::datasource< Tnet, Tdata >::set_test | ( | ) | [virtual] |
Set this datasource to be a test datasource. This is optional but useful for reporting and to verify that no training-only methods (e.g. next_train()) are called on this datasource.
void ebl::datasource< Tnet, Tdata >::set_weigh_samples | ( | bool | activate, |
bool | hardest_focus = false , |
||
bool | perclass_norm = true , |
||
double | min_proba = 0.0 |
||
) | [virtual] |
Activate or deactivate weighing of samples based on classification results. Wrong answers give a higher probability for a sample to be used for training, correct answers a lower probability. This is activated by default. This is used only by next_train(), not by next().
hardest_focus | If true, focus on hardest samples, otherwise focus on easiest ones. |
perclass_norm | Set the normalization of the sample probabilities to be per class or global. If perclass is true, the probabilities for each sample of a same class are normalized from [0..max_class] to [0..1], otherwise from [0..max_global]. Perclass can be used (or not) for discrete classification problems, but should be global for continuous labels. This normalization avoids looping over samples while rarely picking any, for example when all probabilities tend to zero. This is used only by next_train(), not by next(). |
min_proba | Set the minimum probability of a sample to be picked by next_train(). By default, this is zero. Acceptable range is [0 .. 1]. |
void ebl::datasource< Tnet, Tdata >::shuffle | ( | ) | [virtual] |
Shuffle the dataset, based on the number of classes. Assumes the same number of samples in each class.