documentation/libeblearn_html/detector_8hpp_source.html

00001 /***************************************************************************
00002  *   Copyright (C) 2010 by Pierre Sermanet *
00003  *   pierre.sermanet@gmail.com *
00004  *   All rights reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions are met:
00008  *     * Redistributions of source code must retain the above copyright
00009  *       notice, this list of conditions and the following disclaimer.
00010  *     * Redistributions in binary form must reproduce the above copyright
00011  *       notice, this list of conditions and the following disclaimer in the
00012  *       documentation and/or other materials provided with the distribution.
00013  *     * Redistribution under a license not approved by the Open Source
00014  *       Initiative (http://www.opensource.org) must display the
00015  *       following acknowledgement in all advertising material:
00016  *        This product includes software developed at the Courant
00017  *        Institute of Mathematical Sciences (http://cims.nyu.edu).
00018  *     * The names of the authors may not be used to endorse or promote products
00019  *       derived from this software without specific prior written permission.
00020  *
00021  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
00022  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
00023  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00024  * DISCLAIMED. IN NO EVENT SHALL ThE AUTHORS BE LIABLE FOR ANY
00025  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00026  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00027  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00028  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00029  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00030  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00031  ***************************************************************************/
00032
00033 #ifndef DETECTOR_HPP
00034 #define DETECTOR_HPP
00035
00036 #include "numerics.h"
00037
00038 #ifndef __NOSTL__
00039 #include <algorithm>
00040 #include <typeinfo>
00041 #include <iomanip>
00042 #endif
00043
00044 using namespace std;
00045
00046 namespace ebl {
00047
00048   template <typename T, class Tstate>
        // Constructs a detector around network 'thenet_' for the classes named
        // in 'labels_'. 'answer_' (may be NULL) is the answer module; when it is
        // a scaler_answer or scalerclass_answer, scaler mode is enabled.
        // 'resize' (may be NULL) is the resizing module: if NULL, one is looked
        // up inside the network, and if none is found a default one is
        // allocated and owned (deleted) by this detector.
        // 'o' and 'e' are the regular and error output streams, also forwarded
        // to the network. 'adapt_scales_' is stored in adapt_scales.
        // NOTE: 'background' is not used in this constructor body; call
        // set_bgclass() to select a background class.
00049   detector<T,Tstate>::
00050   detector(module_1_1<T,Tstate> &thenet_, vector<string> &labels_,
00051            answer_module<T,T,T,Tstate> *answer_,
00052            resizepp_module<T,Tstate> *resize, const char *background,
00053            std::ostream &o, std::ostream &e,
00054            bool adapt_scales_)
00055     : thenet(thenet_), resizepp(resize), resizepp_delete(false),
00056       input(NULL), minput(NULL), netdim_fixed(false),
00057       bgclass(-1), mask_class(-1), pnms(NULL), scales_step(0), min_scale(1.0),
00058       max_scale(1.0), restype(ORIGINAL), silent(false), save_mode(false),
00059       save_dir(""), save_counts(labels_.size(), 0), min_size(0), max_size(0),
00060       bodetections(false), bppdetections(false), mem_optimization(false),
00061       optimization_swap(false), keep_inputs(true), hzpad(0), wzpad(0),
00062       mout(o), merr(e), smoothing_type(0), initialized(false),
00063       bboxes_off(false), adapt_scales(adapt_scales_), answer(answer_),
00064       ignore_outsiders(false), corners_inference(0), corners_infered(false),
00065       pre_threshold(0), bbox_decision(0) {
00066     // // make sure the top module is an answer module
00067     // module_1_1<T,Tstate> *last = thenet.last_module();
00068     // if (!dynamic_cast<answer_module<T,Tstate>*>(last))
00069     //   eblerror("expected last module to be of type answer_module but found: "
00070     //         << last->name());
          // scaler mode is inferred from the dynamic type of the answer module
00071     scaler_mode = false;
00072     if (answer && (dynamic_cast<scaler_answer<T,T,T,Tstate>*>(answer) ||
00073                    dynamic_cast<scalerclass_answer<T,T,T,Tstate>*>(answer)))
00074       scaler_mode = true;
00075     if (answer) mout << "Using answer module: " << answer->describe() << endl;
00076     // look for resize module in network
00077     if (!resizepp) {
00078       resizepp = arch_find(&thenet, resizepp);
00079       if (resizepp) mout << "Found a resizepp module in network: "
00080                          << resizepp->describe() << endl;
00081       else mout << "No resizepp module found in network." << endl;
00082     }
00083     // set default resizing module
00084     if (!resizepp) {
00085       resizepp = new resizepp_module<T,Tstate>;
            // report on the configured stream (mout), like every other message
00086       mout << "Using default resizing module: " << resizepp->describe() << endl;
00087       resizepp_delete = true; // we allocated it, the destructor frees it
00088     }
00089     labels = labels_;
00090     mout << "Classes labels: " << labels << endl;
00091 // #ifdef __ANDROID__ // TODO: temporary
00092 //     bgclass = 0;
00093 // #else
00094     //#endif
00095     // initializations
00096     save_max_per_frame = limits<uint>::max();
00097     diverse_ordering = false;
00098     // set output streams of network
00099     thenet.set_output_streams(o, e);
00100     update_merge_alignment();
00101   }
00102
00103   template <typename T, class Tstate>
        // Destructor: frees the resize module only when this detector allocated
        // it (resizepp_delete is set), and the nms object if one exists.
00104   detector<T,Tstate>::~detector() {
00105     if (resizepp_delete && resizepp) delete resizepp;
00106     if (pnms) delete pnms;
00107   }
00108
00109   template <typename T, class Tstate>
        // Selects the ORIGINAL scaling type and sets the number of scales to 1.
00110   void detector<T,Tstate>::set_scaling_original() {
00111     nscales = 1;
00112     restype = ORIGINAL;
00113   }
00114
00115   template <typename T, class Tstate>
        // Sets the scaling type to 'type' and logs both its numeric value and
        // its symbolic name on mout. An unrecognized value triggers eblerror
        // (after restype has already been assigned).
00116   void detector<T,Tstate>::set_scaling_type(t_scaling type) {
00117     restype = type;
00118     mout << "Setting scaling to type " << type << " (";
00119     switch (restype) {
00120     case MANUAL: mout << "MANUAL"; break ;
00121     case SCALES: mout << "SCALES"; break ;
00122     case NSCALES: mout << "NSCALES"; break ;
00123     case SCALES_STEP: mout << "SCALES_STEP"; break ;
00124     case ORIGINAL: mout << "ORIGINAL"; break ;
00125     case NETWORK: mout << "NETWORK"; break ;
00126     case SCALES_STEP_UP: mout << "SCALES_STEP_UP"; break ;
00127     default:
00128       eblerror("unknown type");
00129     }
00130     mout << ")" << endl;
00131   }
00132
00133   template <typename T, class Tstate>
        // Switches to MANUAL scaling and stores a copy of 'scales_' as the
        // explicit list of scales. Raises eblerror if the list is empty.
        // Each stored scale is modified with insert_dim(0, 1) to account for
        // the feature dimension; 'scales_' itself is left untouched.
00134   void detector<T,Tstate>::set_resolutions(const midxdim &scales_) {
00135     restype = MANUAL;
00136     manual_scales = scales_;
00137     if (manual_scales.size() == 0)
00138       eblerror("expected at least 1 scale but found 0");
00139     // add the feature dimension for each scale
00140     for (uint i = 0; i < manual_scales.size(); ++i) {
00141       idxdim &d = manual_scales[i];
            // presumably inserts a dimension of size 1 at index 0 - verify
            // argument order against idxdim::insert_dim
00142       d.insert_dim(0, 1);
00143     }
00144   }
00145
00146   template <typename T, class Tstate>
        // Switches to SCALES scaling and stores 'factors' as the list of scale
        // factors (applied to the input dimensions in compute_scales()).
00147   void detector<T,Tstate>::set_resolutions(const vector<double> &factors) {
00148     restype = SCALES;
00149     scale_factors = factors;
00150   }
00151
00152   template <typename T, class Tstate>
        // Switches to SCALES scaling with a single scale factor 'factor',
        // discarding any previously stored factors.
00153   void detector<T,Tstate>::set_resolution(double factor) {
00154     restype = SCALES;
00155     scale_factors.clear();
00156     scale_factors.push_back(factor);
00157   }
00158
00159   template <typename T, class Tstate>
        // Switches to NSCALES scaling with 'nscales_' scales.
        // NOTE(review): the value is cast to uint without a sign check, so a
        // negative argument would become a very large count.
00160   void detector<T,Tstate>::set_resolutions(int nscales_) {
00161     nscales = (uint) nscales_;
00162     restype = NSCALES;
00163   }
00164
00165   template <typename T, class Tstate>
        // Switches to SCALES_STEP scaling: stores the step factor between
        // consecutive scales and the max/min resolution factors, then logs
        // them on mout. The values are not validated here.
00166   void detector<T,Tstate>::set_resolutions(double scales_step_,
00167                                            double max_scale_,
00168                                            double min_scale_) {
00169     restype = SCALES_STEP;
00170     scales_step = scales_step_;
00171     max_scale = max_scale_;
00172     min_scale = min_scale_;
00173     mout << "Multi resolution scales: step factor " << scales_step
00174          << ", min/max resolution factor " << min_scale << ", " << max_scale
00175          << endl;
00176   }
00177
00178   template <typename T, class Tstate>
        // Sets the zero padding added around inputs, expressed as coefficients
        // of the network's minimum input size: hzpad = hzpad_ * netdim.dim(1),
        // wzpad = wzpad_ * netdim.dim(2). Does nothing when both coefficients
        // are 0. The resulting paddings are forwarded to the resize module.
        // NOTE(review): the range check runs after the paddings have been
        // stored and forwarded, and only rejects values above 1; a negative
        // coefficient is not rejected before the cast to uint.
00179   void detector<T,Tstate>::set_zpads(float hzpad_, float wzpad_) {
00180     if (hzpad_ != 0 || wzpad_ != 0) {
            // netdim is only recomputed when it was not fixed by set_netdim()
00181       if (!netdim_fixed) {
00182         fidxdim minodim(1, 1, 1); // min output dims
00183         netdim = thenet.bprop_size(minodim); // compute min input dims
00184       }
00185       hzpad = (uint) (hzpad_ * netdim.dim(1));
00186       wzpad = (uint) (wzpad_ * netdim.dim(2));
00187       resizepp->set_zpads(hzpad, wzpad);
00188       mout << "Adding zero padding on input (on each side): hpad: "
00189            << hzpad << " wpad: " << wzpad << endl;
00190       if (hzpad_ > 1 || wzpad_ > 1)
00191         eblerror("zero padding coeff should be in [0 1] range");
00192     }
00193   }
00194
00195   template <typename T, class Tstate>
        // Returns the index in 'labels' of the first label equal to 'name'
        // (exact, case-sensitive comparison), or -1 if there is none.
00196   int detector<T,Tstate>::get_class_id(const string &name) {
00197     for (uint i = 0; i < labels.size(); ++i)
00198       if (!strcmp(labels[i].c_str(), name.c_str()))
00199         return i;
00200     return -1;
00201   }
00202
00203   template <typename T, class Tstate>
        // Sets the background class id from its label name. When 'bg' is NULL
        // the default name "bg" is looked up. bgclass is set to -1 when the
        // name is not among the labels; a warning is printed in that case only
        // if a name was explicitly given.
00204   void detector<T,Tstate>::set_bgclass(const char *bg) {
00205     string name;
00206
00207     if (bg)
00208       name = bg;
00209     else
00210       name = "bg"; // default name
00211     bgclass = get_class_id(name);
00212     if (bgclass != -1) {
00213       mout << "Background class is \"" << name << "\" with id " << bgclass;
00214       mout << "." << endl;
00215     } else if (bg)
00216       merr << "warning: background class \"" << bg << "\" not found." << endl;
00217   }
00218
00219   // TODO: handle more than 1 class
00220   template <typename T, class Tstate>
        // Sets the mask class id from its label name. Returns false without
        // touching mask_class when 'mask' is NULL. Otherwise mask_class is set
        // to the label's id (or -1 if not found); returns true when the class
        // was found, false (with a warning on merr) otherwise.
00221   bool detector<T,Tstate>::set_mask_class(const char *mask) {
00222     string name;
00223
00224     if (!mask)
00225       return false;
00226     name = mask;
00227     mask_class = get_class_id(name);
00228     if (mask_class != -1) {
00229       mout << "Mask class is \"" << name << "\" with id " << mask_class;
00230       mout << "." << endl;
00231       return true;
00232     }
00233     merr << "warning: mask class \"" << mask << "\" not found." << endl;
00234     return false;
00235   }
00236
00237   template <typename T, class Tstate>
        // Turns on the 'silent' flag, which guards some of the messages
        // printed by compute_scales().
00238   void detector<T,Tstate>::set_silent() {
00239     silent = true;
00240   }
00241
00242   template <typename T, class Tstate>
        // Sets the maximum input size. The stored limit is 'max_size_' plus
        // twice the larger of the current hzpad and wzpad, so it depends on
        // the paddings in effect at the time of the call. The logged size is
        // the unpadded 'max_size_'.
00243   void detector<T,Tstate>::set_max_resolution(uint max_size_) {
00244     uint mzpad = std::max(hzpad * 2, wzpad * 2);
00245     max_size = max_size_ + mzpad;
00246     mout << "Setting maximum input size to " << max_size_ << "x"
00247          << max_size_ << " (add twice max(hzpad,wzpad): " << mzpad
00248          << ")" << endl;
00249   }
00250
00251   template <typename T, class Tstate>
        // Stores 'min_size_' as the minimum input size and logs it on mout.
00252   void detector<T,Tstate>::set_min_resolution(uint min_size_) {
00253     mout << "Setting minimum input size to " << min_size_ << "x"
00254          << min_size_ << "." << endl;
00255     min_size = min_size_;
00256   }
00257
00258   template <typename T, class Tstate>
        // Stores a copy of 't' as the thresholds used for raw bounding box
        // extraction and logs them on mout.
00259   void detector<T,Tstate>::set_raw_thresholds(vector<float> &t) {
00260     mout << "Using multiple thresholds for raw bbox extractions: " << t << endl;
00261     raw_thresholds = t;
00262   }
00263
00264   template <typename T, class Tstate>
        // Configures non-maximum suppression (nms). Stores 'pre_threshold_',
        // destroys any previously configured nms object and allocates a new
        // one according to 'type':
        //   nms_none:           no nms object (pnms stays NULL),
        //   nms_overlap:        'nms' built with the overlap/center parameters,
        //   nms_voting:         'voting_nms' built with the vote_* parameters,
        //   nms_voting_overlap: 'voting_nms' built with both parameter sets.
        // Any other type raises eblerror. The resulting configuration is
        // logged on mout.
00265   void detector<T,Tstate>::
00266   set_nms(t_nms type, float pre_threshold_, float post_threshold,
00267           float pre_hfact, float pre_wfact, float post_hfact, float post_wfact,
00268           float woverh, float max_overlap, float max_hcenter_dist,
00269           float max_wcenter_dist, float vote_max_overlap,
00270           float vote_max_hcenter_dist, float vote_max_wcenter_dist) {
00271     pre_threshold = pre_threshold_;
          // reset the pointer after deletion: with nms_none (or an unknown
          // type) no new object is assigned, and a stale pointer would be
          // dereferenced below and deleted a second time in the destructor.
00272     if (pnms) { delete pnms; pnms = NULL; }
00273     switch (type) {
00274     case nms_none: break ; // none
00275     case nms_overlap: // traditional overlap only
00276       pnms = new nms
00277         (post_threshold, max_overlap, max_hcenter_dist, max_wcenter_dist,
00278          pre_hfact, pre_wfact, post_hfact, post_wfact, woverh, mout, merr);
00279       break ;
00280     case nms_voting: // voting only
00281       pnms = new voting_nms
00282         (post_threshold, vote_max_overlap, vote_max_hcenter_dist,
00283          vote_max_wcenter_dist,
00284          pre_hfact, pre_wfact, post_hfact, post_wfact, woverh, mout, merr);
00285       break ;
00286     case nms_voting_overlap: // voting + traditional overlap
00287       pnms = new voting_nms
00288         (post_threshold, max_overlap, max_hcenter_dist, max_wcenter_dist,
00289          pre_hfact, pre_wfact, post_hfact, post_wfact, woverh,
00290          vote_max_overlap, vote_max_hcenter_dist, vote_max_wcenter_dist,
00291          mout, merr);
00292       break ;
00293     default: // unknown
00294       eblerror("unknown type of nms " << type);
00295     }
00296     mout << "Non-maximum suppression (nms): "
00297          << (pnms ? pnms->describe() : "none") << endl;
00298   }
00299
00300   template <typename T, class Tstate>
        // Explicitly enables or disables scaler mode (overriding the value
        // inferred from the answer module in the constructor) and logs it.
00301   void detector<T,Tstate>::set_scaler_mode(bool set) {
00302     scaler_mode = set;
00303     mout << "Scaler mode is "
00304          << (scaler_mode ? "enabled" : "disabled") << "." << endl;
00305   }
00306
00307   template <typename T, class Tstate>
        // Manually fixes the network's minimum input dimensions to 'd' (with
        // insert_dim(0, 1) applied to the stored copy). Sets netdim_fixed so
        // that set_zpads() and init() no longer recompute netdim.
        // 'd' itself is not modified; the logged value is the original 'd'.
00308   void detector<T,Tstate>::set_netdim(idxdim &d) {
00309     netdim = d;
00310     netdim.insert_dim(0, 1);
00311     netdim_fixed = true;
00312     mout << "Manually setting network's minimum input to " << d << endl;
00313   }
00314
00315   template <typename T, class Tstate>
        // Currently disabled: only emits a warning through eblwarn. None of
        // the parameters are used and no member is modified; the former
        // implementation is kept below as commented-out code.
00316   void detector<T,Tstate>::set_mem_optimization(Tstate &in, Tstate &out,
00317                                                 bool keep_inputs_) {
00318     eblwarn("mem optimization temporarly broken because out is now mstate");
00319     // mout << "Optimizing memory usage by using only 2 alternating buffers";
00320     // mem_optimization = true;
00321     // keep_inputs = keep_inputs_;
00322     // mout << " (and " << (keep_inputs ? "":"not ")
00323     //   << "keeping multi-scale inputs)";
00324     // minput = &in;
00325     // input = &in;
00326     // output = &out;
00327     // // remember if we need to swap buffers because of odd operations.
00328     // optimization_swap = !thenet.optimize_fprop(*input, *output);
00329     // mout << endl;
00330   }
00331
00332   template <typename T, class Tstate>
        // Stores 'name' in outputs_dump.
        // NOTE(review): presumably the file name used when dumping network
        // outputs - confirm where outputs_dump is read.
00333   void detector<T,Tstate>::set_outputs_dumping(const char *name) {
00334     outputs_dump = name;
00335   }
00336
00337   template <typename T, class Tstate>
        // Turns on the bboxes_off flag.
00338   void detector<T,Tstate>::set_bboxes_off() {
00339     bboxes_off = true;
00340   }
00341
00342   template <typename T, class Tstate>
        // Returns a mutable reference to the detector's own list of class
        // labels (the copy made in the constructor).
00343   vector<string>& detector<T,Tstate>::get_labels() {
00344     return labels;
00345   }
00346
00347   template <typename T, class Tstate>
        // Turns on the ignore_outsiders flag.
00348   void detector<T,Tstate>::set_ignore_outsiders() {
00349     ignore_outsiders = true;
00350   }
00351
00352   template <typename T, class Tstate>
        // Stores 'type' as the corners inference type and logs it on mout.
        // The value is not validated here.
00353   void detector<T,Tstate>::set_corners_inference(uint type) {
00354     mout << "Setting corners inference type to " << type << endl;
00355     corners_inference = type;
00356   }
00357
00358   template <typename T, class Tstate>
        // Stores 'type' as the bbox decision type and logs it on mout.
        // The value is not validated here.
00359   void detector<T,Tstate>::set_bbox_decision(uint type) {
00360     bbox_decision = type;
00361     mout << "Setting bbox decision type to " << type << endl;
00362   }
00363
00364   template <typename T, class Tstate>
        // Stores a copy of 'scalings' in bbox_scalings and logs it on mout.
00365   void detector<T,Tstate>::set_bbox_scalings(mfidxdim &scalings) {
00366     bbox_scalings = scalings;
00367     mout << "Setting bbox scalings to " << bbox_scalings << endl;
00368   }
00369
00371   // initialization
00372
00373   template <typename T, class Tstate>
        // Prepares the detector for inputs of dimensions 'dsample':
        //  - records 'dsample' as the input dimensions (indim),
        //  - computes the network's minimum input dimensions unless they were
        //    fixed by set_netdim(),
        //  - derives minimum (netdim * min_scale) and maximum
        //    (dsample * max_scale, at least netdim on dimensions >= 1)
        //    dimensions, keeping dimension 0 equal to dsample's,
        //  - computes the list of scales via compute_scales(),
        //  - reallocates the per-scale input/output buffers when the number
        //    of scales differs from the number of allocated input buffers.
        // 'frame_name' is only forwarded to compute_scales() for its error
        // message.
00374   void detector<T,Tstate>::init(idxdim &dsample, const char *frame_name) {
00375     initialized = true;
00376     indim = dsample;
00377     // the network's minimum input dimensions
00378     if (!netdim_fixed)
00379       netdim = network_mindims(thenet, dsample.order());
00380     // mout << "Network's minimum input dimensions are: " << netdim
00381     //   << thenet.pretty(netdim) << endl;
00382     // minimum input dimensions: factor of network's minimum input
00383     idxdim mindim = netdim * min_scale;
00384     // if (mindim.dim(1) + hzpad * 2 < netdim.dim(1))
00385     //   mindim.setdim(1, netdim.dim(1) - hzpad * 2);
00386     // if (mindim.dim(2) + wzpad * 2 < netdim.dim(2))
00387     //   mindim.setdim(2, netdim.dim(2) - wzpad * 2);
00388     mindim.setdim(0, dsample.dim(0)); // feature dimension is not scaled
00389     // maximum input dimensions: factor of original input
00390     idxdim maxdim = dsample * max_scale;
          // never go below the network's minimum size on non-feature dimensions
00391     for (uint i = 1; i < maxdim.order(); ++i)
00392       if (maxdim.dim(i) < netdim.dim(i))
00393         maxdim.setdim(i, netdim.dim(i));
00394     maxdim.setdim(0, dsample.dim(0)); // feature dimension is not scaled
00395     // determine scales
00396     compute_scales(scales, netdim, mindim, maxdim, dsample, restype, nscales,
00397                    scales_step, frame_name);
00398     // reallocate buffers if number of scales has changed
          // NOTE(review): actual_scales is only refreshed inside this branch,
          // i.e. not when the scales change but their count stays the same -
          // confirm this is intended.
00399     if (scales.size() != ppinputs.size()) {
00400       EDEBUG("reallocating input and output buffers");
00401       DEBUGMEM_PRETTY("detector init scales");
00402       ppinputs.clear();
00403       outputs.clear();
00404       actual_scales.clear();
00405       // allocate buffers
00406       idxdim order(mindim);
00407       order.setdims(1); // minimum dims
            // one input and one output multi-state per scale, each holding a
            // single placeholder state of minimal size
00408       for (uint i = 0; i < scales.size(); ++i) {
00409         mstate<Tstate> *ppin = new mstate<Tstate>();
00410         ppin->push_back(new Tstate(order));
00411         ppinputs.push_back(ppin);
00412         mstate<Tstate> *ppout = new mstate<Tstate>();
00413         ppout->push_back(new Tstate(order));
00414         outputs.push_back(ppout);
00415       }
00416       DEBUGMEM_PRETTY("detector end of init scales");
00417       // copy ideal scales to actual scales vector (to be modified later)
00418       actual_scales.copy(scales);
00419     }
00420   }
00421
00423   // scaling methods
00424
00425   template <typename T, class Tstate>
        // Fills 'scales' with the input resolutions to process, according to
        // the scaling 'type':
        //   ORIGINAL:       the input dimensions only,
        //   MANUAL:         the scales given to set_resolutions(midxdim),
        //   SCALES:         input dimensions times each stored scale factor,
        //   NSCALES:        'nscales' scales between netdim and maxdim,
        //   SCALES_STEP:    from maxdim down to mindim by 'scales_step',
        //   SCALES_STEP_UP: from mindim up to maxdim by 'scales_step',
        //   NETWORK:        the network's minimum dimensions only.
        // Scales whose dimension 1 or 2 exceeds max_size (when max_size > 0)
        // are then removed, one full-image box per remaining scale is appended
        // to original_bboxes, and the result is logged. Throws (eblthrow)
        // when no scale remains; 'frame_name' is used in that message only.
00426   void detector<T,Tstate>::
00427   compute_scales(midxdim &scales, idxdim &netdim, idxdim &mindim,
00428                  idxdim &maxdim, idxdim &indim, t_scaling type, uint nscales,
00429                  double scales_step, const char *frame_name) {
00430     // fill scales based on scaling type
00431     scales.clear();
00432     if (!silent)
00433       mout << "Scales: input: " << indim << " min: " << netdim
00434            << " max: " << maxdim << endl
00435            << "Scaling type " << type << ": ";
00436     switch (type) {
00437     case ORIGINAL:
00438       if (!silent) mout << "1 scale only, the image's original scale." << endl;
00439       scales.push_back(indim);
00440       break ;
00441     case MANUAL:
00442       scales = manual_scales;
00443       if (!silent)
00444         mout << "Manual specification of each scale size to: " << scales <<endl;
00445       break ;
00446     case SCALES:
00447       if (!silent)
00448         mout << "Manual specification of each scale factor applied to "
00449              << "original dimensions." << endl;
00450       compute_resolutions(scales, indim, scale_factors);
00451       break ;
00452     case NSCALES: // n scale between min and max resolutions
00453       if (!silent)
00454         mout << nscales << " scales between min (" << netdim
00455              << ") and max (" << maxdim << ") scales." << endl;
00456       compute_resolutions(scales, netdim, maxdim, nscales);
00457       break ;
00458     case SCALES_STEP: // step fixed amount from scale from max down to min
00459       if (!silent)
00460         mout << "Scale step of " << scales_step << " from max (" << maxdim
00461              << ") down to min (" << mindim << ") scale." << endl;
00462       compute_resolutions(scales, mindim, maxdim, scales_step);
00463       break ;
00464     case SCALES_STEP_UP: // step fixed amount from scale min up to max
00465       if (!silent)
00466         mout << "Scale step of " << scales_step << " from min (" << mindim
00467              << ") up to max (" << maxdim << ") scale." << endl;
00468       compute_resolutions_up(scales, indim, mindim, maxdim, scales_step);
00469       break ;
00470     case NETWORK:
00471       if (!silent)
00472         mout << "Resize all inputs to network's minimal size" << endl;
00473       scales.push_back(netdim);
00474       break ;
00475     default: eblerror("unknown scaling mode");
00476     }
00477     // limit scales with max_size
          // NOTE(review): the iterator is reused after erase() without being
          // reassigned; whether that is valid depends on midxdim::erase -
          // confirm against the container's implementation.
00478     for (midxdim::iterator i = scales.begin(); i != scales.end(); ) {
00479       idxdim d = *i;
00480       if (max_size > 0 && (d.dim(1) > max_size || d.dim(2) > max_size)) {
00481         scales.erase(i);
00482         mout << "removing scale " << d << " because of max size " << max_size
00483              << endl;
00484       } else i++;
00485     }
00486     // initialize original bboxes to entire image
          // NOTE(review): original_bboxes is appended to but not cleared in
          // this function - verify it is reset elsewhere between calls.
00487     rect<int> bb(0, 0, indim.dim(1), indim.dim(2));
00488     for (uint i = 0; i < scales.size(); ++i)
00489       original_bboxes.push_back(bb);
00490     // print scales
00491     mout << "Detection initialized to ";
00492     if (adapt_scales) mout << "(network-adapted scales) ";
00493     if (scales.size() == 0) mout << "0 resolutions." << endl;
00494     else mout << scales.size() << " input resolutions: " << scales;
00495     mout << endl;
          // NOTE(review): frame_name is streamed as-is; a NULL pointer here
          // would be a problem - confirm callers always pass a valid string.
00496     if (scales.size() == 0)
00497       eblthrow("0 resolutions to compute in " << frame_name);
00498   }
00499
00500   template <typename T, class Tstate>
        // Fills 'scales' with 'nscales' resolutions between 'mindim' and
        // 'maxdim'. Raises eblerror when nscales is 0. When nscales exceeds
        // the smaller pixel distance between min and max (over dimensions 1
        // and 2), a warning is printed and nscales is reduced to 1 (min equals
        // max) or 2. Results:
        //   1 scale (or min == max): maxdim only,
        //   2 scales:                mindim then maxdim,
        //   more:                    maxdim divided by successive powers of a
        //                            geometric step, followed by maxdim itself.
        // Dimension 0 of each generated scale is kept equal to maxdim's.
00501   void detector<T,Tstate>::
00502   compute_resolutions(midxdim &scales,
00503                       idxdim &mindim, idxdim &maxdim, uint nscales) {
00504     scales.clear();
00505     if (nscales == 0)
00506       eblerror("expected at least 1 scale but found " << nscales);
00507     // nscales must be less than the minimum pixel distance between min and max
          // NOTE(review): if dim() returns a signed type and maxdim is smaller
          // than mindim, the difference would wrap when stored in a uint -
          // confirm callers guarantee maxdim >= mindim.
00508     uint max_res = std::min(maxdim.dim(1) - mindim.dim(1),
00509                             maxdim.dim(2) - mindim.dim(2));
00510     if (nscales > max_res) {
00511       merr << "warning: the number of resolutions requested (";
00512       merr << nscales << ") is more than";
00513       merr << " the minimum distance between minimum and maximum possible";
00514       merr << " resolutions. (min: " << mindim << " max: " << maxdim;
00515       if (mindim == maxdim)
00516         nscales = 1;
00517       else
00518         nscales = 2;
00519       merr << ") setting it to " << nscales << endl;
00520     }
00521     // only 1 scale if min == max or if only 1 scale requested.
00522     if ((mindim == maxdim) || (nscales == 1))
00523       scales.push_back(maxdim);
00524     else if (nscales == 2) { // 2 resolutions: min and max
00525       scales.push_back(mindim);
00526       scales.push_back(maxdim);
00527     } else { // multiple scales: interpolate between min and max
00528       // compute the step factor: x = e^(log(max/min)/(nres-1))
            // the smaller of the two max/min ratios is used
00529       double fact = MIN(maxdim.dim(1) / (double) mindim.dim(1),
00530                         maxdim.dim(2) / (double) mindim.dim(2));
00531       double step = exp(log(fact)/(nscales - 1));
00532       double f;
00533       uint i;
            // f takes the values step^1 .. step^(nscales-1); the last one
            // equals 'fact', i.e. the smallest generated scale is maxdim / fact
00534       for (f = step, i = 1; i < nscales; ++i, f *= step) {
00535         idxdim d = maxdim * (1 / f);
00536         d.setdim(0, maxdim.dim(0)); // do not scale feature dimension
00537         scales.push_back(d);
00538       }
00539       scales.push_back(maxdim);
00540     }
00541   }
00542
00543   template <typename T, class Tstate>
        // Fills 'scales' with one resolution per entry of 'scale_factors',
        // each being 'indims' multiplied by the factor, with dimension 0 kept
        // equal to indims'. Raises eblerror when 'scale_factors' is empty.
        // The order of 'scales' follows the order of the factors.
00544   void detector<T,Tstate>::
00545   compute_resolutions(midxdim &scales,
00546                       idxdim &indims, vector<double> &scale_factors) {
00547     scales.clear();
00548     if (scale_factors.size() == 0)
00549       eblerror("expected at least 1 scale factor but found "
00550                << scale_factors.size());
00551     // compute scales
00552     for (uint i = 0; i < scale_factors.size(); ++i) {
00553       idxdim d = indims * scale_factors[i];
00554       d.setdim(0, indims.dim(0)); // do not scale feature dimension
00555       scales.push_back(d);
00556     }
00557   }
00558
00559   template <typename T, class Tstate>
        // Fills 'scales' starting with 'maxdim' and repeatedly dividing by
        // 'scales_step' for as long as the result is still >= 'mindim'.
        // Dimension 0 of every scale is kept equal to maxdim's.
        // 'scales_step' must be greater than 1 for the sizes to decrease: for
        // any other value a warning is printed and only 'maxdim' is returned
        // (previously such a value could loop forever, or divide by zero for
        // a step of 0).
00560   void detector<T,Tstate>::
00561   compute_resolutions(midxdim &scales, idxdim &mindim, idxdim &maxdim,
00562                       double scales_step) {
00563     scales.clear();
          // a step <= 1 never shrinks the scale: fall back to the max scale only
          if (scales_step <= 1) {
            merr << "warning: scales step " << scales_step
                 << " should be greater than 1 to step down from max to min"
                 << " scale, using max scale only." << endl;
            scales.push_back(maxdim);
            return ;
          }
00564     double factor = 1 / scales_step;
00565     // take steps from max scale until reaching min scale
00566     idxdim d = maxdim;
00567     scales.push_back(d);
00568     d = d * factor;
00569     d.setdim(0, maxdim.dim(0)); // do not scale feature dimension
00570     while (d >= mindim) {
00571       scales.push_back(d);
00572       d = d * factor;
00573       d.setdim(0, maxdim.dim(0)); // do not scale feature dimension
00574     }
00575   }
00576
00577   template <typename T, class Tstate>
        // Fills 'scales' by scaling 'indim' up from the smallest size covering
        // 'mindim' (the larger of the mindim/indim ratios over dimensions 1
        // and 2) and multiplying the factor by 'scales_step' for as long as
        // the result is still <= 'maxdim'. Each scale is raised to at least
        // 'mindim' via set_max() and inserted at the front of 'scales', so the
        // most recently computed scale comes first. Dimension 0 is kept equal
        // to maxdim's.
        // 'scales_step' must be greater than 1 for the sizes to increase: for
        // any other value a warning is printed and stepping stops after the
        // first scale (previously such a value could loop forever).
00578   void detector<T,Tstate>::
00579   compute_resolutions_up(midxdim &scales, idxdim &indim, idxdim &mindim,
00580                          idxdim &maxdim, double scales_step) {
00581     scales.clear();
00582     double factor = std::max(mindim.dim(1) / (double) indim.dim(1),
00583                              mindim.dim(2) / (double) indim.dim(2));
00584     idxdim d = indim * factor;
00585     d.setdim(0, maxdim.dim(0)); // do not scale feature dimension
00586     while (d <= maxdim) {
00587       d.set_max(mindim); // make sure each dimension is bigger than mindim
00588       scales.push_front_new(d);
            // a step <= 1 never grows the scale: stop instead of looping forever
            if (scales_step <= 1) {
              merr << "warning: scales step " << scales_step
                   << " should be greater than 1 to step up from min to max"
                   << " scale, using 1 scale only." << endl;
              break ;
            }
00589       factor *= scales_step;
00590       d = indim * factor;
00591       d.setdim(0, maxdim.dim(0)); // do not scale feature dimension
00592     }
00593   }
00594
00596   // outputs smoothing
00597
00598   template <typename T, class Tstate>
        // Selects the outputs smoothing type:
        //   0: smoothing disabled,
        //   1: 3x3 kernel with weights .3 on corners, .5 on edges and 1 at the
        //      center, rescaled by the inverse of its sum.
        // Any other value raises eblerror (after smoothing_type was assigned).
00599   void detector<T,Tstate>::set_smoothing(uint type) {
00600     smoothing_type = type;
00601     idx<T> ker;
00602     switch (smoothing_type) {
00603     case 0: mout << "Outputs smoothing disabled." << endl; break ;
00604     case 1:
00605       ker = idx<T>(3, 3);
00606       ker.set(.3, 0, 0);
00607       ker.set(.5, 0, 1);
00608       ker.set(.3, 0, 2);
00609       ker.set(.5, 1, 0);
00610       ker.set(1 , 1, 1);
00611       ker.set(.5, 1, 2);
00612       ker.set(.3, 2, 0);
00613       ker.set(.5, 2, 1);
00614       ker.set(.3, 2, 2);
            // normalize: presumably idx_dotc multiplies every element by the
            // given constant, making the kernel sum to 1 - verify
00615       idx_dotc(ker, (T) (1 / (double) idx_sum(ker)), ker);
00616       smoothing_kernel = ker;
00617       mout << "Smoothing outputs with kernel: " << endl;
00618       smoothing_kernel.printElems();
00619       break ;
00620     default:
00621       eblerror("Unknown smoothing type " << type);
00622     }
00623   }
00624
00625   template <typename T, class Tstate>
        // Currently disabled: raises eblerror whenever a smoothing type other
        // than 0 is selected, and does nothing otherwise. The former
        // implementation is kept below as commented-out code.
00626   void detector<T,Tstate>::smooth_outputs() {
00627     if (smoothing_type != 0) {
00628       eblerror("smoothing temporarly broken");
00629       // FIXME! (outputs is no longer a single output)
00630       // uint hpad = (uint) (smoothing_kernel.dim(0) / 2);
00631       // uint wpad = (uint) (smoothing_kernel.dim(1) / 2);
00632       // for (uint i = 0; i < outputs.size(); ++i) {
00633       //        idx<T> &outx = outputs[i]->x;
00634       //        intg h = outx.dim(1), w = outx.dim(2);
00635       //        idx<T> in(h + 2 * hpad, w + 2 * wpad);
00636       //        idx<T> inc = in.narrow(0, h, hpad);
00637       //        inc = inc.narrow(1, w, wpad);
00638       //        idx_clear(in);
00639       //        idx_bloop1(out, outx, T) {
00640       //          idx_copy(out, inc);
00641       //          idx_2dconvol(in, smoothing_kernel, out);
00642       //        }
00643       // }
00644     }
00645   }
00646
00647   // template <typename T, class Tstate>
00648   // void detector<T,Tstate>::extract_bboxes(T threshold, bboxes &bbs) {
00649   //   bbox::init_instance_id(); // reset unique ids to start from zero.
00650   //   // make a list that contains the results
00651   //   double original_h = indim.dim(1);
00652   //   double original_w = indim.dim(2);
00653   //   intg offset_h = 0, offset_w = 0;
00654   //   int scale_index = 0;
00655   //   for (uint i = 0; i < ppinputs.size(); ++i) {
00656   //     bboxes bbtmp;
00657   //     // select elements
00658   //     Tstate &input = (*(ppinputs[i]))[0];
00659   //     Tstate &output = *(outputs[i]);
00660   //     rect<int> &robbox = original_bboxes[i];
00661   //     // sizes
00662   //     double in_h = (double) input.x.dim(1);
00663   //     double in_w = (double) input.x.dim(2);
00664   //     double out_h = (double) output.x.dim(1);
00665   //     double out_w = (double) output.x.dim(2);
00666   //     double neth = netdim.dim(1); // network's input height
00667   //     double netw = netdim.dim(2); // network's input width
00668   //     double scalehi = original_h / robbox.height; // input to original
00669   //     double scalewi = original_w / robbox.width; // input to original
00670   //     int image_h0 = (int) (robbox.h0 * scalehi);
00671   //     int image_w0 = (int) (robbox.w0 * scalewi);
00672   //     // offset factor in input map
00673   //     double offset_h_factor = (in_h - neth) / std::max((double)1, (out_h - 1));
00674   //     double offset_w_factor = (in_w - netw) / std::max((double)1, (out_w - 1));
00675   //     offset_w = 0;
00676   //     Tstate out(output.x.get_idxdim());
00677   //     answer.fprop(output, out);
00678   //     // loop on width
00679   //     idx_eloop1(ro, out.x, T) {
00680   //    offset_h = 0;
00681   //    // loop on height
00682   //    idx_eloop1(roo, ro, T) {
00683   //      int classid = (int) roo.get(0);
00684   //      float conf = (float) roo.get(1);
00685   //      // if ((offset_h == out_h - 1 || (int)(offset_h) % 3 == 0)
00686   //      //     && (offset_w == out_w - 1 || (int)(offset_w) % 3 == 0)) {
00687   //      // if (true) {
00688   //      if (conf >= threshold && classid != bgclass) {
00689   //        bbox bb;
00690   //        bb.class_id = classid; // Class
00691   //        bb.confidence = conf; // Confidence
00692   //        bb.scale_index = scale_index; // scale index
00693   //        // predicted offsets / scale
00694   //        float hoff = 0, woff = 0, scale = 1.0;
00695   //        if (scaler_mode) {
00696   //          scale = (float) roo.gget(2);
00697   //          if (roo.dim(0) == 5) { // class,conf,scale,h,w
00698   //            hoff = roo.gget(3) * neth;
00699   //            woff = roo.gget(4) * neth;
00700   //          }
00701   //          // cap scale
00702   //          scale = std::max(min_scale_pred, std::min(max_scale_pred, scale));
00703   //          scale = 1 / scale;
00704   //        }
00705   //        EDEBUG(roo.str());
00706   //        // original box in input map
00707   //        bb.iheight = (int) in_h; // input h
00708   //        bb.iwidth = (int) in_w; // input w
00709   //        bb.i0.h0 = (float) (offset_h * offset_h_factor);
00710   //        bb.i0.w0 = (float) (offset_w * offset_w_factor);
00711   //        bb.i0.height = (float) neth;
00712   //        bb.i0.width = (float) netw;
00713   //        // output map
00714   //        bb.oheight = (int) out_h; // output height
00715   //        bb.owidth = (int) out_w; // output width
00716   //        bb.o.h0 = offset_h; // answer height in output
00717   //        bb.o.w0 = offset_w; // answer height in output
00718   //        // bb.o.h0 = 0;
00719   //        // bb.o.w0 = 0;
00720   //        // bb.o.h0 = out_h - 1;
00721   //        // bb.o.w0 = out_w - 1;
00722   //        bb.o.height = 1;
00723   //        bb.o.width = 1;
00724
00725   //        // transformed box in input map
00726   //        bb.i.h0 = bb.i0.h0 + hoff;
00727   //        bb.i.w0 = bb.i0.w0 + woff;
00728   //        bb.i.height = bb.i0.height;
00729   //        bb.i.width = bb.i0.width;
00730   //        if (scale != 1.0)
00731   //          bb.i.scale_centered(scale, scale);
00732
00733   //        // infer original location through network
00734   //        idxdim d(1, bb.o.height, bb.o.width);
00735   //        d.setoffset(1, bb.o.h0);
00736   //        d.setoffset(2, bb.o.w0);
00737   //        mfidxdim md(d);
00738   //        mfidxdim d2 = thenet.bprop_size(md);
00739   //        fidxdim loc = d2[0];
00740   //        bb.i.h0 = loc.offset(1);
00741   //        bb.i.w0 = loc.offset(2);
00742   //        bb.i.height = loc.dim(1);
00743   //        bb.i.width = loc.dim(2);
00744
00745   //        // add all input boxes
00746   //        for (uint q = 0; q < d2.size(); ++q)
00747   //          bb.mi.push_back(rect<float>(d2[q].offset(1), d2[q].offset(2),
00748   //                                      d2[q].dim(1), d2[q].dim(2)));
00749
00750   //        // bb.h0 = loc.offset(1) * scalehi;
00751   //        // bb.w0 = loc.offset(2) * scalewi;
00752   //        // bb.height = loc.dim(1) * scalehi;
00753   //        // bb.width = loc.dim(2) * scalewi;
00754
00755
00756   //        // original image
00757   //        // bbox's rectangle in original image
00758   //        // bb.h0 = bb.i.h0 * scalehi;
00759   //        // bb.w0 = bb.i.w0 * scalewi;
00760   //        bb.h0 = bb.i.h0 * scalehi - image_h0;
00761   //        bb.w0 = bb.i.w0 * scalewi - image_w0;
00762   //        bb.height = bb.i.height * scalehi;
00763   //        bb.width = bb.i.width * scalewi;
00764   //        // push bbox to list
00765   //        bbtmp.push_back(new bbox(bb));
00766   //      }
00767   //      offset_h++;
00768   //    }
00769   //    offset_w++;
00770   //     }
00771   //     // add scale boxes into all boxes
00772   //     for (uint k = 0; k < bbtmp.size(); ++k)
00773   //    bbs.push_back(bbtmp[k]);
00774   //     scale_index++;
00775   //   }
00776   // }
00777
00778   template <typename T, class Tstate>
00779   void detector<T,Tstate>::update_merge_alignment() {
          // Looks up every flat_merge_module in 'thenet' and, for each one,
          // computes one offset vector and one stride per merger input from the
          // network's size back-propagation (bprop_size), then passes them to the
          // merger through set_offsets() and set_strides().
          // Returns early, after logging, when no merging module is found.
00780     // check presence of merging module
          // 'merger' is still NULL when handed to arch_find_all; presumably it only
          // selects the module type to search for -- TODO confirm.
00781     flat_merge_module<T,Tstate> *merger = NULL;
00782     vector<flat_merge_module<T,Tstate>*> mergers =
00783       arch_find_all(&thenet, merger);
00784     if (mergers.size() > 0) {
00785       mout << "Found merging module(s) in network: " << mergers << endl;
00786       for (uint i = 0; i < mergers.size(); ++i)
00787         mout << mergers[i]->describe()<< endl;
00788     } else {
00789       mout << "No merging module found in network." << endl;
00790       return ;
00791     }
00792     // align for each merger module
00793     for (uint i = 0; i < mergers.size(); ++i) {
00794       merger = mergers[i];
00795       // get the network narrowed up to the merger module (included)
            // The second call passes 'false'; presumably it yields the network up
            // to, but excluding, the merger -- TODO confirm.
            // NOTE(review): ownership of the modules returned by arch_narrow is not
            // visible from here; confirm whether they have to be released.
00796       module_1_1<T,Tstate> *merger_net_included = arch_narrow(&thenet, merger);
00797       module_1_1<T,Tstate> *merger_net = arch_narrow(&thenet, merger, false);
00798       if (!merger_net || !merger_net_included)
00799         eblerror("failed to narrow network up to " << merger);
00800       EDEBUG("network narrowed up to merger module: " << merger->name());
00801       mout << "Aligning merging centers on top left image origin." << endl;
00802       //    for (uint i = 0; i < merger->get_ninputs(); ++i) {
            // c and f are 1x1x1 regions; 'paddings' is only referenced by
            // commented-out code further down.
00803       fidxdim c(1, 1, 1), f(1, 1, 1), c0, c1;
00804       mfidxdim m(c), m0, m0m, m1, paddings; //(merger->get_ninputs());
00805       // determine input size and location of output pixel (0,0)
            // The result of the second fprop_size call is discarded; presumably
            // both calls are made so that the modules record their sizes before
            // bprop_size is queried -- TODO confirm.
00806       mfidxdim mf(f);
00807       mf = resizepp->fprop_size(mf);
00808       merger_net_included->fprop_size(mf);
            // m0m: what a single merger output element maps back to on the
            // merger's inputs.
00809       m0m = merger->bprop_size(m);
00810       EDEBUG(merger_net->name() << " m0m: " << m0m);
00811       mfidxdim scales = merger->get_scales();
00812       // EDEBUG("strides: " << strides);
            // per-input results, filled by the loop below
00813       vector<vector<int> > alloff;
00814       mfidxdim allstrides;
            // strides measured for the first entry (k == 0), used as reference
00815       float hs0 = 1, ws0 = 1;
00816       for (uint k = 0; k < m0m.size(); ++k) {
00817         //uint i = k - (k % 2);
              // NOTE: this 'i' shadows the outer loop index until the end of this
              // inner loop body.
00818         uint i = k;
              // mm has one slot per entry of m0m, only slot i is set, so the
              // bprop_size calls below trace that single entry.
00819         mfidxdim mm(m0m.size());
00820         mm.set_new(m0m[i], i);
00821         EDEBUG("mm: " << mm);
00822         // determine input size and location of output pixel (0,0)
00823         //merger_net_included->fprop_size(mf);
00824         m0 = merger_net->bprop_size(mm);
00825         // m0 = resizepp->bprop_size(m0);
00826         m0.remove_empty();
00827         // determine input size and location of output pixel (1,1)
00828         mm[i].setoffset(1, 1);
00829         mm[i].setoffset(2, 1);
00830         //merger_net_included->fprop_size(mf);
00831         m1 = merger_net->bprop_size(mm);
00832         // m1 = resizepp->bprop_size(m1);
00833         m1.remove_empty();
00834         EDEBUG("m0: " << m0);
00835         EDEBUG("m1: " << m1);
00836
00837
00838         //      uint fact = (uint) ceil(strides.size() / (float) m0.size());
00839         // c0 = m0[i / fact];
00840         // c1 = m1[i / fact];
              // first entry left after remove_empty(), for pixel (0,0) and (1,1)
00841         c0 = m0[0];
00842         c1 = m1[0];
00843         //fidxdim &stride = strides[i];
00844         // determine center of output pixel (0,0) in input space
00845         rect<float> p0(c0.offset(1), c0.offset(2), c0.dim(1), c0.dim(2));
00846         float hc = p0.hcenter(), wc = p0.wcenter();
00847
00848         // // determine input pixel (0,0) in output space
00849         // fidxdim i0(1, 1)
00850
00851
00852         // if (hc < 0) {
00853         //   eblwarn("expected center's height to be >= 0 but got " << hc);
00854         //   hc = 1;
00855         // }
00856         // if (wc < 0) {
00857         //   eblwarn("expected center's width to be >= 0 but got " << wc);
00858         //   wc = 1;
00859         // }
00860         // determine stride of output space in input space
              // i.e. how far the traced input region moves when the output pixel
              // moves from (0,0) to (1,1)
00861         float hs = (c1.offset(1) - c0.offset(1));// / scales[i].dim(0);
00862         float ws = (c1.offset(2) - c0.offset(2));// / scales[i].dim(0);
00863         // if (k == 0) {
00864         //   hs = hs0 / (scales[i].dim(0));
00865         //   ws = ws0 / (scales[i].dim(0));
00866         //   hs0 = hs;
00867         //   ws0 = ws;
00868         // } else {
00869         //   hs = hs0 / (scales[i].dim(0) * hs);
00870         //   ws = ws0 / (scales[i].dim(0) * ws);
00871         // }
00872
00873         if (k == 0) {
00874           hs0 = hs;
00875           ws0 = ws;
00876         }
              // stride handed to the merger: reference stride divided by this
              // entry's scale times its own stride.
              // NOTE(review): hs, ws or scales[i].dim(0) equal to 0 would make
              // these divisions (and hc/hs, wc/ws below) non-finite -- confirm this
              // cannot happen.
00877         float hos = hs0 / (scales[i].dim(0) * hs);
00878         float wos = ws0 / (scales[i].dim(0) * ws);
00879         fidxdim fi(hos, wos);
00880         allstrides.push_back_new(fi);
00881
00882         // set paddings of merger
00883         //fidxdim pads(hc * hs, wc * ws, hc * hs, wc * ws);
00884
              // 4 offsets per entry: the pair (hc/hs, wc/ws) truncated to int and
              // pushed twice
00885         vector<int> offs;
00886
00887         offs.push_back((int)(hc/hs));
00888         offs.push_back((int)(wc/ws));
00889         offs.push_back((int)(hc/hs));
00890         offs.push_back((int)(wc/ws));
00891
00892         // offs.push_back((int)(hc*hos*scales[i].dim(0)));
00893         // offs.push_back((int)(wc*wos*scales[i].dim(0)));
00894         // offs.push_back((int)(hc*hos*scales[i].dim(0)));
00895         // offs.push_back((int)(wc*wos*scales[i].dim(0)));
00896
00897         // offs.push_back((int)(hc*hos/hs0));
00898         // offs.push_back((int)(wc*wos/ws0));
00899         // offs.push_back((int)(hc*hos/hs0));
00900         // offs.push_back((int)(wc*wos/ws0));
00901         alloff.push_back(offs);
00902         // fidxdim pads(stride.dim(0) * hc / hs, stride.dim(1) * wc / ws,
00903         //           stride.dim(0) * hc / hs, stride.dim(1) * wc / ws);
00904         // fidxdim pads(stride.dim(0) * hc / hs, stride.dim(1) * wc / ws, 0, 0);
00905         //      paddings.push_back_new(pads);
00906         mout << merger->name() << "'s input " << i << " must be padded/narrowed with "
00907              << offs << " to recenter " << p0 << " (center " << hc << "x" << wc
00908              << "), (output stride is " << hs << "x" << ws << ")" << std::endl;
00909       }
            // hand the per-entry offsets and strides to the merger
00910       merger->set_offsets(alloff);
00911       merger->set_strides(allstrides);
00912     }
00913   }
00914
00915   template <typename T, class Tstate>
00916   void detector<T,Tstate>::get_corners(mstate<Tstate> &outputs) {
          // Fills the per-output corner lists (itl, itr, ibl, ibr and, when
          // inferring, pptl, pptr, ppbl, ppbr and scale_indices) once; later calls
          // do nothing because of the 'corners_infered' flag.
          //   corners_inference == 0: infer the corners through the network
          //   corners_inference == 1: same, and also save them to "corners.mat"
          //   corners_inference == 2: load them from "corners.mat"
          // Any other value leaves everything untouched and the flag unset.
          // outputs: network outputs; one set of corners is produced per element
          //          when inferring.
00917     if (!corners_infered) {
00918       if (corners_inference == 0 || corners_inference == 1) { // infer from net
              // n: index of the output currently processed
              // NOTE(review): only scale_indices is cleared here, the corner lists
              // are appended to; this relies on the branch running a single time.
00919         uint n = 0;
00920         scale_indices.clear();
00921         for (typename mstate<Tstate>::iterator o = outputs.begin();
00922              o != outputs.end(); ++o) {
00923           fidxdim d(o->x.get_idxdim());
00924           fidxdim c(1, 1, 1), mc0;
                // mc has one slot per output and only slot n is set, so each
                // bprop_size below traces a single element of output n.
00925           mfidxdim mc(outputs.size());
00926           mc.set_new(c, n);
00927           mfidxdim m;
00928           // top left
00929           m = thenet.bprop_size(mc);
00930           m.remove_empty();
00931           mc0 = m[0];
00932           itl.push_back_new(mc0);
                // presumably get_msize() returns the sizes seen by resizepp during
                // the bprop_size call just above -- TODO confirm
00933           m = resizepp->get_msize();
00934           // infer scale index for this output
                // (index of the first entry of m that exists)
00935           for (uint i = 0; i < m.size(); ++i)
00936             if (m.exists(i)) {
00937               scale_indices.push_back(i);
00938               break ;
00939             }
00940           m.remove_empty();
00941           mc0 = m[0];
00942           pptl.push_back_new(mc0);
00943           // top right
                // NOTE(review): the offset used is d.dim(2), not d.dim(2) - 1;
                // confirm this is the intended position.
00944           mc[n].setoffset(2, d.dim(2));
00945           m = thenet.bprop_size(mc);
00946           m.remove_empty();
00947           mc0 = m[0];
00948           itr.push_back_new(mc0);
00949           m = resizepp->get_msize();
00950           m.remove_empty();
00951           mc0 = m[0];
00952           pptr.push_back_new(mc0);
00953           // bottom left
00954           mc[n].setoffset(1, d.dim(1));
00955           mc[n].setoffset(2, 0);
00956           m = thenet.bprop_size(mc);
00957           m.remove_empty();
00958           mc0 = m[0];
00959           ibl.push_back_new(mc0);
00960           m = resizepp->get_msize();
00961           m.remove_empty();
00962           mc0 = m[0];
00963           ppbl.push_back_new(mc0);
00964           // bottom right
00965           mc[n].setoffset(1, d.dim(1));
00966           mc[n].setoffset(2, d.dim(2));
00967           m = thenet.bprop_size(mc);
00968           m.remove_empty();
00969           mc0 = m[0];
00970           ibr.push_back_new(mc0);
00971           m = resizepp->get_msize();
00972           m.remove_empty();
00973           mc0 = m[0];
00974           ppbr.push_back_new(mc0);
00975           ++n;
00976         }
00977         EDEBUG("top left output " << itl);
00978         EDEBUG("top right output " << itr);
00979         EDEBUG("bottom left output " << ibl);
00980         EDEBUG("bottom right output " << ibr);
00981
00982         if (corners_inference == 1) { // from net + save corners
00983           // save corners to matrix
                // layout: (output index, corner, field) with corners ordered
                // tl, tr, bl, br and fields offset(1), offset(2), dim(1), dim(2)
00984           idx<float> scorners(itl.size(), 4, 4);
00985           for (uint i = 0; i < itl.size(); ++i) {
00986             scorners.set(itl[i].offset(1), i, 0, 0);
00987             scorners.set(itl[i].offset(2), i, 0, 1);
00988             scorners.set(itl[i].dim(1), i, 0, 2);
00989             scorners.set(itl[i].dim(2), i, 0, 3);
00990             scorners.set(itr[i].offset(1), i, 1, 0);
00991             scorners.set(itr[i].offset(2), i, 1, 1);
00992             scorners.set(itr[i].dim(1), i, 1, 2);
00993             scorners.set(itr[i].dim(2), i, 1, 3);
00994             scorners.set(ibl[i].offset(1), i, 2, 0);
00995             scorners.set(ibl[i].offset(2), i, 2, 1);
00996             scorners.set(ibl[i].dim(1), i, 2, 2);
00997             scorners.set(ibl[i].dim(2), i, 2, 3);
00998             scorners.set(ibr[i].offset(1), i, 3, 0);
00999             scorners.set(ibr[i].offset(2), i, 3, 1);
01000             scorners.set(ibr[i].dim(1), i, 3, 2);
01001             scorners.set(ibr[i].dim(2), i, 3, 3);
01002           }
01003           save_matrix(scorners, "corners.mat");
01004         }
01005         corners_infered = true;
01006       } else if (corners_inference == 2) { // load corners
01007         // load corners from matrix
              // (same layout as written by the saving branch above)
              // NOTE(review): only itl/itr/ibl/ibr are filled in this mode; the
              // pp* lists and scale_indices are not, although extract_bboxes reads
              // pptl[o], pptr[o], ppbl[o] and scale_indices[...] -- confirm they
              // are populated elsewhere when loading.
01008         idx<float> corners = load_matrix<float>("corners.mat");
01009         itl.clear(); itr.clear(); ibl.clear(); ibr.clear();
01010         for (uint i = 0; i < corners.dim(0); ++i) {
01011           // allocate
                // (d takes the order of the first output, all dimensions set to 1)
01012           fidxdim d(outputs[0].x.get_idxdim());
01013           d.setdims(1);
01014           itl.push_back_new(d);
01015           itr.push_back_new(d);
01016           ibl.push_back_new(d);
01017           ibr.push_back_new(d);
01018           // set
01019           itl[i].setoffset(1, corners.get(i, 0, 0));
01020           itl[i].setoffset(2, corners.get(i, 0, 1));
01021           itl[i].setdim(1, corners.get(i, 0, 2));
01022           itl[i].setdim(2, corners.get(i, 0, 3));
01023           itr[i].setoffset(1, corners.get(i, 1, 0));
01024           itr[i].setoffset(2, corners.get(i, 1, 1));
01025           itr[i].setdim(1, corners.get(i, 1, 2));
01026           itr[i].setdim(2, corners.get(i, 1, 3));
01027           ibl[i].setoffset(1, corners.get(i, 2, 0));
01028           ibl[i].setoffset(2, corners.get(i, 2, 1));
01029           ibl[i].setdim(1, corners.get(i, 2, 2));
01030           ibl[i].setdim(2, corners.get(i, 2, 3));
01031           ibr[i].setoffset(1, corners.get(i, 3, 0));
01032           ibr[i].setoffset(2, corners.get(i, 3, 1));
01033           ibr[i].setdim(1, corners.get(i, 3, 2));
01034           ibr[i].setdim(2, corners.get(i, 3, 3));
01035         }
01036         corners_infered = true;
01037       }
01038     }
01039   }
01040
01041   template <typename T, class Tstate>
01042   void detector<T,Tstate>::extract_bboxes(T threshold, bboxes &bbs) {
01043     bbox::init_instance_id(); // reset unique ids to start from zero.
01044     // make a list that contains the results
01045     double original_h = indim.dim(1);
01046     double original_w = indim.dim(2);
01047     intg offset_h = 0, offset_w = 0;
01048     int scale_index = 0;
01049     // get 4 corners coordinates for each scale
01050     mstate<Tstate> &oo = outputs[0];
01051     answers.clear();
01052     get_corners(oo);
01053
01054     // loop on output
01055     for (uint o = 0; o < oo.size(); ++o) {
01056       if (o < raw_thresholds.size()) threshold = raw_thresholds[o];
01057       float thresh = threshold;
01058       // Tstate &input = ppinputs[0][0];
01059       Tstate &output = oo[o];
01060       idx<T> outx = output.x;
01061       fidxdim &tl = itl[o], &tr = itr[o], &bl = ibl[o];
01062       fidxdim &ptl = pptl[o], &ptr = pptr[o], &pbl = ppbl[o];
01063       // fidxdim &br = ibr[o];
01064
01065       // steps in input space
01066       double hf = (bl.offset(1) - tl.offset(1)) / outx.dim(1);
01067       double wf = (tr.offset(2) - tl.offset(2)) / outx.dim(2);
01068       // steps in preprocessed space
01069       double phf = (pbl.offset(1) - ptl.offset(1)) / outx.dim(1);
01070       double pwf = (ptr.offset(2) - ptl.offset(2)) / outx.dim(2);
01071
01072       // box scalings
01073       double hscaling = 1.0, wscaling = 1.0;
01074       if (o < bbox_scalings.size()) {
01075         fidxdim &scaling = bbox_scalings[o];
01076         hscaling = scaling.dim(0);
01077         wscaling = scaling.dim(1);
01078       }
01079
01080       bboxes bbtmp;
01081       // select elements
01082       // rect<int> &robbox = original_bboxes[0];
01083       // sizes
01084       // double in_h = (double) input.x.dim(1);
01085       // double in_w = (double) input.x.dim(2);
01086       // double out_h = (double) output.x.dim(1);
01087       // double out_w = (double) output.x.dim(2);
01088       // double neth = netdim.dim(1); // network's input height
01089       // double netw = netdim.dim(2); // network's input width
01090       // double scalehi = original_h / robbox.height; // input to original
01091       // double scalewi = original_w / robbox.width; // input to original
01092       // int image_h0 = (int) (robbox.h0 * scalehi);
01093       // int image_w0 = (int) (robbox.w0 * scalewi);
01094       // offset factor in input map
01095       // double offset_h_factor = (in_h - neth) / std::max((double)1, (out_h - 1));
01096       // double offset_w_factor = (in_w - netw) / std::max((double)1, (out_w - 1));
01097       offset_w = 0;
01098       Tstate out(outx.get_idxdim());
01099       answer->fprop(output, out);
01100       answers.push_back_new(out);
01101
01102       idx<T> tmp = outx.select(0, 1);
01103       cout << "out " << o << " threshold " << thresh << " min " << idx_min(tmp)
01104            << " max " << idx_max(tmp) << endl;
01105
01106       // loop on width
01107       idx_eloop1(ro, out.x, T) {
01108         offset_h = 0;
01109         // loop on height
01110         idx_eloop1(roo, ro, T) {
01111           int classid = (int) roo.get(0);
01112           float conf = (float) roo.get(1);
01113           bool accept = false;
01114           // select decision criterion
01115           switch (bbox_decision) {
01116           case 0: accept = (conf >= thresh && classid != bgclass); break ;
01117           case 1: accept = ((offset_h == outx.dim(1) - 1 && offset_w == 0) ||
01118                             (offset_h == 0 && offset_w == 0) ||
01119                             (offset_h == outx.dim(1) - 1
01120                              && offset_w == outx.dim(2) - 1) ||
01121                             (offset_h == 0 && offset_w == outx.dim(2) - 0));
01122             break;
01123           case 2: accept = ((offset_h == outx.dim(1) - 1
01124                              && offset_w == outx.dim(2) - 1));
01125             break;
01126           default: eblerror("unknown bbox decision type");
01127           }
01128           if (accept) {
01129             bbox bb;
01130             bb.class_id = classid; // Class
01131             bb.confidence = conf; // Confidence
01132             bb.iscale_index = scale_indices[scale_index]; // scale index
01133             bb.oscale_index = scale_index; // scale index
01134
01135             bb.h0 = tl.offset(1) + offset_h * hf;
01136             bb.w0 = tl.offset(2) + offset_w * wf;
01137             bb.height = tl.dim(1);
01138             bb.width = tl.dim(2);
01139             bb.scale_centered(hscaling, wscaling);
01140
01141             bb.i.h0 = ptl.offset(1) + offset_h * phf;
01142             bb.i.w0 = ptl.offset(2) + offset_w * pwf;
01143             bb.i.height = ptl.dim(1);
01144             bb.i.width = ptl.dim(2);
01145
01146             // // predicted offsets / scale
01147             // float hoff = 0, woff = 0, scale = 1.0;
01148             // if (scaler_mode) {
01149             //   scale = (float) roo.gget(2);
01150             //   if (roo.dim(0) == 5) { // class,conf,scale,h,w
01151             //  hoff = roo.gget(3) * neth;
01152             //  woff = roo.gget(4) * neth;
01153             //   }
01154             //   // cap scale
01155             //   scale = std::max(min_scale_pred, std::min(max_scale_pred, scale));
01156             //   scale = 1 / scale;
01157             // }
01158             // EDEBUG(roo.str());
01159             // // original box in input map
01160             // bb.iheight = (int) in_h; // input h
01161             // bb.iwidth = (int) in_w; // input w
01162             // bb.i0.h0 = (float) (offset_h * offset_h_factor);
01163             // bb.i0.w0 = (float) (offset_w * offset_w_factor);
01164             // bb.i0.height = (float) neth;
01165             // bb.i0.width = (float) netw;
01166             // output map
01167             // bb.oheight = (int) out_h; // output height
01168             // bb.owidth = (int) out_w; // output width
01169             bb.o.h0 = offset_h; // answer height in output
01170             bb.o.w0 = offset_w; // answer height in output
01171             bb.o.height = 1;
01172             bb.o.width = 1;
01173             // // bb.o.h0 = 0;
01174             // // bb.o.w0 = 0;
01175             // // bb.o.h0 = out_h - 1;
01176             // // bb.o.w0 = out_w - 1;
01177
01178             // // transformed box in input map
01179             // bb.i.h0 = bb.i0.h0 + hoff;
01180             // bb.i.w0 = bb.i0.w0 + woff;
01181             // bb.i.height = bb.i0.height;
01182             // bb.i.width = bb.i0.width;
01183             // if (scale != 1.0)
01184             //   bb.i.scale_centered(scale, scale);
01185
01186             // // infer original location through network
01187             // idxdim d(1, bb.o.height, bb.o.width);
01188             // d.setoffset(1, bb.o.h0);
01189             // d.setoffset(2, bb.o.w0);
01190             // mfidxdim md(d);
01191             // mfidxdim d2 = thenet.bprop_size(md);
01192             // fidxdim loc = d2[0];
01193             // bb.i.h0 = loc.offset(1);
01194             // bb.i.w0 = loc.offset(2);
01195             // bb.i.height = loc.dim(1);
01196             // bb.i.width = loc.dim(2);
01197
01198             // // add all input boxes
01199             // for (uint q = 0; q < d2.size(); ++q)
01200             //   bb.mi.push_back(rect<float>(d2[q].offset(1), d2[q].offset(2),
01201             //                            d2[q].dim(1), d2[q].dim(2)));
01202
01203             // // bb.h0 = loc.offset(1) * scalehi;
01204             // // bb.w0 = loc.offset(2) * scalewi;
01205             // // bb.height = loc.dim(1) * scalehi;
01206             // // bb.width = loc.dim(2) * scalewi;
01207
01208
01209             // // original image
01210             // // bbox's rectangle in original image
01211             // // bb.h0 = bb.i.h0 * scalehi;
01212             // // bb.w0 = bb.i.w0 * scalewi;
01213             // bb.h0 = bb.i.h0 * scalehi - image_h0;
01214             // bb.w0 = bb.i.w0 * scalewi - image_w0;
01215             // bb.height = bb.i.height * scalehi;
01216             // bb.width = bb.i.width * scalewi;
01217
01218             bool ignore = false;
01219             if (ignore_outsiders) { // ignore boxes that overlap outside
01220               if (bb.h0 < 0 || bb.w0 < 0
01221                   || bb.h0 + bb.height > original_h
01222                   || bb.w0 + bb.width > original_w)
01223                 ignore = true;
01224             }
01225
01226             // push bbox to list
01227             if (!ignore)
01228               bbtmp.push_back(new bbox(bb));
01229           }
01230           offset_h++;
01231         }
01232         offset_w++;
01233       }
01234       // add scale boxes into all boxes
01235       for (uint k = 0; k < bbtmp.size(); ++k)
01236         bbs.push_back(bbtmp[k]);
01237       scale_index++;
01238     }
01239   }
01240
01241   template <typename T, class Tstate> template <class Tin>
01242   bboxes& detector<T,Tstate>::fprop(idx<Tin> &img, const char *frame_name) {
          // Full detection pass on one image: prepare the inputs, run the network,
          // smooth its outputs, then (unless 'bboxes_off' is set) extract boxes,
          // sort them by confidence, run non-maximum suppression, optionally print
          // and save them, and feed them to the "back" module when there is one.
          // img:        image to process
          // frame_name: name handed to prepare() and used when saving boxes
          // Returns raw_bboxes, untouched by this call, when 'bboxes_off' is set;
          // pruned_bboxes otherwise.
01243     TIMING1("t1 before prepare");
01244     TIMING2("t2 before prepare");
01245     TIMING_RESIZING_RESET();
          // set up the image and its resolutions, then run the network on them
01246     prepare(img, frame_name);
01247     TIMING2("preparation");
01248     multi_res_fprop();
01249     TIMING2("net fprop");
01250     TIMING1("end of network");
01251     TIMING_RESIZING("total resizing time");
01252     smooth_outputs();
01253
          // box extraction disabled: stop here
01254     if (bboxes_off) {
01255       return raw_bboxes;
01256     }
01257
          // rebuild the raw box list, most confident boxes first
01258     raw_bboxes.clear();
01259     if (answer) {
01260       extract_bboxes(pre_threshold, raw_bboxes);
01261     }
01262     raw_bboxes.sort_by_confidence();
01263     TIMING1("extract bboxes");
01264
          // prune, then report and save as configured
01265     fprop_nms(raw_bboxes, pruned_bboxes);
01266     if (!silent) {
01267       mout << "found " << pruned_bboxes.pretty(&labels);
01268     }
01269     if (save_mode) {
01270       save_bboxes(pruned_bboxes, save_dir, frame_name);
01271     }
01272
          // backward connections: pass the boxes to the module named "back"
01273     layers<T,Tstate> &net_layers = (layers<T,Tstate>&) thenet;
01274     back_module<T,Tstate> *feedback =
01275       (back_module<T,Tstate>*) net_layers.find("back");
01276     if (feedback) feedback->bb(pruned_bboxes);
01277
01278     TIMING1("end bboxes");
01279     TIMING2("post proc");
01280     return pruned_bboxes;
01281   }
01284
01285   template <typename T, class Tstate>
01286   void detector<T,Tstate>::fprop_nms(bboxes &in, bboxes &out) {
          // Prunes 'in' into 'out' with the 'pnms' module; without such a module,
          // 'out' simply becomes a copy of 'in'.
01287     if (!pnms) { out = in; return ; }
01288     pnms->fprop(in, out);
01289   }
01290
01291   // bboxes operations /////////////////////////////////////////////////////////
01292
01293   template <typename T, class Tstate>
01294   void detector<T,Tstate>::
01295   save_bboxes(bboxes &boxes, const string &dir, const char *frame_name) {
          // Saves the preprocessed input sample of each box as a matrix file named
          //   <dir>/preprocessed/<label>/<frame_name>_<label><counter><extension>
          // and increments save_counts for the box's class after every attempt.
          // The <dir>/original/<label>/ directory is created as well, but saving
          // crops of the original image is currently disabled.
          // boxes:      copied into a local list that is then passed as the 'out'
          //             argument of get_preprocessed(); presumably that call
          //             replaces it with the boxes matching the returned samples
          //             -- TODO confirm.
          // dir:        root directory of the saved files
          // frame_name: prefix of the file names
          // A sample that cannot be saved is reported on 'merr' and skipped.
01296     bboxes bbs = boxes;
01297 #ifdef __NOSTL__
01298     eblerror("save_bboxes not implemented");
01299 #else
01300     ostringstream fname;
          // directories are created at most once per class and per call
01301     vector<bool> dir_exists(labels.size(), false);
01302     string root = dir;
01303     root += "/";
01304     vector<string> dir_pp(labels.size(), root.c_str());
01305     vector<string> dir_orig(labels.size(), root.c_str());
01306
01307     // initialize directory names
01308     for (uint i = 0; i < labels.size(); ++i) {
01309       dir_pp[i] += "preprocessed/";
01310       dir_pp[i] += labels[i];
01311       dir_pp[i] += "/";
01312       dir_orig[i] += "original/";
01313       dir_orig[i] += labels[i];
01314       dir_orig[i] += "/";
01315     }
01316     svector<midx<T> > &pp = get_preprocessed(bbs, save_max_per_frame,
01317                                              diverse_ordering);
01318     // loop on bounding boxes
          // (sample i is described by box i; never read past either list)
01319     for (uint i = 0; i < pp.size() && i < bbs.size(); ++i) {
01320       midx<T> &sample = pp[i];
01321       const bbox &bb = bbs[i];
            // a class id without a label has no directory name: skip the box
            // rather than index the per-class vectors out of range
01322       if (bb.class_id < 0 || (size_t) bb.class_id >= labels.size()) {
01323         merr << "warning: not saving box with unknown class id "
01324              << bb.class_id << endl;
01325         continue ;
01326       }
            // NOTE(review): save_counts is indexed by class id as well; its size
            // is set outside of this function -- confirm it covers all labels.
01327       // check if directory exists for this class, otherwise create it
01328       if (!dir_exists[bb.class_id]) {
01329         mkdir_full(dir_pp[bb.class_id]);
01330         mkdir_full(dir_orig[bb.class_id]);
01331         dir_exists[bb.class_id] = true;
01332       }
01333       // preprocessed
01334       fname.str("");
01335       fname << dir_pp[bb.class_id]
01336             << frame_name << "_" << labels[bb.class_id] << setw(3)
01337             << setfill('0') << save_counts[bb.class_id] << MATRIX_EXTENSION;
01338       // make sure directory exists
01339       string d1 = dirname(fname.str().c_str());
01340       mkdir_full(d1);
01341       try {
01342         // save preprocessed image as lush mat
01343         if (save_matrices(sample, fname.str()))
01344           mout << "saved " << fname.str() << ": " << sample << " (confidence "
01345                << bb.confidence << ")" << endl;
01346       } catch (eblexception &) {
              // best effort: report the failure and go on with the other boxes
01347         merr << "warning: failed to save " << fname.str() << endl;
01348       }
01349       // increment file counter
01350       save_counts[bb.class_id]++;
01351     }
01352 #endif
01353   }
01383
01385   template <typename T, class Tstate>
01386   void detector<T,Tstate>::add_class(const char *name) {
          // Appends 'name' to the list of class labels and logs the new list.
          // A NULL name is reported through eblerror.
01387     if (name == NULL) { eblerror("cannot add empty class name"); }
01388     mout << "Adding class " << name << endl;
01389     labels.push_back(name);
01390     mout << "New class list is: " << labels << endl;
01391   }
01392
01394   // saving methods
01395
01396   template <typename T, class Tstate>
01397   uint detector<T,Tstate>::get_total_saved() {
          // Returns the sum of all entries of save_counts.
01398     uint saved = 0;
01399     size_t remaining = save_counts.size();
01400     while (remaining > 0) saved += save_counts[--remaining];
01401     return saved;
01402   }
01403
01404   template <typename T, class Tstate>
01405   string& detector<T,Tstate>::set_save(const string &directory, uint nmax,
01406                                        bool diverse) {
          // Turns on saving of detected regions and records its settings.
          // directory: where regions are saved (stored in save_dir)
          // nmax:      stored in save_max_per_frame
          // diverse:   stored in diverse_ordering
          // Returns a reference to the stored directory.
01407     save_dir = directory;
01408     save_max_per_frame = nmax;
01409     diverse_ordering = diverse;
01410     save_mode = true;
          // end of the second log line depends on the ordering mode
01411     const char *ordering_note = ".";
01412     if (diverse_ordering) ordering_note = " and ordering them by diversity.";
01413     mout << "Enabling saving of detected regions into: ";
01414     mout << save_dir << endl;
01415     mout << "Saving at most " << save_max_per_frame << " positive windows"
01416          << ordering_note << endl;
01417     return save_dir;
01418   }
01420
01422   template <typename T, class Tstate>
01423   vector<idx<T> >& detector<T,Tstate>::get_originals() {
          // Returns the regions of 'image' covered by the pruned boxes, as narrowed
          // views stored in 'odetections'. The list is rebuilt only while the
          // 'bodetections' flag is unset. Boxes of class 'bgclass' or 'mask_class'
          // are left out; boxes reaching outside of the image are left out with a
          // warning on 'merr'.
01424     if (!bodetections) {
01425       idx<T> crop;
01426       odetections.clear();
01427       for (size_t k = 0; k < pruned_bboxes.size(); ++k) {
01428         bbox &box = pruned_bboxes[k];
              // classes that never produce a region
01429         const bool skipped_class =
01430           (box.class_id == bgclass) || (box.class_id == mask_class);
01431         if (skipped_class) continue ;
              // the box must lie entirely inside the image
01432         const bool outside = box.h0 < 0 || box.w0 < 0
01433           || box.h0 + box.height > image.dim(1)
01434           || box.w0 + box.width > image.dim(2);
01435         if (outside) {
01436           merr << "warning: box " << box << "is out of bounds in original image "
01437                << image << endl;
01438           continue ;
01439         }
              // narrow dimension 1 (height) then dimension 2 (width) to the box
01440         crop = image.narrow(1, (int) box.height, (int) box.h0);
01441         crop = crop.narrow(2, (int) box.width, (int) box.w0);
01442         odetections.push_back(crop);
01443       }
01444       bodetections = true;
01445     }
01446     return odetections;
01447   }
01452
01454   template <typename T, class Tstate>
01455   midx<T> detector<T,Tstate>::get_preprocessed(const bbox &bb) {
          // Returns the preprocessed inputs (ppinputs[0]) padded to the sizes
          // reported by resizepp->get_msize(), after tracing the output cell of
          // 'bb' back through the network with bprop_size.
          // bb: box whose output position (bb.o) and output scale index
          //     (bb.oscale_index) select the cell to trace.
          // Raises an eblerror when resizepp reports a different number of sizes
          // than there are preprocessed inputs.
01456     mstate<Tstate> &pp_in = ppinputs[0];
01457     mstate<Tstate> &net_out = outputs[0];
          // 1x1x1 cell located at the box's position in the output map
01458     idxdim cell(1, 1, 1); //bb.oheight, bb.owidth);
01459     cell.setoffset(1, bb.o.h0);
01460     cell.setoffset(2, bb.o.w0);
          // one entry per output: the cell for the box's scale, empty elsewhere
01461     const uint selected = (uint) bb.oscale_index;
01462     mfidxdim request;
01463     for (uint s = 0; s < net_out.size(); ++s) {
01464       if (s != selected) request.push_back_empty();
01465       else request.push_back(cell);
01466     }
01467     mfidxdim traced = thenet.bprop_size(request);
01468     EDEBUG("get_preprocessed: bprop_size of " << request << " -> " << traced
01469           << " from outputs " << net_out << " to input " << pp_in);
          // get bboxes after the resizepp
01470     mfidxdim pp_dims = resizepp->get_msize();
01471     if (pp_dims.size() != pp_in.size())
01472       eblerror("expected same size dimensions and ins but got "
01473                << pp_dims.size() << " and " << pp_in.size());
01474     midx<T> padded(1);
01475     pp_in.get_padded_midx(pp_dims, padded);
01476     return padded;
01477   }
01478
01479   template <typename T, class Tstate>
01480   svector<midx<T> >& detector<T,Tstate>::
01481   get_preprocessed(bboxes &out, uint nmax, bool diverse, uint pre_diverse_max) {
01482     return get_preprocessed(pruned_bboxes, out, nmax, diverse, pre_diverse_max);
01483   }
01484
01485   template <typename T, class Tstate>
01486   svector<midx<T> >& detector<T,Tstate>::
01487   get_preprocessed(bboxes &in, bboxes &out, uint nmax, bool diverse,
01488                    uint pre_diverse_max) {
01489     // if (bppdetections) // recompute only if not up-to-date
01490     //   return ppdetections;
01491     idx<T> input;
01492     size_t i;
01493     size_t n = in.size();
01494     // limit number of samples fed to diversity if enabled
01495     if (diverse && pre_diverse_max > 0) {
01496       if (nmax > 0) pre_diverse_max = std::max(pre_diverse_max, nmax);
01497       n = std::min((size_t) pre_diverse_max, n);
01498     }
01499
01500     // clear vector
01501     ppdetections.clear();
01502     out.clear();
01503     // loop on bounding boxes
01504     for (i = 0; i < n; ++i) {
01505       bbox &bb = in[i];
01506       midx<T> all = get_preprocessed(bb);
01507       ppdetections.push_back_new(all);
01508       // outs.push_back(out);
01509       out.push_back(bb);
01510     }
01511     // diverse ordering
01512     if (diverse) out.sort_by_difference(ppdetections);
01513     // cap to n
01514     if (nmax > 0 && nmax < ppdetections.size()) {
01515       ppdetections.erase(ppdetections.begin() + nmax, ppdetections.end());
01516       out.erase(out.begin() + nmax, out.end());
01517     }
01518     // return
01519     bppdetections = true;
01520     return ppdetections;
01521   }
01522
01523   template <typename T, class Tstate>
01524   idx<T> detector<T,Tstate>::get_mask(string &classname) {
01525     int id = get_class_id(classname);
01526     idxdim d(image.dim(1), image.dim(2));
01527     if (mask.get_idxdim() != d)
01528       mask = idx<T>(d);
01529     if (id == -1) { // class not found
01530       merr << "warning: unknown class " << classname << endl;
01531       idx_clear(mask);
01532       return mask;
01533     }
01534     eblerror("get_mask temporarly broken, outputs is now multi-state");
01535     // // merge all outputs of class 'id' into mask
01536     // for (uint i = 0; i < ppinputs.size(); ++i) {
01537     //   Tstate &ppin = (*ppinputs[i])[0];
01538     //   idx<T> in = ppin.x.select(0, 0);
01539     //   idx<T> out = outputs[i]->x.select(0, id);
01540     //   rect<int> ob = original_bboxes[i];
01541     //   // resizing to inputs, then to original input, to avoid precision loss
01542     //   out = image_resize(out, in.dim(0), in.dim(1), 1);
01543     //   out = out.narrow(0, ob.height, ob.h0);
01544     //   out = out.narrow(1, ob.width, ob.w0);
01545     //   out = image_resize(out, mask.dim(0), mask.dim(1), 1);
01546     //   if (i++ == 0)
01547     //  idx_copy(out, mask);
01548     //   else
01549     //  idx_max(mask, out, mask);
01550     // }
01551     return mask;
01552   }
01553
01555   // processing
01556
01557   template <typename T, class Tstate> template <class Tin>
01558   void detector<T,Tstate>::prepare(idx<Tin> &img, const char *frame_name) {
01559     // tell detections vectors they are not up-to-date anymore
01560     bodetections = false;
01561     bppdetections = false;
01562     // deep copy to cast input into net's type and move channels to 1st dim
01563     if (img.order() == 2) { // 1 channel only
01564       image = idx<T>(1, img.dim(0), img.dim(1));
01565       idx<T> tmp = image.select(0, 0);
01566       idx_copy(img, tmp);
01567     } else if (img.order() >= 3) { // multiple channels
01568       idx<Tin> tmp = img.shift_dim(2, 0);
01569       image = idx<T>(tmp.get_idxdim());
01570       idx_copy(tmp, image);
01571     } else
01572       eblerror("expected at least 2 dimensions in input but got " << img);
01573     // if input size had changed, reinit resolutions
01574     if (!initialized ||
01575         (!(indim == image.get_idxdim()) && restype != NETWORK)) {
01576       init(image.get_idxdim(), frame_name);
01577     }
01578   }
01579
01580   template <typename T, class Tstate>
01581   void detector<T,Tstate>::prepare_scale(uint i) {
01582     if (i >= scales.size())
01583       eblthrow("cannot request scale " << i << ", there are only "
01584                << nscales << " scales");
01585     // select input/outputs buffers
01586     //    output = outputs[0];
01587     if (!mem_optimization || keep_inputs) // we use different bufs for each i
01588       input = &finput;
01589     else
01590       input = minput;
01591     // set resizing of current scale
01592     idxdim d = scales[i];
01593     resizepp->set_dimensions(d.dim(1), d.dim(2));
01594     // // save actual resolutions
01595     // fidxdim tmp = d;
01596     // idxdim actual = thenet.fprop_size(tmp);
01597     // actual = thenet.bprop_size(actual);
01598     // actual_scales[i] = actual;
01599     // EDEBUG("requested resolution " << d << " at scale " << i
01600     //    << ": actual res " << actual);
01601   }
01602
01603   template <typename T, class Tstate>
01604   void detector<T,Tstate>::multi_res_fprop() {
01605     // timing
01606     timer t;
01607     t.start();
01608     for (uint i = 0; i < scales.size(); ++i) {
01609       prepare_scale(i);
01610       input->x = image; // put image in input state
01611       // keep a copy of preprocess' output if displaying
01612       if (!mem_optimization || keep_inputs)
01613         resizepp->set_output_copy(ppinputs[i]);
01614       // fprop
01615       mstate<Tstate> &out = outputs[0];
01616       thenet.fprop(*input, out);
01617       EDEBUG("detector outputs: " << out);
01618       // outputs dumping
01619       if (!outputs_dump.empty()) {
01620         string fname = outputs_dump;
01621         if (out.size() == 1) {
01622           idx<T> &o = out[0].x;
01623           fname << "_" << o << ".mat";
01624           save_matrix(o, fname);
01625           mout << "Saved " << fname << " (" << o << ", min: " << idx_min(o)
01626                << ", max: " << idx_max(o) << ")" << endl;
01627         } else {
01628           // TODO: write code to save multi-state x components
01629         }
01630       }
01631       // memorize original input's bbox in resized input
01632       rect<int> &bbox = original_bboxes[i];
01633       rect<int> bb = resizepp->get_original_bbox();
01634       bbox.h0 = bb.h0;
01635       bbox.w0 = bb.w0;
01636       bbox.height = bb.height;
01637       bbox.width = bb.width;
01638
01639 // #ifdef __DUMP_STATES__
01640 //       DUMP(output->x, "detector_output_");
01641 // #endif
01642
01643       if (optimization_swap) { // swap output and input
01644         eblerror("mem optimization temporarly broken because out is now mstate");
01645         // tmp = input;
01646         // input = output;
01647         // output = tmp;
01648       }
01649     }
01650     if (!silent) mout << "net_processing=" << t.elapsed_ms() << endl;
01651   }
01652
01653 } // end namespace ebl
01654
01655 #endif