libeblearn
|
00001 /*************************************************************************** 00002 * Copyright (C) 2010 by Pierre Sermanet * 00003 * pierre.sermanet@gmail.com * 00004 * All rights reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions are met: 00008 * * Redistributions of source code must retain the above copyright 00009 * notice, this list of conditions and the following disclaimer. 00010 * * Redistributions in binary form must reproduce the above copyright 00011 * notice, this list of conditions and the following disclaimer in the 00012 * documentation and/or other materials provided with the distribution. 00013 * * Redistribution under a license not approved by the Open Source 00014 * Initiative (http://www.opensource.org) must display the 00015 * following acknowledgement in all advertising material: 00016 * This product includes software developed at the Courant 00017 * Institute of Mathematical Sciences (http://cims.nyu.edu). 00018 * * The names of the authors may not be used to endorse or promote products 00019 * derived from this software without specific prior written permission. 00020 * 00021 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED 00022 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 00023 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 00024 * DISCLAIMED. IN NO EVENT SHALL ThE AUTHORS BE LIABLE FOR ANY 00025 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 00026 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00027 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 00028 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00029 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 ***************************************************************************/ 00032 00033 #ifndef DETECTOR_HPP 00034 #define DETECTOR_HPP 00035 00036 #include "numerics.h" 00037 00038 #ifndef __NOSTL__ 00039 #include <algorithm> 00040 #include <typeinfo> 00041 #include <iomanip> 00042 #endif 00043 00044 using namespace std; 00045 00046 namespace ebl { 00047 00048 template <typename T, class Tstate> 00049 detector<T,Tstate>:: 00050 detector(module_1_1<T,Tstate> &thenet_, vector<string> &labels_, 00051 answer_module<T,T,T,Tstate> *answer_, 00052 resizepp_module<T,Tstate> *resize, const char *background, 00053 std::ostream &o, std::ostream &e, 00054 bool adapt_scales_) 00055 : thenet(thenet_), resizepp(resize), resizepp_delete(false), 00056 input(NULL), minput(NULL), netdim_fixed(false), 00057 bgclass(-1), mask_class(-1), pnms(NULL), scales_step(0), min_scale(1.0), 00058 max_scale(1.0), restype(ORIGINAL), silent(false), save_mode(false), 00059 save_dir(""), save_counts(labels_.size(), 0), min_size(0), max_size(0), 00060 bodetections(false), bppdetections(false), mem_optimization(false), 00061 optimization_swap(false), keep_inputs(true), hzpad(0), wzpad(0), 00062 mout(o), merr(e), smoothing_type(0), initialized(false), 00063 bboxes_off(false), adapt_scales(adapt_scales_), answer(answer_), 00064 ignore_outsiders(false), corners_inference(0), corners_infered(false), 00065 pre_threshold(0), bbox_decision(0) { 00066 // // make sure the top module is an answer module 00067 // module_1_1<T,Tstate> *last = thenet.last_module(); 00068 // if (!dynamic_cast<answer_module<T,Tstate>*>(last)) 00069 // eblerror("expected last module to be of type answer_module but found: " 00070 // << last->name()); 00071 scaler_mode = false; 00072 if (answer && (dynamic_cast<scaler_answer<T,T,T,Tstate>*>(answer) || 00073 dynamic_cast<scalerclass_answer<T,T,T,Tstate>*>(answer))) 00074 scaler_mode = true; 00075 if (answer) mout << "Using answer module: " << answer->describe() << endl; 00076 // look for resize module in network 00077 if (!resizepp) { 00078 resizepp = arch_find(&thenet, resizepp); 00079 if (resizepp) mout << "Found a resizepp module in network: " 00080 << resizepp->describe() << endl; 00081 else mout << "No resizepp module found in network." << endl; 00082 } 00083 // set default resizing module 00084 if (!resizepp) { 00085 resizepp = new resizepp_module<T,Tstate>; 00086 cout << "Using default resizing module: " << resizepp->describe() << endl; 00087 resizepp_delete = true; 00088 } 00089 labels = labels_; 00090 mout << "Classes labels: " << labels << endl; 00091 // #ifdef __ANDROID__ // TODO: temporary 00092 // bgclass = 0; 00093 // #else 00094 //#endif 00095 // initilizations 00096 save_max_per_frame = limits<uint>::max(); 00097 diverse_ordering = false; 00098 // set outpout streams of network 00099 thenet.set_output_streams(o, e); 00100 update_merge_alignment(); 00101 } 00102 00103 template <typename T, class Tstate> 00104 detector<T,Tstate>::~detector() { 00105 if (resizepp_delete && resizepp) delete resizepp; 00106 if (pnms) delete pnms; 00107 } 00108 00109 template <typename T, class Tstate> 00110 void detector<T,Tstate>::set_scaling_original() { 00111 nscales = 1; 00112 restype = ORIGINAL; 00113 } 00114 00115 template <typename T, class Tstate> 00116 void detector<T,Tstate>::set_scaling_type(t_scaling type) { 00117 restype = type; 00118 mout << "Setting scaling to type " << type << " ("; 00119 switch (restype) { 00120 case MANUAL: mout << "MANUAL"; break ; 00121 case SCALES: mout << "SCALES"; break ; 00122 case NSCALES: mout << "NSCALES"; break ; 00123 case SCALES_STEP: mout << "SCALES_STEP"; break ; 00124 case ORIGINAL: mout << "ORIGINAL"; break ; 00125 case NETWORK: mout << "NETWORK"; break ; 00126 case SCALES_STEP_UP: mout << "SCALES_STEP_UP"; break ; 00127 default: 00128 eblerror("unknown type"); 00129 } 00130 mout << ")" << endl; 00131 } 00132 00133 template <typename T, class Tstate> 00134 void detector<T,Tstate>::set_resolutions(const midxdim &scales_) { 00135 restype = MANUAL; 00136 manual_scales = scales_; 00137 if (manual_scales.size() == 0) 00138 eblerror("expected at least 1 scale but found 0"); 00139 // add the feature dimension for each scale 00140 for (uint i = 0; i < manual_scales.size(); ++i) { 00141 idxdim &d = manual_scales[i]; 00142 d.insert_dim(0, 1); 00143 } 00144 } 00145 00146 template <typename T, class Tstate> 00147 void detector<T,Tstate>::set_resolutions(const vector<double> &factors) { 00148 restype = SCALES; 00149 scale_factors = factors; 00150 } 00151 00152 template <typename T, class Tstate> 00153 void detector<T,Tstate>::set_resolution(double factor) { 00154 restype = SCALES; 00155 scale_factors.clear(); 00156 scale_factors.push_back(factor); 00157 } 00158 00159 template <typename T, class Tstate> 00160 void detector<T,Tstate>::set_resolutions(int nscales_) { 00161 nscales = (uint) nscales_; 00162 restype = NSCALES; 00163 } 00164 00165 template <typename T, class Tstate> 00166 void detector<T,Tstate>::set_resolutions(double scales_step_, 00167 double max_scale_, 00168 double min_scale_) { 00169 restype = SCALES_STEP; 00170 scales_step = scales_step_; 00171 max_scale = max_scale_; 00172 min_scale = min_scale_; 00173 mout << "Multi resolution scales: step factor " << scales_step 00174 << ", min/max resolution factor " << min_scale << ", " << max_scale 00175 << endl; 00176 } 00177 00178 template <typename T, class Tstate> 00179 void detector<T,Tstate>::set_zpads(float hzpad_, float wzpad_) { 00180 if (hzpad_ != 0 || wzpad_ != 0) { 00181 if (!netdim_fixed) { 00182 fidxdim minodim(1, 1, 1); // min output dims 00183 netdim = thenet.bprop_size(minodim); // compute min input dims 00184 } 00185 hzpad = (uint) (hzpad_ * netdim.dim(1)); 00186 wzpad = (uint) (wzpad_ * netdim.dim(2)); 00187 resizepp->set_zpads(hzpad, wzpad); 00188 mout << "Adding zero padding on input (on each side): hpad: " 00189 << hzpad << " wpad: " << wzpad << endl; 00190 if (hzpad_ > 1 || wzpad_ > 1) 00191 eblerror("zero padding coeff should be in [0 1] range"); 00192 } 00193 } 00194 00195 template <typename T, class Tstate> 00196 int detector<T,Tstate>::get_class_id(const string &name) { 00197 for (uint i = 0; i < labels.size(); ++i) 00198 if (!strcmp(labels[i].c_str(), name.c_str())) 00199 return i; 00200 return -1; 00201 } 00202 00203 template <typename T, class Tstate> 00204 void detector<T,Tstate>::set_bgclass(const char *bg) { 00205 string name; 00206 00207 if (bg) 00208 name = bg; 00209 else 00210 name = "bg"; // default name 00211 bgclass = get_class_id(name); 00212 if (bgclass != -1) { 00213 mout << "Background class is \"" << name << "\" with id " << bgclass; 00214 mout << "." << endl; 00215 } else if (bg) 00216 merr << "warning: background class \"" << bg << "\" not found." << endl; 00217 } 00218 00219 // TODO: handle more than 1 class 00220 template <typename T, class Tstate> 00221 bool detector<T,Tstate>::set_mask_class(const char *mask) { 00222 string name; 00223 00224 if (!mask) 00225 return false; 00226 name = mask; 00227 mask_class = get_class_id(name); 00228 if (mask_class != -1) { 00229 mout << "Mask class is \"" << name << "\" with id " << mask_class; 00230 mout << "." << endl; 00231 return true; 00232 } 00233 merr << "warning: mask class \"" << mask << "\" not found." << endl; 00234 return false; 00235 } 00236 00237 template <typename T, class Tstate> 00238 void detector<T,Tstate>::set_silent() { 00239 silent = true; 00240 } 00241 00242 template <typename T, class Tstate> 00243 void detector<T,Tstate>::set_max_resolution(uint max_size_) { 00244 uint mzpad = std::max(hzpad * 2, wzpad * 2); 00245 max_size = max_size_ + mzpad; 00246 mout << "Setting maximum input size to " << max_size_ << "x" 00247 << max_size_ << " (add twice max(hzpad,wzpad): " << mzpad 00248 << ")" << endl; 00249 } 00250 00251 template <typename T, class Tstate> 00252 void detector<T,Tstate>::set_min_resolution(uint min_size_) { 00253 mout << "Setting minimum input size to " << min_size_ << "x" 00254 << min_size_ << "." << endl; 00255 min_size = min_size_; 00256 } 00257 00258 template <typename T, class Tstate> 00259 void detector<T,Tstate>::set_raw_thresholds(vector<float> &t) { 00260 mout << "Using multiple thresholds for raw bbox extractions: " << t << endl; 00261 raw_thresholds = t; 00262 } 00263 00264 template <typename T, class Tstate> 00265 void detector<T,Tstate>:: 00266 set_nms(t_nms type, float pre_threshold_, float post_threshold, 00267 float pre_hfact, float pre_wfact, float post_hfact, float post_wfact, 00268 float woverh, float max_overlap, float max_hcenter_dist, 00269 float max_wcenter_dist, float vote_max_overlap, 00270 float vote_max_hcenter_dist, float vote_max_wcenter_dist) { 00271 pre_threshold = pre_threshold_; 00272 if (pnms) delete pnms; 00273 switch (type) { 00274 case nms_none: break ; // none 00275 case nms_overlap: // traditional overlap only 00276 pnms = new nms 00277 (post_threshold, max_overlap, max_hcenter_dist, max_wcenter_dist, 00278 pre_hfact, pre_wfact, post_hfact, post_wfact, woverh, mout, merr); 00279 break ; 00280 case nms_voting: // voting only 00281 pnms = new voting_nms 00282 (post_threshold, vote_max_overlap, vote_max_hcenter_dist, 00283 vote_max_wcenter_dist, 00284 pre_hfact, pre_wfact, post_hfact, post_wfact, woverh, mout, merr); 00285 break ; 00286 case nms_voting_overlap: // voting + traditional overlap 00287 pnms = new voting_nms 00288 (post_threshold, max_overlap, max_hcenter_dist, max_wcenter_dist, 00289 pre_hfact, pre_wfact, post_hfact, post_wfact, woverh, 00290 vote_max_overlap, vote_max_hcenter_dist, vote_max_wcenter_dist, 00291 mout, merr); 00292 break ; 00293 default: // unknown 00294 eblerror("unknown type of nms " << type); 00295 } 00296 mout << "Non-maximum suppression (nms): " 00297 << (pnms ? pnms->describe() : "none") << endl; 00298 } 00299 00300 template <typename T, class Tstate> 00301 void detector<T,Tstate>::set_scaler_mode(bool set) { 00302 scaler_mode = set; 00303 mout << "Scaler mode is " 00304 << (scaler_mode ? "enabled" : "disabled") << "." << endl; 00305 } 00306 00307 template <typename T, class Tstate> 00308 void detector<T,Tstate>::set_netdim(idxdim &d) { 00309 netdim = d; 00310 netdim.insert_dim(0, 1); 00311 netdim_fixed = true; 00312 mout << "Manually setting network's minimum input to " << d << endl; 00313 } 00314 00315 template <typename T, class Tstate> 00316 void detector<T,Tstate>::set_mem_optimization(Tstate &in, Tstate &out, 00317 bool keep_inputs_) { 00318 eblwarn("mem optimization temporarly broken because out is now mstate"); 00319 // mout << "Optimizing memory usage by using only 2 alternating buffers"; 00320 // mem_optimization = true; 00321 // keep_inputs = keep_inputs_; 00322 // mout << " (and " << (keep_inputs ? "":"not ") 00323 // << "keeping multi-scale inputs)"; 00324 // minput = ∈ 00325 // input = ∈ 00326 // output = &out; 00327 // // remember if we need to swap buffers because of odd operations. 00328 // optimization_swap = !thenet.optimize_fprop(*input, *output); 00329 // mout << endl; 00330 } 00331 00332 template <typename T, class Tstate> 00333 void detector<T,Tstate>::set_outputs_dumping(const char *name) { 00334 outputs_dump = name; 00335 } 00336 00337 template <typename T, class Tstate> 00338 void detector<T,Tstate>::set_bboxes_off() { 00339 bboxes_off = true; 00340 } 00341 00342 template <typename T, class Tstate> 00343 vector<string>& detector<T,Tstate>::get_labels() { 00344 return labels; 00345 } 00346 00347 template <typename T, class Tstate> 00348 void detector<T,Tstate>::set_ignore_outsiders() { 00349 ignore_outsiders = true; 00350 } 00351 00352 template <typename T, class Tstate> 00353 void detector<T,Tstate>::set_corners_inference(uint type) { 00354 mout << "Setting corners inference type to " << type << endl; 00355 corners_inference = type; 00356 } 00357 00358 template <typename T, class Tstate> 00359 void detector<T,Tstate>::set_bbox_decision(uint type) { 00360 bbox_decision = type; 00361 mout << "Setting bbox decision type to " << type << endl; 00362 } 00363 00364 template <typename T, class Tstate> 00365 void detector<T,Tstate>::set_bbox_scalings(mfidxdim &scalings) { 00366 bbox_scalings = scalings; 00367 mout << "Setting bbox scalings to " << bbox_scalings << endl; 00368 } 00369 00371 // initialization 00372 00373 template <typename T, class Tstate> 00374 void detector<T,Tstate>::init(idxdim &dsample, const char *frame_name) { 00375 initialized = true; 00376 indim = dsample; 00377 // the network's minimum input dimensions 00378 if (!netdim_fixed) 00379 netdim = network_mindims(thenet, dsample.order()); 00380 // mout << "Network's minimum input dimensions are: " << netdim 00381 // << thenet.pretty(netdim) << endl; 00382 // minimum input dimensions: factor of network's minimum input 00383 idxdim mindim = netdim * min_scale; 00384 // if (mindim.dim(1) + hzpad * 2 < netdim.dim(1)) 00385 // mindim.setdim(1, netdim.dim(1) - hzpad * 2); 00386 // if (mindim.dim(2) + wzpad * 2 < netdim.dim(2)) 00387 // mindim.setdim(2, netdim.dim(2) - wzpad * 2); 00388 mindim.setdim(0, dsample.dim(0)); // feature dimension is not scaled 00389 // maximum input dimensions: factor of original input 00390 idxdim maxdim = dsample * max_scale; 00391 for (uint i = 1; i < maxdim.order(); ++i) 00392 if (maxdim.dim(i) < netdim.dim(i)) 00393 maxdim.setdim(i, netdim.dim(i)); 00394 maxdim.setdim(0, dsample.dim(0)); // feature dimension is not scaled 00395 // determine scales 00396 compute_scales(scales, netdim, mindim, maxdim, dsample, restype, nscales, 00397 scales_step, frame_name); 00398 // reallocate buffers if number of scales has changed 00399 if (scales.size() != ppinputs.size()) { 00400 EDEBUG("reallocating input and output buffers"); 00401 DEBUGMEM_PRETTY("detector init scales"); 00402 ppinputs.clear(); 00403 outputs.clear(); 00404 actual_scales.clear(); 00405 // allocate buffers 00406 idxdim order(mindim); 00407 order.setdims(1); // minimum dims 00408 for (uint i = 0; i < scales.size(); ++i) { 00409 mstate<Tstate> *ppin = new mstate<Tstate>(); 00410 ppin->push_back(new Tstate(order)); 00411 ppinputs.push_back(ppin); 00412 mstate<Tstate> *ppout = new mstate<Tstate>(); 00413 ppout->push_back(new Tstate(order)); 00414 outputs.push_back(ppout); 00415 } 00416 DEBUGMEM_PRETTY("detector end of init scales"); 00417 // copy ideal scales to actual scales vector (to be modified later) 00418 actual_scales.copy(scales); 00419 } 00420 } 00421 00423 // scaling methods 00424 00425 template <typename T, class Tstate> 00426 void detector<T,Tstate>:: 00427 compute_scales(midxdim &scales, idxdim &netdim, idxdim &mindim, 00428 idxdim &maxdim, idxdim &indim, t_scaling type, uint nscales, 00429 double scales_step, const char *frame_name) { 00430 // fill scales based on scaling type 00431 scales.clear(); 00432 if (!silent) 00433 mout << "Scales: input: " << indim << " min: " << netdim 00434 << " max: " << maxdim << endl 00435 << "Scaling type " << type << ": "; 00436 switch (type) { 00437 case ORIGINAL: 00438 if (!silent) mout << "1 scale only, the image's original scale." << endl; 00439 scales.push_back(indim); 00440 break ; 00441 case MANUAL: 00442 scales = manual_scales; 00443 if (!silent) 00444 mout << "Manual specification of each scale size to: " << scales <<endl; 00445 break ; 00446 case SCALES: 00447 if (!silent) 00448 mout << "Manual specification of each scale factor applied to " 00449 << "original dimensions." << endl; 00450 compute_resolutions(scales, indim, scale_factors); 00451 break ; 00452 case NSCALES: // n scale between min and max resolutions 00453 if (!silent) 00454 mout << nscales << " scales between min (" << netdim 00455 << ") and max (" << maxdim << ") scales." << endl; 00456 compute_resolutions(scales, netdim, maxdim, nscales); 00457 break ; 00458 case SCALES_STEP: // step fixed amount from scale from max down to min 00459 if (!silent) 00460 mout << "Scale step of " << scales_step << " from max (" << maxdim 00461 << ") down to min (" << mindim << ") scale." << endl; 00462 compute_resolutions(scales, mindim, maxdim, scales_step); 00463 break ; 00464 case SCALES_STEP_UP: // step fixed amount from scale min up to max 00465 if (!silent) 00466 mout << "Scale step of " << scales_step << " from min (" << mindim 00467 << ") up to max (" << maxdim << ") scale." << endl; 00468 compute_resolutions_up(scales, indim, mindim, maxdim, scales_step); 00469 break ; 00470 case NETWORK: 00471 if (!silent) 00472 mout << "Resize all inputs to network's minimal size" << endl; 00473 scales.push_back(netdim); 00474 break ; 00475 default: eblerror("unknown scaling mode"); 00476 } 00477 // limit scales with max_size 00478 for (midxdim::iterator i = scales.begin(); i != scales.end(); ) { 00479 idxdim d = *i; 00480 if (max_size > 0 && (d.dim(1) > max_size || d.dim(2) > max_size)) { 00481 scales.erase(i); 00482 mout << "removing scale " << d << " because of max size " << max_size 00483 << endl; 00484 } else i++; 00485 } 00486 // initialize original bboxes to entire image 00487 rect<int> bb(0, 0, indim.dim(1), indim.dim(2)); 00488 for (uint i = 0; i < scales.size(); ++i) 00489 original_bboxes.push_back(bb); 00490 // print scales 00491 mout << "Detection initialized to "; 00492 if (adapt_scales) mout << "(network-adapted scales) "; 00493 if (scales.size() == 0) mout << "0 resolutions." << endl; 00494 else mout << scales.size() << " input resolutions: " << scales; 00495 mout << endl; 00496 if (scales.size() == 0) 00497 eblthrow("0 resolutions to compute in " << frame_name); 00498 } 00499 00500 template <typename T, class Tstate> 00501 void detector<T,Tstate>:: 00502 compute_resolutions(midxdim &scales, 00503 idxdim &mindim, idxdim &maxdim, uint nscales) { 00504 scales.clear(); 00505 if (nscales == 0) 00506 eblerror("expected at least 1 scale but found " << nscales); 00507 // nscales must be less than the minimum pixel distance between min and max 00508 uint max_res = std::min(maxdim.dim(1) - mindim.dim(1), 00509 maxdim.dim(2) - mindim.dim(2)); 00510 if (nscales > max_res) { 00511 merr << "warning: the number of resolutions requested ("; 00512 merr << nscales << ") is more than"; 00513 merr << " the minimum distance between minimum and maximum possible"; 00514 merr << " resolutions. (min: " << mindim << " max: " << maxdim; 00515 if (mindim == maxdim) 00516 nscales = 1; 00517 else 00518 nscales = 2; 00519 merr << ") setting it to " << nscales << endl; 00520 } 00521 // only 1 scale if min == max or if only 1 scale requested. 00522 if ((mindim == maxdim) || (nscales == 1)) 00523 scales.push_back(maxdim); 00524 else if (nscales == 2) { // 2 resolutions: min and max 00525 scales.push_back(mindim); 00526 scales.push_back(maxdim); 00527 } else { // multiple scales: interpolate between min and max 00528 // compute the step factor: x = e^(log(max/min)/(nres-1)) 00529 double fact = MIN(maxdim.dim(1) / (double) mindim.dim(1), 00530 maxdim.dim(2) / (double) mindim.dim(2)); 00531 double step = exp(log(fact)/(nscales - 1)); 00532 double f; 00533 uint i; 00534 for (f = step, i = 1; i < nscales; ++i, f *= step) { 00535 idxdim d = maxdim * (1 / f); 00536 d.setdim(0, maxdim.dim(0)); // do not scale feature dimension 00537 scales.push_back(d); 00538 } 00539 scales.push_back(maxdim); 00540 } 00541 } 00542 00543 template <typename T, class Tstate> 00544 void detector<T,Tstate>:: 00545 compute_resolutions(midxdim &scales, 00546 idxdim &indims, vector<double> &scale_factors) { 00547 scales.clear(); 00548 if (scale_factors.size() == 0) 00549 eblerror("expected at least 1 scale factor but found " 00550 << scale_factors.size()); 00551 // compute scales 00552 for (uint i = 0; i < scale_factors.size(); ++i) { 00553 idxdim d = indims * scale_factors[i]; 00554 d.setdim(0, indims.dim(0)); // do not scale feature dimension 00555 scales.push_back(d); 00556 } 00557 } 00558 00559 template <typename T, class Tstate> 00560 void detector<T,Tstate>:: 00561 compute_resolutions(midxdim &scales, idxdim &mindim, idxdim &maxdim, 00562 double scales_step) { 00563 scales.clear(); 00564 double factor = 1 / scales_step; 00565 // take steps from max scale until reaching min scale 00566 idxdim d = maxdim; 00567 scales.push_back(d); 00568 d = d * factor; 00569 d.setdim(0, maxdim.dim(0)); // do not scale feature dimension 00570 while (d >= mindim) { 00571 scales.push_back(d); 00572 d = d * factor; 00573 d.setdim(0, maxdim.dim(0)); // do not scale feature dimension 00574 } 00575 } 00576 00577 template <typename T, class Tstate> 00578 void detector<T,Tstate>:: 00579 compute_resolutions_up(midxdim &scales, idxdim &indim, idxdim &mindim, 00580 idxdim &maxdim, double scales_step) { 00581 scales.clear(); 00582 double factor = std::max(mindim.dim(1) / (double) indim.dim(1), 00583 mindim.dim(2) / (double) indim.dim(2)); 00584 idxdim d = indim * factor; 00585 d.setdim(0, maxdim.dim(0)); // do not scale feature dimension 00586 while (d <= maxdim) { 00587 d.set_max(mindim); // make sure each dimension is bigger than mindim 00588 scales.push_front_new(d); 00589 factor *= scales_step; 00590 d = indim * factor; 00591 d.setdim(0, maxdim.dim(0)); // do not scale feature dimension 00592 } 00593 } 00594 00596 // outputs smoothing 00597 00598 template <typename T, class Tstate> 00599 void detector<T,Tstate>::set_smoothing(uint type) { 00600 smoothing_type = type; 00601 idx<T> ker; 00602 switch (smoothing_type) { 00603 case 0: mout << "Outputs smoothing disabled." << endl; break ; 00604 case 1: 00605 ker = idx<T>(3, 3); 00606 ker.set(.3, 0, 0); 00607 ker.set(.5, 0, 1); 00608 ker.set(.3, 0, 2); 00609 ker.set(.5, 1, 0); 00610 ker.set(1 , 1, 1); 00611 ker.set(.5, 1, 2); 00612 ker.set(.3, 2, 0); 00613 ker.set(.5, 2, 1); 00614 ker.set(.3, 2, 2); 00615 idx_dotc(ker, (T) (1 / (double) idx_sum(ker)), ker); 00616 smoothing_kernel = ker; 00617 mout << "Smoothing outputs with kernel: " << endl; 00618 smoothing_kernel.printElems(); 00619 break ; 00620 default: 00621 eblerror("Unknown smoothing type " << type); 00622 } 00623 } 00624 00625 template <typename T, class Tstate> 00626 void detector<T,Tstate>::smooth_outputs() { 00627 if (smoothing_type != 0) { 00628 eblerror("smoothing temporarly broken"); 00629 // FIXME! (outputs is no longer a single output) 00630 // uint hpad = (uint) (smoothing_kernel.dim(0) / 2); 00631 // uint wpad = (uint) (smoothing_kernel.dim(1) / 2); 00632 // for (uint i = 0; i < outputs.size(); ++i) { 00633 // idx<T> &outx = outputs[i]->x; 00634 // intg h = outx.dim(1), w = outx.dim(2); 00635 // idx<T> in(h + 2 * hpad, w + 2 * wpad); 00636 // idx<T> inc = in.narrow(0, h, hpad); 00637 // inc = inc.narrow(1, w, wpad); 00638 // idx_clear(in); 00639 // idx_bloop1(out, outx, T) { 00640 // idx_copy(out, inc); 00641 // idx_2dconvol(in, smoothing_kernel, out); 00642 // } 00643 // } 00644 } 00645 } 00646 00647 // template <typename T, class Tstate> 00648 // void detector<T,Tstate>::extract_bboxes(T threshold, bboxes &bbs) { 00649 // bbox::init_instance_id(); // reset unique ids to start from zero. 00650 // // make a list that contains the results 00651 // double original_h = indim.dim(1); 00652 // double original_w = indim.dim(2); 00653 // intg offset_h = 0, offset_w = 0; 00654 // int scale_index = 0; 00655 // for (uint i = 0; i < ppinputs.size(); ++i) { 00656 // bboxes bbtmp; 00657 // // select elements 00658 // Tstate &input = (*(ppinputs[i]))[0]; 00659 // Tstate &output = *(outputs[i]); 00660 // rect<int> &robbox = original_bboxes[i]; 00661 // // sizes 00662 // double in_h = (double) input.x.dim(1); 00663 // double in_w = (double) input.x.dim(2); 00664 // double out_h = (double) output.x.dim(1); 00665 // double out_w = (double) output.x.dim(2); 00666 // double neth = netdim.dim(1); // network's input height 00667 // double netw = netdim.dim(2); // network's input width 00668 // double scalehi = original_h / robbox.height; // input to original 00669 // double scalewi = original_w / robbox.width; // input to original 00670 // int image_h0 = (int) (robbox.h0 * scalehi); 00671 // int image_w0 = (int) (robbox.w0 * scalewi); 00672 // // offset factor in input map 00673 // double offset_h_factor = (in_h - neth) / std::max((double)1, (out_h - 1)); 00674 // double offset_w_factor = (in_w - netw) / std::max((double)1, (out_w - 1)); 00675 // offset_w = 0; 00676 // Tstate out(output.x.get_idxdim()); 00677 // answer.fprop(output, out); 00678 // // loop on width 00679 // idx_eloop1(ro, out.x, T) { 00680 // offset_h = 0; 00681 // // loop on height 00682 // idx_eloop1(roo, ro, T) { 00683 // int classid = (int) roo.get(0); 00684 // float conf = (float) roo.get(1); 00685 // // if ((offset_h == out_h - 1 || (int)(offset_h) % 3 == 0) 00686 // // && (offset_w == out_w - 1 || (int)(offset_w) % 3 == 0)) { 00687 // // if (true) { 00688 // if (conf >= threshold && classid != bgclass) { 00689 // bbox bb; 00690 // bb.class_id = classid; // Class 00691 // bb.confidence = conf; // Confidence 00692 // bb.scale_index = scale_index; // scale index 00693 // // predicted offsets / scale 00694 // float hoff = 0, woff = 0, scale = 1.0; 00695 // if (scaler_mode) { 00696 // scale = (float) roo.gget(2); 00697 // if (roo.dim(0) == 5) { // class,conf,scale,h,w 00698 // hoff = roo.gget(3) * neth; 00699 // woff = roo.gget(4) * neth; 00700 // } 00701 // // cap scale 00702 // scale = std::max(min_scale_pred, std::min(max_scale_pred, scale)); 00703 // scale = 1 / scale; 00704 // } 00705 // EDEBUG(roo.str()); 00706 // // original box in input map 00707 // bb.iheight = (int) in_h; // input h 00708 // bb.iwidth = (int) in_w; // input w 00709 // bb.i0.h0 = (float) (offset_h * offset_h_factor); 00710 // bb.i0.w0 = (float) (offset_w * offset_w_factor); 00711 // bb.i0.height = (float) neth; 00712 // bb.i0.width = (float) netw; 00713 // // output map 00714 // bb.oheight = (int) out_h; // output height 00715 // bb.owidth = (int) out_w; // output width 00716 // bb.o.h0 = offset_h; // answer height in output 00717 // bb.o.w0 = offset_w; // answer height in output 00718 // // bb.o.h0 = 0; 00719 // // bb.o.w0 = 0; 00720 // // bb.o.h0 = out_h - 1; 00721 // // bb.o.w0 = out_w - 1; 00722 // bb.o.height = 1; 00723 // bb.o.width = 1; 00724 00725 // // transformed box in input map 00726 // bb.i.h0 = bb.i0.h0 + hoff; 00727 // bb.i.w0 = bb.i0.w0 + woff; 00728 // bb.i.height = bb.i0.height; 00729 // bb.i.width = bb.i0.width; 00730 // if (scale != 1.0) 00731 // bb.i.scale_centered(scale, scale); 00732 00733 // // infer original location through network 00734 // idxdim d(1, bb.o.height, bb.o.width); 00735 // d.setoffset(1, bb.o.h0); 00736 // d.setoffset(2, bb.o.w0); 00737 // mfidxdim md(d); 00738 // mfidxdim d2 = thenet.bprop_size(md); 00739 // fidxdim loc = d2[0]; 00740 // bb.i.h0 = loc.offset(1); 00741 // bb.i.w0 = loc.offset(2); 00742 // bb.i.height = loc.dim(1); 00743 // bb.i.width = loc.dim(2); 00744 00745 // // add all input boxes 00746 // for (uint q = 0; q < d2.size(); ++q) 00747 // bb.mi.push_back(rect<float>(d2[q].offset(1), d2[q].offset(2), 00748 // d2[q].dim(1), d2[q].dim(2))); 00749 00750 // // bb.h0 = loc.offset(1) * scalehi; 00751 // // bb.w0 = loc.offset(2) * scalewi; 00752 // // bb.height = loc.dim(1) * scalehi; 00753 // // bb.width = loc.dim(2) * scalewi; 00754 00755 00756 // // original image 00757 // // bbox's rectangle in original image 00758 // // bb.h0 = bb.i.h0 * scalehi; 00759 // // bb.w0 = bb.i.w0 * scalewi; 00760 // bb.h0 = bb.i.h0 * scalehi - image_h0; 00761 // bb.w0 = bb.i.w0 * scalewi - image_w0; 00762 // bb.height = bb.i.height * scalehi; 00763 // bb.width = bb.i.width * scalewi; 00764 // // push bbox to list 00765 // bbtmp.push_back(new bbox(bb)); 00766 // } 00767 // offset_h++; 00768 // } 00769 // offset_w++; 00770 // } 00771 // // add scale boxes into all boxes 00772 // for (uint k = 0; k < bbtmp.size(); ++k) 00773 // bbs.push_back(bbtmp[k]); 00774 // scale_index++; 00775 // } 00776 // } 00777 00778 template <typename T, class Tstate> 00779 void detector<T,Tstate>::update_merge_alignment() { 00780 // check presence of merging module 00781 flat_merge_module<T,Tstate> *merger = NULL; 00782 vector<flat_merge_module<T,Tstate>*> mergers = 00783 arch_find_all(&thenet, merger); 00784 if (mergers.size() > 0) { 00785 mout << "Found merging module(s) in network: " << mergers << endl; 00786 for (uint i = 0; i < mergers.size(); ++i) 00787 mout << mergers[i]->describe()<< endl; 00788 } else { 00789 mout << "No merging module found in network." << endl; 00790 return ; 00791 } 00792 // align for each merger module 00793 for (uint i = 0; i < mergers.size(); ++i) { 00794 merger = mergers[i]; 00795 // get the network narrowed up to the merger module (included) 00796 module_1_1<T,Tstate> *merger_net_included = arch_narrow(&thenet, merger); 00797 module_1_1<T,Tstate> *merger_net = arch_narrow(&thenet, merger, false); 00798 if (!merger_net || !merger_net_included) 00799 eblerror("failed to narrow network up to " << merger); 00800 EDEBUG("network narrowed up to merger module: " << merger->name()); 00801 mout << "Aligning merging centers on top left image origin." << endl; 00802 // for (uint i = 0; i < merger->get_ninputs(); ++i) { 00803 fidxdim c(1, 1, 1), f(1, 1, 1), c0, c1; 00804 mfidxdim m(c), m0, m0m, m1, paddings; //(merger->get_ninputs()); 00805 // determine input size and location of output pixel (0,0) 00806 mfidxdim mf(f); 00807 mf = resizepp->fprop_size(mf); 00808 merger_net_included->fprop_size(mf); 00809 m0m = merger->bprop_size(m); 00810 EDEBUG(merger_net->name() << " m0m: " << m0m); 00811 mfidxdim scales = merger->get_scales(); 00812 // EDEBUG("strides: " << strides); 00813 vector<vector<int> > alloff; 00814 mfidxdim allstrides; 00815 float hs0 = 1, ws0 = 1; 00816 for (uint k = 0; k < m0m.size(); ++k) { 00817 //uint i = k - (k % 2); 00818 uint i = k; 00819 mfidxdim mm(m0m.size()); 00820 mm.set_new(m0m[i], i); 00821 EDEBUG("mm: " << mm); 00822 // determine input size and location of output pixel (0,0) 00823 //merger_net_included->fprop_size(mf); 00824 m0 = merger_net->bprop_size(mm); 00825 // m0 = resizepp->bprop_size(m0); 00826 m0.remove_empty(); 00827 // determine input size and location of output pixel (1,1) 00828 mm[i].setoffset(1, 1); 00829 mm[i].setoffset(2, 1); 00830 //merger_net_included->fprop_size(mf); 00831 m1 = merger_net->bprop_size(mm); 00832 // m1 = resizepp->bprop_size(m1); 00833 m1.remove_empty(); 00834 EDEBUG("m0: " << m0); 00835 EDEBUG("m1: " << m1); 00836 00837 00838 // uint fact = (uint) ceil(strides.size() / (float) m0.size()); 00839 // c0 = m0[i / fact]; 00840 // c1 = m1[i / fact]; 00841 c0 = m0[0]; 00842 c1 = m1[0]; 00843 //fidxdim &stride = strides[i]; 00844 // determine center of output pixel (0,0) in input space 00845 rect<float> p0(c0.offset(1), c0.offset(2), c0.dim(1), c0.dim(2)); 00846 float hc = p0.hcenter(), wc = p0.wcenter(); 00847 00848 // // determine input pixel (0,0) in output space 00849 // fidxdim i0(1, 1) 00850 00851 00852 // if (hc < 0) { 00853 // eblwarn("expected center's height to be >= 0 but got " << hc); 00854 // hc = 1; 00855 // } 00856 // if (wc < 0) { 00857 // eblwarn("expected center's width to be >= 0 but got " << wc); 00858 // wc = 1; 00859 // } 00860 // determine stride of output space in input space 00861 float hs = (c1.offset(1) - c0.offset(1));// / scales[i].dim(0); 00862 float ws = (c1.offset(2) - c0.offset(2));// / scales[i].dim(0); 00863 // if (k == 0) { 00864 // hs = hs0 / (scales[i].dim(0)); 00865 // ws = ws0 / (scales[i].dim(0)); 00866 // hs0 = hs; 00867 // ws0 = ws; 00868 // } else { 00869 // hs = hs0 / (scales[i].dim(0) * hs); 00870 // ws = ws0 / (scales[i].dim(0) * ws); 00871 // } 00872 00873 if (k == 0) { 00874 hs0 = hs; 00875 ws0 = ws; 00876 } 00877 float hos = hs0 / (scales[i].dim(0) * hs); 00878 float wos = ws0 / (scales[i].dim(0) * ws); 00879 fidxdim fi(hos, wos); 00880 allstrides.push_back_new(fi); 00881 00882 // set paddings of merger 00883 //fidxdim pads(hc * hs, wc * ws, hc * hs, wc * ws); 00884 00885 vector<int> offs; 00886 00887 offs.push_back((int)(hc/hs)); 00888 offs.push_back((int)(wc/ws)); 00889 offs.push_back((int)(hc/hs)); 00890 offs.push_back((int)(wc/ws)); 00891 00892 // offs.push_back((int)(hc*hos*scales[i].dim(0))); 00893 // offs.push_back((int)(wc*wos*scales[i].dim(0))); 00894 // offs.push_back((int)(hc*hos*scales[i].dim(0))); 00895 // offs.push_back((int)(wc*wos*scales[i].dim(0))); 00896 00897 // offs.push_back((int)(hc*hos/hs0)); 00898 // offs.push_back((int)(wc*wos/ws0)); 00899 // offs.push_back((int)(hc*hos/hs0)); 00900 // offs.push_back((int)(wc*wos/ws0)); 00901 alloff.push_back(offs); 00902 // fidxdim pads(stride.dim(0) * hc / hs, stride.dim(1) * wc / ws, 00903 // stride.dim(0) * hc / hs, stride.dim(1) * wc / ws); 00904 // fidxdim pads(stride.dim(0) * hc / hs, stride.dim(1) * wc / ws, 0, 0); 00905 // paddings.push_back_new(pads); 00906 mout << merger->name() << "'s input " << i << " must be padded/narrowed with " 00907 << offs << " to recenter " << p0 << " (center " << hc << "x" << wc 00908 << "), (output stride is " << hs << "x" << ws << ")" << std::endl; 00909 } 00910 merger->set_offsets(alloff); 00911 merger->set_strides(allstrides); 00912 } 00913 } 00914 00915 template <typename T, class Tstate> 00916 void detector<T,Tstate>::get_corners(mstate<Tstate> &outputs) { 00917 if (!corners_infered) { 00918 if (corners_inference == 0 || corners_inference == 1) { // infer from net 00919 uint n = 0; 00920 scale_indices.clear(); 00921 for (typename mstate<Tstate>::iterator o = outputs.begin(); 00922 o != outputs.end(); ++o) { 00923 fidxdim d(o->x.get_idxdim()); 00924 fidxdim c(1, 1, 1), mc0; 00925 mfidxdim mc(outputs.size()); 00926 mc.set_new(c, n); 00927 mfidxdim m; 00928 // top left 00929 m = thenet.bprop_size(mc); 00930 m.remove_empty(); 00931 mc0 = m[0]; 00932 itl.push_back_new(mc0); 00933 m = resizepp->get_msize(); 00934 // infer scale index for this output 00935 for (uint i = 0; i < m.size(); ++i) 00936 if (m.exists(i)) { 00937 scale_indices.push_back(i); 00938 break ; 00939 } 00940 m.remove_empty(); 00941 mc0 = m[0]; 00942 pptl.push_back_new(mc0); 00943 // top right 00944 mc[n].setoffset(2, d.dim(2)); 00945 m = thenet.bprop_size(mc); 00946 m.remove_empty(); 00947 mc0 = m[0]; 00948 itr.push_back_new(mc0); 00949 m = resizepp->get_msize(); 00950 m.remove_empty(); 00951 mc0 = m[0]; 00952 pptr.push_back_new(mc0); 00953 // bottom left 00954 mc[n].setoffset(1, d.dim(1)); 00955 mc[n].setoffset(2, 0); 00956 m = thenet.bprop_size(mc); 00957 m.remove_empty(); 00958 mc0 = m[0]; 00959 ibl.push_back_new(mc0); 00960 m = resizepp->get_msize(); 00961 m.remove_empty(); 00962 mc0 = m[0]; 00963 ppbl.push_back_new(mc0); 00964 // bottom right 00965 mc[n].setoffset(1, d.dim(1)); 00966 mc[n].setoffset(2, d.dim(2)); 00967 m = thenet.bprop_size(mc); 00968 m.remove_empty(); 00969 mc0 = m[0]; 00970 ibr.push_back_new(mc0); 00971 m = resizepp->get_msize(); 00972 m.remove_empty(); 00973 mc0 = m[0]; 00974 ppbr.push_back_new(mc0); 00975 ++n; 00976 } 00977 EDEBUG("top left output " << itl); 00978 EDEBUG("top right output " << itr); 00979 EDEBUG("bottom left output " << ibl); 00980 EDEBUG("bottom right output " << ibr); 00981 00982 if (corners_inference == 1) { // from net + save corners 00983 // save corners to matrix 00984 idx<float> scorners(itl.size(), 4, 4); 00985 for (uint i = 0; i < itl.size(); ++i) { 00986 scorners.set(itl[i].offset(1), i, 0, 0); 00987 scorners.set(itl[i].offset(2), i, 0, 1); 00988 scorners.set(itl[i].dim(1), i, 0, 2); 00989 scorners.set(itl[i].dim(2), i, 0, 3); 00990 scorners.set(itr[i].offset(1), i, 1, 0); 00991 scorners.set(itr[i].offset(2), i, 1, 1); 00992 scorners.set(itr[i].dim(1), i, 1, 2); 00993 scorners.set(itr[i].dim(2), i, 1, 3); 00994 scorners.set(ibl[i].offset(1), i, 2, 0); 00995 scorners.set(ibl[i].offset(2), i, 2, 1); 00996 scorners.set(ibl[i].dim(1), i, 2, 2); 00997 scorners.set(ibl[i].dim(2), i, 2, 3); 00998 scorners.set(ibr[i].offset(1), i, 3, 0); 00999 scorners.set(ibr[i].offset(2), i, 3, 1); 01000 scorners.set(ibr[i].dim(1), i, 3, 2); 01001 scorners.set(ibr[i].dim(2), i, 3, 3); 01002 } 01003 save_matrix(scorners, "corners.mat"); 01004 } 01005 corners_infered = true; 01006 } else if (corners_inference == 2) { // load corners 01007 // load corners from matrix 01008 idx<float> corners = load_matrix<float>("corners.mat"); 01009 itl.clear(); itr.clear(); ibl.clear(); ibr.clear(); 01010 for (uint i = 0; i < corners.dim(0); ++i) { 01011 // allocate 01012 fidxdim d(outputs[0].x.get_idxdim()); 01013 d.setdims(1); 01014 itl.push_back_new(d); 01015 itr.push_back_new(d); 01016 ibl.push_back_new(d); 01017 ibr.push_back_new(d); 01018 // set 01019 itl[i].setoffset(1, corners.get(i, 0, 0)); 01020 itl[i].setoffset(2, corners.get(i, 0, 1)); 01021 itl[i].setdim(1, corners.get(i, 0, 2)); 01022 itl[i].setdim(2, corners.get(i, 0, 3)); 01023 itr[i].setoffset(1, corners.get(i, 1, 0)); 01024 itr[i].setoffset(2, corners.get(i, 1, 1)); 01025 itr[i].setdim(1, corners.get(i, 1, 2)); 01026 itr[i].setdim(2, corners.get(i, 1, 3)); 01027 ibl[i].setoffset(1, corners.get(i, 2, 0)); 01028 ibl[i].setoffset(2, corners.get(i, 2, 1)); 01029 ibl[i].setdim(1, corners.get(i, 2, 2)); 01030 ibl[i].setdim(2, corners.get(i, 2, 3)); 01031 ibr[i].setoffset(1, corners.get(i, 3, 0)); 01032 ibr[i].setoffset(2, corners.get(i, 3, 1)); 01033 ibr[i].setdim(1, corners.get(i, 3, 2)); 01034 ibr[i].setdim(2, corners.get(i, 3, 3)); 01035 } 01036 corners_infered = true; 01037 } 01038 } 01039 } 01040 01041 template <typename T, class Tstate> 01042 void detector<T,Tstate>::extract_bboxes(T threshold, bboxes &bbs) { 01043 bbox::init_instance_id(); // reset unique ids to start from zero. 01044 // make a list that contains the results 01045 double original_h = indim.dim(1); 01046 double original_w = indim.dim(2); 01047 intg offset_h = 0, offset_w = 0; 01048 int scale_index = 0; 01049 // get 4 corners coordinates for each scale 01050 mstate<Tstate> &oo = outputs[0]; 01051 answers.clear(); 01052 get_corners(oo); 01053 01054 // loop on output 01055 for (uint o = 0; o < oo.size(); ++o) { 01056 if (o < raw_thresholds.size()) threshold = raw_thresholds[o]; 01057 float thresh = threshold; 01058 // Tstate &input = ppinputs[0][0]; 01059 Tstate &output = oo[o]; 01060 idx<T> outx = output.x; 01061 fidxdim &tl = itl[o], &tr = itr[o], &bl = ibl[o]; 01062 fidxdim &ptl = pptl[o], &ptr = pptr[o], &pbl = ppbl[o]; 01063 // fidxdim &br = ibr[o]; 01064 01065 // steps in input space 01066 double hf = (bl.offset(1) - tl.offset(1)) / outx.dim(1); 01067 double wf = (tr.offset(2) - tl.offset(2)) / outx.dim(2); 01068 // steps in preprocessed space 01069 double phf = (pbl.offset(1) - ptl.offset(1)) / outx.dim(1); 01070 double pwf = (ptr.offset(2) - ptl.offset(2)) / outx.dim(2); 01071 01072 // box scalings 01073 double hscaling = 1.0, wscaling = 1.0; 01074 if (o < bbox_scalings.size()) { 01075 fidxdim &scaling = bbox_scalings[o]; 01076 hscaling = scaling.dim(0); 01077 wscaling = scaling.dim(1); 01078 } 01079 01080 bboxes bbtmp; 01081 // select elements 01082 // rect<int> &robbox = original_bboxes[0]; 01083 // sizes 01084 // double in_h = (double) input.x.dim(1); 01085 // double in_w = (double) input.x.dim(2); 01086 // double out_h = (double) output.x.dim(1); 01087 // double out_w = (double) output.x.dim(2); 01088 // double neth = netdim.dim(1); // network's input height 01089 // double netw = netdim.dim(2); // network's input width 01090 // double scalehi = original_h / robbox.height; // input to original 01091 // double scalewi = original_w / robbox.width; // input to original 01092 // int image_h0 = (int) (robbox.h0 * scalehi); 01093 // int image_w0 = (int) (robbox.w0 * scalewi); 01094 // offset factor in input map 01095 // double offset_h_factor = (in_h - neth) / std::max((double)1, (out_h - 1)); 01096 // double offset_w_factor = (in_w - netw) / std::max((double)1, (out_w - 1)); 01097 offset_w = 0; 01098 Tstate out(outx.get_idxdim()); 01099 answer->fprop(output, out); 01100 answers.push_back_new(out); 01101 01102 idx<T> tmp = outx.select(0, 1); 01103 cout << "out " << o << " threshold " << thresh << " min " << idx_min(tmp) 01104 << " max " << idx_max(tmp) << endl; 01105 01106 // loop on width 01107 idx_eloop1(ro, out.x, T) { 01108 offset_h = 0; 01109 // loop on height 01110 idx_eloop1(roo, ro, T) { 01111 int classid = (int) roo.get(0); 01112 float conf = (float) roo.get(1); 01113 bool accept = false; 01114 // select decision criterion 01115 switch (bbox_decision) { 01116 case 0: accept = (conf >= thresh && classid != bgclass); break ; 01117 case 1: accept = ((offset_h == outx.dim(1) - 1 && offset_w == 0) || 01118 (offset_h == 0 && offset_w == 0) || 01119 (offset_h == outx.dim(1) - 1 01120 && offset_w == outx.dim(2) - 1) || 01121 (offset_h == 0 && offset_w == outx.dim(2) - 0)); 01122 break; 01123 case 2: accept = ((offset_h == outx.dim(1) - 1 01124 && offset_w == outx.dim(2) - 1)); 01125 break; 01126 default: eblerror("unknown bbox decision type"); 01127 } 01128 if (accept) { 01129 bbox bb; 01130 bb.class_id = classid; // Class 01131 bb.confidence = conf; // Confidence 01132 bb.iscale_index = scale_indices[scale_index]; // scale index 01133 bb.oscale_index = scale_index; // scale index 01134 01135 bb.h0 = tl.offset(1) + offset_h * hf; 01136 bb.w0 = tl.offset(2) + offset_w * wf; 01137 bb.height = tl.dim(1); 01138 bb.width = tl.dim(2); 01139 bb.scale_centered(hscaling, wscaling); 01140 01141 bb.i.h0 = ptl.offset(1) + offset_h * phf; 01142 bb.i.w0 = ptl.offset(2) + offset_w * pwf; 01143 bb.i.height = ptl.dim(1); 01144 bb.i.width = ptl.dim(2); 01145 01146 // // predicted offsets / scale 01147 // float hoff = 0, woff = 0, scale = 1.0; 01148 // if (scaler_mode) { 01149 // scale = (float) roo.gget(2); 01150 // if (roo.dim(0) == 5) { // class,conf,scale,h,w 01151 // hoff = roo.gget(3) * neth; 01152 // woff = roo.gget(4) * neth; 01153 // } 01154 // // cap scale 01155 // scale = std::max(min_scale_pred, std::min(max_scale_pred, scale)); 01156 // scale = 1 / scale; 01157 // } 01158 // EDEBUG(roo.str()); 01159 // // original box in input map 01160 // bb.iheight = (int) in_h; // input h 01161 // bb.iwidth = (int) in_w; // input w 01162 // bb.i0.h0 = (float) (offset_h * offset_h_factor); 01163 // bb.i0.w0 = (float) (offset_w * offset_w_factor); 01164 // bb.i0.height = (float) neth; 01165 // bb.i0.width = (float) netw; 01166 // output map 01167 // bb.oheight = (int) out_h; // output height 01168 // bb.owidth = (int) out_w; // output width 01169 bb.o.h0 = offset_h; // answer height in output 01170 bb.o.w0 = offset_w; // answer height in output 01171 bb.o.height = 1; 01172 bb.o.width = 1; 01173 // // bb.o.h0 = 0; 01174 // // bb.o.w0 = 0; 01175 // // bb.o.h0 = out_h - 1; 01176 // // bb.o.w0 = out_w - 1; 01177 01178 // // transformed box in input map 01179 // bb.i.h0 = bb.i0.h0 + hoff; 01180 // bb.i.w0 = bb.i0.w0 + woff; 01181 // bb.i.height = bb.i0.height; 01182 // bb.i.width = bb.i0.width; 01183 // if (scale != 1.0) 01184 // bb.i.scale_centered(scale, scale); 01185 01186 // // infer original location through network 01187 // idxdim d(1, bb.o.height, bb.o.width); 01188 // d.setoffset(1, bb.o.h0); 01189 // d.setoffset(2, bb.o.w0); 01190 // mfidxdim md(d); 01191 // mfidxdim d2 = thenet.bprop_size(md); 01192 // fidxdim loc = d2[0]; 01193 // bb.i.h0 = loc.offset(1); 01194 // bb.i.w0 = loc.offset(2); 01195 // bb.i.height = loc.dim(1); 01196 // bb.i.width = loc.dim(2); 01197 01198 // // add all input boxes 01199 // for (uint q = 0; q < d2.size(); ++q) 01200 // bb.mi.push_back(rect<float>(d2[q].offset(1), d2[q].offset(2), 01201 // d2[q].dim(1), d2[q].dim(2))); 01202 01203 // // bb.h0 = loc.offset(1) * scalehi; 01204 // // bb.w0 = loc.offset(2) * scalewi; 01205 // // bb.height = loc.dim(1) * scalehi; 01206 // // bb.width = loc.dim(2) * scalewi; 01207 01208 01209 // // original image 01210 // // bbox's rectangle in original image 01211 // // bb.h0 = bb.i.h0 * scalehi; 01212 // // bb.w0 = bb.i.w0 * scalewi; 01213 // bb.h0 = bb.i.h0 * scalehi - image_h0; 01214 // bb.w0 = bb.i.w0 * scalewi - image_w0; 01215 // bb.height = bb.i.height * scalehi; 01216 // bb.width = bb.i.width * scalewi; 01217 01218 bool ignore = false; 01219 if (ignore_outsiders) { // ignore boxes that overlap outside 01220 if (bb.h0 < 0 || bb.w0 < 0 01221 || bb.h0 + bb.height > original_h 01222 || bb.w0 + bb.width > original_w) 01223 ignore = true; 01224 } 01225 01226 // push bbox to list 01227 if (!ignore) 01228 bbtmp.push_back(new bbox(bb)); 01229 } 01230 offset_h++; 01231 } 01232 offset_w++; 01233 } 01234 // add scale boxes into all boxes 01235 for (uint k = 0; k < bbtmp.size(); ++k) 01236 bbs.push_back(bbtmp[k]); 01237 scale_index++; 01238 } 01239 } 01240 01241 template <typename T, class Tstate> template <class Tin> 01242 bboxes& detector<T,Tstate>::fprop(idx<Tin> &img, const char *frame_name) { 01243 TIMING1("t1 before prepare"); 01244 TIMING2("t2 before prepare"); 01245 TIMING_RESIZING_RESET(); 01246 // prepare image and resolutions 01247 prepare(img, frame_name); 01248 // do a fprop for each scaled input, based on the 'image' slot prepared 01249 // by prepare(). 01250 TIMING2("preparation"); 01251 multi_res_fprop(); 01252 TIMING2("net fprop"); 01253 TIMING1("end of network"); 01254 TIMING_RESIZING("total resizing time"); 01255 // smooth outputs 01256 smooth_outputs(); 01257 01258 if (bboxes_off) // do not extract bboxes if off flag is true 01259 return raw_bboxes; 01260 // clear previous bounding boxes 01261 raw_bboxes.clear(); 01262 // get new bboxes 01263 if (answer) extract_bboxes(pre_threshold, raw_bboxes); 01264 // sort bboxes by confidence (most confident first) 01265 raw_bboxes.sort_by_confidence(); 01266 TIMING1("extract bboxes"); 01267 // non-maximum suppression 01268 fprop_nms(raw_bboxes, pruned_bboxes); 01269 // print results 01270 if (!silent) mout << "found " << pruned_bboxes.pretty(&labels); 01271 // save positive response input windows in save mode 01272 if (save_mode) 01273 save_bboxes(pruned_bboxes, save_dir, frame_name); 01274 // backward connections 01275 back_module<T, Tstate>* back = (back_module<T, Tstate>*)((layers<T,Tstate>&)thenet).find("back"); 01276 if (back) { 01277 back->bb(pruned_bboxes); 01278 } 01279 TIMING1("end bboxes"); 01280 // return bounding boxes 01281 TIMING2("post proc"); 01282 return pruned_bboxes; 01283 } 01284 01285 template <typename T, class Tstate> 01286 void detector<T,Tstate>::fprop_nms(bboxes &in, bboxes &out) { 01287 if (pnms) pnms->fprop(in, out); 01288 else out = in; 01289 } 01290 01291 // bboxes operations ///////////////////////////////////////////////////////// 01292 01293 template <typename T, class Tstate> 01294 void detector<T,Tstate>:: 01295 save_bboxes(bboxes &boxes, const string &dir, const char *frame_name) { 01296 bboxes bbs = boxes; 01297 #ifdef __NOSTL__ 01298 eblerror("save_bboxes not implemented"); 01299 #else 01300 ostringstream fname, cmd; 01301 midx<T> inpp; 01302 idx<T> inorig; 01303 vector<bool> dir_exists(labels.size(), false); 01304 string root = dir; 01305 root += "/"; 01306 vector<string> dir_pp(labels.size(), root.c_str()); 01307 vector<string> dir_orig(labels.size(), root.c_str()); 01308 01309 // initialize directory names 01310 for (uint i = 0; i < labels.size(); ++i) { 01311 dir_pp[i] += "preprocessed/"; 01312 dir_pp[i] += labels[i]; 01313 dir_pp[i] += "/"; 01314 dir_orig[i] += "original/"; 01315 dir_orig[i] += labels[i]; 01316 dir_orig[i] += "/"; 01317 } 01318 svector<midx<T> > &pp = get_preprocessed(bbs, save_max_per_frame, 01319 diverse_ordering); 01320 // loop on bounding boxes 01321 for (uint i = 0; i < pp.size(); ++i) { 01322 midx<T> &sample = pp[i]; 01323 const bbox &bb = bbs[i]; 01324 // check if directory exists for this class, otherwise create it 01325 if (!dir_exists[bb.class_id]) { 01326 mkdir_full(dir_pp[bb.class_id]); 01327 mkdir_full(dir_orig[bb.class_id]); 01328 dir_exists[bb.class_id] = true; 01329 } 01331 // preprocessed 01332 // make sure directory exists 01333 fname.str(""); 01334 fname << dir_pp[bb.class_id] 01335 << frame_name << "_" << labels[bb.class_id] << setw(3) 01336 << setfill('0') << save_counts[bb.class_id] << MATRIX_EXTENSION; 01337 string d1 = dirname(fname.str().c_str()); 01338 mkdir_full(d1); 01339 try { 01340 // save preprocessed image as lush mat 01341 if (save_matrices(sample, fname.str())) 01342 mout << "saved " << fname.str() << ": " << sample << " (confidence " 01343 << bb.confidence << ")" << endl; 01344 } catch(eblexception &e) {}; 01345 // /////////////////////////////////////////////////////////////////////// 01346 // // original 01347 // // get bbox of original input 01348 // if (bb.height + bb.h0 > image.dim(1) || 01349 // bb.width + bb.w0 > image.dim(2) || 01350 // bb.h0 < 0 || bb.w0 < 0) 01351 // merr << "warning: trying to crop bbox outside of image bounds: bbox " 01352 // << bb << " in image " << image << endl; 01353 // // make sure we don't try to crop outside of image bounds 01354 // float h = std::max((float)0, bb.h0), w = std::max((float)0, bb.w0); 01355 // float height = std::min((float) image.dim(0) - h, h + bb.height); 01356 // float width = std::min((float) image.dim(1) - w, h + bb.width); 01357 // if (height <= 0 || width <= 0 || 01358 // height + h <= 0 || height + h > image.dim(1) || 01359 // width + w <= 0 || width + w > image.dim(2)) { 01360 // merr << "warning: ignoring bbox original save out of bounds (" 01361 // << h << "," << w << ")" << height << "x" << width << endl; 01362 // } else { 01363 // inorig = image.narrow(1, (int) height, (int) h); 01364 // inorig = inorig.narrow(2, (int) width, (int) w); 01365 // inorig = inorig.shift_dim(0, 2); // put channels back to dimension 2 01366 // // save original image as png 01367 // fname.str(""); 01368 // fname << dir_orig[bb.class_id] << frame_name << "_" 01369 // << labels[bb.class_id] << setw(3) << setfill('0') 01370 // << save_counts[bb.class_id] << ".png"; 01371 // // make sure directory exists 01372 // string d2 = dirname(fname.str().c_str()); 01373 // mkdir_full(d2); 01374 // if (save_image(fname.str(), inorig, "png")) 01375 // mout << "saved " << fname.str() << ": " << inorig << " (confidence " 01376 // << bb.confidence << ")" << endl; 01377 // } 01378 // increment file counter 01379 save_counts[bb.class_id]++; 01380 } 01381 #endif 01382 } 01383 01384 template <typename T, class Tstate> 01385 void detector<T,Tstate>::add_class(const char *name) { 01386 if (!name) 01387 eblerror("cannot add empty class name"); 01388 mout << "Adding class " << name << endl; 01389 labels.push_back(name); 01390 mout << "New class list is: " << labels << endl; 01391 } 01392 01394 // saving methods 01395 01396 template <typename T, class Tstate> 01397 uint detector<T,Tstate>::get_total_saved() { 01398 uint total = 0; 01399 for (size_t i = 0; i < save_counts.size(); ++i) 01400 total += save_counts[i]; 01401 return total; 01402 } 01403 01404 template <typename T, class Tstate> 01405 string& detector<T,Tstate>::set_save(const string &directory, uint nmax, 01406 bool diverse) { 01407 save_mode = true; 01408 save_dir = directory; 01409 // save_dir += "_"; 01410 // save_dir += tstamp(); 01411 diverse_ordering = diverse; 01412 save_max_per_frame = nmax; 01413 mout << "Enabling saving of detected regions into: "; 01414 mout << save_dir << endl; 01415 mout << "Saving at most " << save_max_per_frame << " positive windows" 01416 << (diverse_ordering ? " and ordering them by diversity." : ".") 01417 << endl; 01418 return save_dir; 01419 } 01420 01421 template <typename T, class Tstate> 01422 vector<idx<T> >& detector<T,Tstate>::get_originals() { 01423 if (bodetections) // recompute only if not up-to-date 01424 return odetections; 01425 idx<T> input; 01426 size_t i; 01427 // clear vector 01428 odetections.clear(); 01429 // loop on bounding boxes 01430 for (i = 0; i < pruned_bboxes.size(); ++i) { 01431 bbox &bb = pruned_bboxes[i]; 01432 // exclude background class 01433 if ((bb.class_id == bgclass) || (bb.class_id == mask_class)) 01434 continue ; 01435 // check the box is not out of bounds 01436 if (bb.h0 < 0 || bb.w0 < 0 01437 || bb.h0 + bb.height > image.dim(1) 01438 || bb.w0 + bb.width > image.dim(2)) { 01439 merr << "warning: box " << bb << "is out of bounds in original image " 01440 << image << endl; 01441 continue ; 01442 } 01443 // get bbox of input 01444 input = image.narrow(1, (int) bb.height, (int) bb.h0); 01445 input = input.narrow(2, (int) bb.width, (int) bb.w0); 01446 //input = input.shift_dim(0, 2); // put channels back to dimension 2 01447 odetections.push_back(input); 01448 } 01449 bodetections = true; 01450 return odetections; 01451 } 01452 01453 template <typename T, class Tstate> 01454 midx<T> detector<T,Tstate>::get_preprocessed(const bbox &bb) { 01455 mstate<Tstate> &ins = ppinputs[0]; 01456 mstate<Tstate> &outs = outputs[0]; 01457 // get bbox of input given output bbox and its offsets 01458 idxdim d(1, 1, 1); //bb.oheight, bb.owidth); 01459 d.setoffset(1, bb.o.h0); 01460 d.setoffset(2, bb.o.w0); 01461 mfidxdim md; 01462 for (uint i = 0; i < outs.size(); ++i) { 01463 if (i == (uint) bb.oscale_index) md.push_back(d); 01464 else md.push_back_empty(); 01465 } 01466 mfidxdim d2 = thenet.bprop_size(md); 01467 EDEBUG("get_preprocessed: bprop_size of " << md << " -> " << d2 01468 << " from outputs " << outs << " to input " << ins); 01469 // get bboxes after the resizepp 01470 mfidxdim dims = resizepp->get_msize(); 01471 if (dims.size() != ins.size()) 01472 eblerror("expected same size dimensions and ins but got " 01473 << dims.size() << " and " << ins.size()); 01474 midx<T> all(1); 01475 ins.get_padded_midx(dims, all); 01476 return all; 01477 } 01478 01479 template <typename T, class Tstate> 01480 svector<midx<T> >& detector<T,Tstate>:: 01481 get_preprocessed(bboxes &out, uint nmax, bool diverse, uint pre_diverse_max) { 01482 return get_preprocessed(pruned_bboxes, out, nmax, diverse, pre_diverse_max); 01483 } 01484 01485 template <typename T, class Tstate> 01486 svector<midx<T> >& detector<T,Tstate>:: 01487 get_preprocessed(bboxes &in, bboxes &out, uint nmax, bool diverse, 01488 uint pre_diverse_max) { 01489 // if (bppdetections) // recompute only if not up-to-date 01490 // return ppdetections; 01491 idx<T> input; 01492 size_t i; 01493 size_t n = in.size(); 01494 // limit number of samples fed to diversity if enabled 01495 if (diverse && pre_diverse_max > 0) { 01496 if (nmax > 0) pre_diverse_max = std::max(pre_diverse_max, nmax); 01497 n = std::min((size_t) pre_diverse_max, n); 01498 } 01499 01500 // clear vector 01501 ppdetections.clear(); 01502 out.clear(); 01503 // loop on bounding boxes 01504 for (i = 0; i < n; ++i) { 01505 bbox &bb = in[i]; 01506 midx<T> all = get_preprocessed(bb); 01507 ppdetections.push_back_new(all); 01508 // outs.push_back(out); 01509 out.push_back(bb); 01510 } 01511 // diverse ordering 01512 if (diverse) out.sort_by_difference(ppdetections); 01513 // cap to n 01514 if (nmax > 0 && nmax < ppdetections.size()) { 01515 ppdetections.erase(ppdetections.begin() + nmax, ppdetections.end()); 01516 out.erase(out.begin() + nmax, out.end()); 01517 } 01518 // return 01519 bppdetections = true; 01520 return ppdetections; 01521 } 01522 01523 template <typename T, class Tstate> 01524 idx<T> detector<T,Tstate>::get_mask(string &classname) { 01525 int id = get_class_id(classname); 01526 idxdim d(image.dim(1), image.dim(2)); 01527 if (mask.get_idxdim() != d) 01528 mask = idx<T>(d); 01529 if (id == -1) { // class not found 01530 merr << "warning: unknown class " << classname << endl; 01531 idx_clear(mask); 01532 return mask; 01533 } 01534 eblerror("get_mask temporarly broken, outputs is now multi-state"); 01535 // // merge all outputs of class 'id' into mask 01536 // for (uint i = 0; i < ppinputs.size(); ++i) { 01537 // Tstate &ppin = (*ppinputs[i])[0]; 01538 // idx<T> in = ppin.x.select(0, 0); 01539 // idx<T> out = outputs[i]->x.select(0, id); 01540 // rect<int> ob = original_bboxes[i]; 01541 // // resizing to inputs, then to original input, to avoid precision loss 01542 // out = image_resize(out, in.dim(0), in.dim(1), 1); 01543 // out = out.narrow(0, ob.height, ob.h0); 01544 // out = out.narrow(1, ob.width, ob.w0); 01545 // out = image_resize(out, mask.dim(0), mask.dim(1), 1); 01546 // if (i++ == 0) 01547 // idx_copy(out, mask); 01548 // else 01549 // idx_max(mask, out, mask); 01550 // } 01551 return mask; 01552 } 01553 01555 // processing 01556 01557 template <typename T, class Tstate> template <class Tin> 01558 void detector<T,Tstate>::prepare(idx<Tin> &img, const char *frame_name) { 01559 // tell detections vectors they are not up-to-date anymore 01560 bodetections = false; 01561 bppdetections = false; 01562 // deep copy to cast input into net's type and move channels to 1st dim 01563 if (img.order() == 2) { // 1 channel only 01564 image = idx<T>(1, img.dim(0), img.dim(1)); 01565 idx<T> tmp = image.select(0, 0); 01566 idx_copy(img, tmp); 01567 } else if (img.order() >= 3) { // multiple channels 01568 idx<Tin> tmp = img.shift_dim(2, 0); 01569 image = idx<T>(tmp.get_idxdim()); 01570 idx_copy(tmp, image); 01571 } else 01572 eblerror("expected at least 2 dimensions in input but got " << img); 01573 // if input size had changed, reinit resolutions 01574 if (!initialized || 01575 (!(indim == image.get_idxdim()) && restype != NETWORK)) { 01576 init(image.get_idxdim(), frame_name); 01577 } 01578 } 01579 01580 template <typename T, class Tstate> 01581 void detector<T,Tstate>::prepare_scale(uint i) { 01582 if (i >= scales.size()) 01583 eblthrow("cannot request scale " << i << ", there are only " 01584 << nscales << " scales"); 01585 // select input/outputs buffers 01586 // output = outputs[0]; 01587 if (!mem_optimization || keep_inputs) // we use different bufs for each i 01588 input = &finput; 01589 else 01590 input = minput; 01591 // set resizing of current scale 01592 idxdim d = scales[i]; 01593 resizepp->set_dimensions(d.dim(1), d.dim(2)); 01594 // // save actual resolutions 01595 // fidxdim tmp = d; 01596 // idxdim actual = thenet.fprop_size(tmp); 01597 // actual = thenet.bprop_size(actual); 01598 // actual_scales[i] = actual; 01599 // EDEBUG("requested resolution " << d << " at scale " << i 01600 // << ": actual res " << actual); 01601 } 01602 01603 template <typename T, class Tstate> 01604 void detector<T,Tstate>::multi_res_fprop() { 01605 // timing 01606 timer t; 01607 t.start(); 01608 for (uint i = 0; i < scales.size(); ++i) { 01609 prepare_scale(i); 01610 input->x = image; // put image in input state 01611 // keep a copy of preprocess' output if displaying 01612 if (!mem_optimization || keep_inputs) 01613 resizepp->set_output_copy(ppinputs[i]); 01614 // fprop 01615 mstate<Tstate> &out = outputs[0]; 01616 thenet.fprop(*input, out); 01617 EDEBUG("detector outputs: " << out); 01618 // outputs dumping 01619 if (!outputs_dump.empty()) { 01620 string fname = outputs_dump; 01621 if (out.size() == 1) { 01622 idx<T> &o = out[0].x; 01623 fname << "_" << o << ".mat"; 01624 save_matrix(o, fname); 01625 mout << "Saved " << fname << " (" << o << ", min: " << idx_min(o) 01626 << ", max: " << idx_max(o) << ")" << endl; 01627 } else { 01628 // TODO: write code to save multi-state x components 01629 } 01630 } 01631 // memorize original input's bbox in resized input 01632 rect<int> &bbox = original_bboxes[i]; 01633 rect<int> bb = resizepp->get_original_bbox(); 01634 bbox.h0 = bb.h0; 01635 bbox.w0 = bb.w0; 01636 bbox.height = bb.height; 01637 bbox.width = bb.width; 01638 01639 // #ifdef __DUMP_STATES__ 01640 // DUMP(output->x, "detector_output_"); 01641 // #endif 01642 01643 if (optimization_swap) { // swap output and input 01644 eblerror("mem optimization temporarly broken because out is now mstate"); 01645 // tmp = input; 01646 // input = output; 01647 // output = tmp; 01648 } 01649 } 01650 if (!silent) mout << "net_processing=" << t.elapsed_ms() << endl; 01651 } 01652 01653 } // end namespace ebl 01654 01655 #endif