libidx
|
00001 /*************************************************************************** 00002 * Copyright (C) 2008 by Yann LeCun and Pierre Sermanet * 00003 * yann@cs.nyu.edu, pierre.sermanet@gmail.com * 00004 * All rights reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions are met: 00008 * * Redistributions of source code must retain the above copyright 00009 * notice, this list of conditions and the following disclaimer. 00010 * * Redistributions in binary form must reproduce the above copyright 00011 * notice, this list of conditions and the following disclaimer in the 00012 * documentation and/or other materials provided with the distribution. 00013 * * Redistribution under a license not approved by the Open Source 00014 * Initiative (http://www.opensource.org) must display the 00015 * following acknowledgement in all advertising material: 00016 * This product includes software developed at the Courant 00017 * Institute of Mathematical Sciences (http://cims.nyu.edu). 00018 * * The names of the authors may not be used to endorse or promote products 00019 * derived from this software without specific prior written permission. 00020 * 00021 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED 00022 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 00023 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 00024 * DISCLAIMED. IN NO EVENT SHALL ThE AUTHORS BE LIABLE FOR ANY 00025 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 00026 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00027 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 00028 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00029 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 ***************************************************************************/ 00032 00033 #ifndef IDXOPS_HPP 00034 #define IDXOPS_HPP 00035 00036 using namespace std; 00037 00038 namespace ebl { 00039 00041 // idx_copy 00042 00043 // generic copy for two different types. 00044 //template<class T1, class T2> void idx_copy(idx<T1> &src, idx<T2> &dst) { 00048 // {idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); }} 00049 //} 00050 00052 //template<class T1, class T2> void idx_copy(idx<T1> &src, idx<T2> &dst) { 00056 // {idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); }} 00057 //} 00058 00059 00060 // generic copy for the same type. 00061 template<class T> void idx_copy(const idx<T> &src, idx<T> &dst) { 00062 idx_checknelems2_all(src, dst); 00063 // loop and copy 00064 if (src.order() == 0 && dst.order() == 0) 00065 *(dst.idx_ptr()) = *(src.idx_ptr()); 00066 else if (src.contiguousp() && dst.contiguousp()) { 00067 /* they are both contiguous: call the stride 1 routine */ 00068 memcpy(dst.idx_ptr(), src.idx_ptr(), src.nelements() * sizeof(T)); 00069 } else { 00070 // else, they don't have the same structure: do it "by hand". 00071 // This is slower 00072 {idx_aloop2(isrc, src, T, idst, dst, T) { *idst = *isrc; }} 00073 } 00074 } 00075 00076 template<class T1, class T2> void idx_copy(const idx<T1> &src, idx<T2> &dst){ 00077 idx_checknelems2_all(src, dst); 00078 // loop and copy 00079 idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); } 00080 } 00081 00082 template<class T1, class T2> idx<T1> idx_copy(const idx<T2> &src){ 00083 idx<T1> dst(src.get_idxdim()); 00084 idx_copy(src, dst); 00085 return dst; 00086 } 00087 00088 template<class T> idx<T> idx_copy(const idx<T> &src){ 00089 idx<T> dst(src.get_idxdim()); 00090 idx_copy(src, dst); 00091 return dst; 00092 } 00093 00094 template<class T1, class T2> 00095 void idx_copy_clip(const idx<T1> &src, idx<T2> &dst) { 00096 idx_checknelems2_all(src, dst); 00097 // loop and copy 00098 idx_aloop2(isrc, src, T1, idst, dst, T2) { 00099 *idst = saturate(*isrc, T2); 00100 } 00101 } 00102 00104 // idx_clear 00105 00106 template<class T> void idx_clear(idx<T> &inp) { 00107 if (inp.contiguousp()) 00108 memset(inp.idx_ptr(), 0, inp.nelements() * sizeof (T)); 00109 else 00110 idx_aloopf1(pinp, inp, T, { *pinp = 0; }); 00111 } 00112 00114 // idx_fill 00115 00116 template<class T, class T2> void idx_fill(idx<T> & inp, T2 v) { 00117 idx_fill(inp, saturate(v, T)); 00118 } 00119 00120 template<class T> void idx_fill(idx<T> &inp, T v) { 00121 idx_aloopf1(it, inp, T, {*it = v;}) 00122 } 00123 00124 template<class T> void idx_fill_index(idx<T> &inp) { 00125 T i = 0; 00126 idx_aloopf1(it, inp, T, {*it = i++;}) 00127 } 00128 00130 // idx_shuffle_* 00131 00132 // TODO: can n random swaps be as random? (it would be more efficient) 00133 template<class T> void idx_shuffle(idx<T> &in_, intg d, idx<T> *out_) { 00134 if (!drand_ini) 00135 cerr << "Warning: random not initialized, call dynamic_init_drand()" 00136 << endl; 00137 // if out exists, use it for output, otherwise create a temporary buffer 00138 // and put output back into in. 00139 idx<T> in, out; 00140 if (out_) { // use in_ as input and out_ as output 00141 // check that in_ and out_ are different 00142 if (&in_ == out_) 00143 eblerror("input and output idx should be different"); 00144 idx_checknelems2_all(in_, *out_); 00145 in = in_; 00146 out = *out_; 00147 } else { // otherwise, use in_ as output and a copy of _in as input 00148 idxdim indims(in_); 00149 in = idx<T>(indims); 00150 idx_copy(in_, in); 00151 out = in_; 00152 } 00153 // for each element of in, put it randomly in out. 00154 // if there is a collision, loop until the next available slot 00155 idx<T> tmpi, tmpo; 00156 idx<bool> assigned(in.dim(d)); 00157 idx_fill(assigned, false); 00158 intg pos; 00159 for (intg i = 0; i < in.dim(d); ++i) { 00160 pos = (intg) drand(0, in.dim(d) - 1); 00161 if (assigned.get(pos)) { // if already assigned, loop until free slot 00162 for (intg j = pos + 1; j != pos; ++j) { 00163 if (j >= in.dim(d)) j = 0; 00164 if (j == pos) eblerror("idx_shuffle: no available slot"); 00165 if (!assigned.get(j)) { 00166 pos = j; 00167 break ; 00168 } 00169 } 00170 } 00171 // copy ith element of in into pos^th element of out 00172 tmpi = in.select(d, i); 00173 tmpo = out.select(d, pos); 00174 idx_copy(tmpi, tmpo); 00175 assigned.set(true, pos); 00176 } 00177 } 00178 00179 // TODO: can n random swaps be as random? (it would be more efficient) 00180 template<class T1, class T2> 00181 void idx_shuffle_together(idx<T1> &in1_, idx<T2> &in2_, intg d, 00182 idx<T1> *out1_, idx<T2> *out2_) { 00183 if (!drand_ini) 00184 cerr << "Warning: random not initialized, call dynamic_init_drand()" 00185 << endl; 00186 idx_checkdim2_all(in1_, in2_, d); // size of dim d must match of in1 and in2 00187 // if out exists, use it for output, otherwise create a temporary buffer 00188 // and put output back into in. 00189 idx<T1> in1, out1; 00190 idx<T2> in2, out2; 00191 if (out1_) { // use in_ as input and out_ as output 00192 // check that in_ and out_ are different 00193 if (&in1_ == out1_) 00194 eblerror("input and output idx should be different"); 00195 idx_checknelems2_all(in1_, *out1_); 00196 in1 = in1_; 00197 out1 = *out1_; 00198 } else { // otherwise, use in_ as output and a copy of _in as input 00199 idxdim indims(in1_); 00200 in1 = idx<T1>(indims); 00201 idx_copy(in1_, in1); 00202 out1 = in1_; 00203 } 00204 if (out2_) { // use in_ as input and out_ as output 00205 if (&in2_ == out2_) 00206 eblerror("input and output idx should be different"); 00207 idx_checknelems2_all(in2_, *out2_); 00208 in2 = in2_; 00209 out2 = *out2_; 00210 } else { // otherwise, use in_ as output and a copy of _in as input 00211 idxdim indims(in2_); 00212 in2 = idx<T2>(indims); 00213 idx_copy(in2_, in2); 00214 out2 = in2_; 00215 } 00216 // for each element of in, put it randomly in out. 00217 // if there is a collision, loop until the next available slot 00218 idx<T1> tmpi1, tmpo1; 00219 idx<T2> tmpi2, tmpo2; 00220 idx<bool> assigned(in1.dim(d)); 00221 idx_fill(assigned, false); 00222 intg pos; 00223 for (intg i = 0; i < in1.dim(d); ++i) { 00224 pos = (intg) drand(0, in1.dim(d) - 1); 00225 if (assigned.get(pos)) { // if already assigned, loop until free slot 00226 for (intg j = pos + 1; j != pos; ++j) { 00227 if (j >= in1.dim(d)) j = 0; 00228 if (j == pos) eblerror("no available slot"); 00229 if (!assigned.get(j)) { 00230 pos = j; 00231 break ; 00232 } 00233 } 00234 } 00235 // copy ith element of in into pos^th element of out 00236 tmpi1 = in1.select(d, i); 00237 tmpo1 = out1.select(d, pos); 00238 idx_copy(tmpi1, tmpo1); 00239 tmpi2 = in2.select(d, i); 00240 tmpo2 = out2.select(d, pos); 00241 idx_copy(tmpi2, tmpo2); 00242 assigned.set(true, pos); 00243 } 00244 } 00245 00246 // TODO: can n random swaps be as random? (it would be more efficient) 00247 template<class T1, class T2, class T3> 00248 void idx_shuffle_together(idx<T1> &in1_, idx<T2> &in2_, idx<T3> &in3_, 00249 intg d, 00250 idx<T1> *out1_, idx<T2> *out2_, idx<T3> *out3_) { 00251 if (!drand_ini) 00252 cerr << "Warning: random not initialized, call dynamic_init_drand()" 00253 << endl; 00254 // size of dim d must match of in1 and in2 and in3 00255 idx_checkdim3_all(in1_, in2_, in3_, d); 00256 // if out exists, use it for output, otherwise create a temporary buffer 00257 // and put output back into in. 00258 idx<T1> in1, out1; 00259 idx<T2> in2, out2; 00260 idx<T3> in3, out3; 00261 if (out1_) { // use in_ as input and out_ as output 00262 if (&in1_ == out1_) 00263 eblerror("input and output idx should be different"); 00264 idx_checknelems2_all(in1_, *out1_); 00265 in1 = in1_; 00266 out1 = *out1_; 00267 } else { // otherwise, use in_ as output and a copy of _in as input 00268 idxdim indims(in1_); 00269 in1 = idx<T1>(indims); 00270 idx_copy(in1_, in1); 00271 out1 = in1_; 00272 } 00273 if (out2_) { // use in_ as input and out_ as output 00274 if (&in2_ == out2_) 00275 eblerror("input and output idx should be different"); 00276 idx_checknelems2_all(in2_, *out2_); 00277 in2 = in2_; 00278 out2 = *out2_; 00279 } else { // otherwise, use in_ as output and a copy of _in as input 00280 idxdim indims(in2_); 00281 in2 = idx<T2>(indims); 00282 idx_copy(in2_, in2); 00283 out2 = in2_; 00284 } 00285 if (out3_) { // use in_ as input and out_ as output 00286 if (&in3_ == out3_) 00287 eblerror("input and output idx should be different"); 00288 idx_checknelems2_all(in3_, *out3_); 00289 in3 = in3_; 00290 out3 = *out3_; 00291 } else { // otherwise, use in_ as output and a copy of _in as input 00292 idxdim indims(in3_); 00293 in3 = idx<T3>(indims); 00294 idx_copy(in3_, in3); 00295 out3 = in3_; 00296 } 00297 // for each element of in, put it randomly in out. 00298 // if there is a collision, loop until the next available slot 00299 idx<T1> tmpi1, tmpo1; 00300 idx<T2> tmpi2, tmpo2; 00301 idx<T3> tmpi3, tmpo3; 00302 idx<bool> assigned(in1.dim(d)); 00303 idx_fill(assigned, false); 00304 intg pos; 00305 for (intg i = 0; i < in1.dim(d); ++i) { 00306 pos = (intg) drand(0, in1.dim(d) - 1); 00307 if (assigned.get(pos)) { // if already assigned, loop until free slot 00308 for (intg j = pos + 1; j != pos; ++j) { 00309 if (j >= in1.dim(d)) j = 0; 00310 if (j == pos) eblerror("no available slot"); 00311 if (!assigned.get(j)) { 00312 pos = j; 00313 break ; 00314 } 00315 } 00316 } 00317 // copy ith element of in into pos^th element of out 00318 tmpi1 = in1.select(d, i); 00319 tmpo1 = out1.select(d, pos); 00320 idx_copy(tmpi1, tmpo1); 00321 tmpi2 = in2.select(d, i); 00322 tmpo2 = out2.select(d, pos); 00323 idx_copy(tmpi2, tmpo2); 00324 tmpi3 = in3.select(d, i); 00325 tmpo3 = out3.select(d, pos); 00326 idx_copy(tmpi3, tmpo3); 00327 assigned.set(true, pos); 00328 } 00329 } 00330 00332 // idx_minus 00333 00334 template<class T> void idx_minus(idx<T> &inp, idx<T> &out) { 00335 idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = - *pinp; }); 00336 } 00337 00339 // idx_minus_acc 00340 00341 template<class T> void idx_minus_acc(idx<T> &inp, idx<T> &out) { 00342 idx_aloopf2(pinp, inp, T, pout, out, T, { *pout += - *pinp; }); 00343 } 00344 00346 // idx_inv 00347 00348 template<class T> void idx_inv(idx<T> &inp, idx<T> &out) { 00349 #ifdef __DEBUG__ 00350 idx_aloopf2(pinp, inp, T, pout, out, T, { 00351 if (*pinp == 0) eblerror("division by zero"); 00352 *pout = 1 / *pinp; 00353 }); 00354 #else 00355 idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = 1 / *pinp; }); 00356 #endif 00357 } 00358 00360 // idx_add 00361 00362 template<class T> void idx_add(idx<T> &in, idx<T> &out) { 00363 idx_aloopf2 (pin, in, T, pout, out, T, { *pout = *pout + *pin; }); 00364 } 00365 00366 template<typename T> void idx_add(idx<T> &i1, idx<T> &i2, idx<T> &out) { 00367 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { *pout = *pi1 + *pi2; }); 00368 } 00369 00371 // idx_sub 00372 00373 template<class T> void idx_sub(idx<T> &i1, idx<T> &i2) { 00374 idx_aloopf2(pi1, i1, T, pi2, i2, T, { *pi1 = *pi1 - *pi2; }); 00375 } 00376 00377 template<class T> void idx_sub(idx<T> &i1, idx<T> &i2, idx<T> &out) { 00378 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {*pout = *pi1 - *pi2; }); 00379 } 00380 00381 template<class T> void idx_subacc(idx<T> &i1, idx<T> &i2, idx<T> &out) { 00382 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {*pout += *pi1 - *pi2; }); 00383 } 00384 00386 // idx_spherical_sub 00387 00388 template<class T> void idx_spherical_sub(idx<T> &i1, idx<T> &i2, idx<T> &out){ 00389 T tmp, tmpabs, tmpabs2; 00390 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { 00391 tmp = *pi1 - *pi2; 00392 tmpabs = abs(tmp); 00393 tmpabs2 = TWOPI - tmpabs; 00394 if (tmpabs > tmpabs2) { 00395 if (tmp < 0) 00396 tmp = tmpabs2; 00397 else 00398 tmp = -tmpabs2; 00399 } 00400 *pout = tmp; 00401 }); 00402 } 00403 00405 // idx_spherical_add 00406 00407 template<class T> void idx_spherical_add(idx<T> &i1, idx<T> &i2, idx<T> &out){ 00408 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { 00409 *pout = std::min(TWOPI - *pi1, *pi1) + std::min(TWOPI - *pi2, - *pi2); 00410 }); 00411 } 00412 00414 // idx_mul 00415 00416 template<class T> void idx_mul(idx<T> &i1, idx<T> &i2, idx<T> &out) { 00417 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, 00418 {*pout = (*pi1) * (*pi2); }); 00419 } 00420 00421 template<class T> void idx_mulacc(idx<T> &i1, idx<T> &i2, idx<T> &out) { 00422 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, 00423 {*pout += (*pi1) * (*pi2); }); 00424 } 00425 00427 // idx_div 00428 00429 template<class T> void idx_div(idx<T> &i1, idx<T> &i2, idx<T> &out) { 00430 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { 00431 *pout = (*pi1) / (*pi2); 00432 }); 00433 // since we cannot put a ifdef in a macro, we repeat the code here 00434 #ifdef __DEBUG__ 00435 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { 00436 if (*pi2 == 0) 00437 eblerror("division by zero"); 00438 *pout = (*pi1) / (*pi2); 00439 }); 00440 #endif 00441 } 00442 00444 // idx_addc 00445 00446 template<class T, class T2> void idx_addc(idx<T> & inp, T2 c, idx<T> & out) { 00447 idx_addc(inp, saturate(c, T), out); 00448 } 00449 00450 template<class T> void idx_addc(idx<T> &inp, T c, idx<T> &out) { 00451 idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = *pinp + c; }); 00452 } 00453 00454 template<class T> void idx_addc(idx<T> &inp, T c) { 00455 idx_aloopf1(pinp, inp, T, { *pinp += c; }); 00456 } 00457 00459 // idx_addc_bounded 00460 00461 template<class T, class T2> 00462 void idx_addc_bounded(idx<T> & inp, T2 c, idx<T> & out) { 00463 idx_addc_bounded(inp, saturate(c, T), out); 00464 } 00465 00466 00467 template<class T> void idx_addc_bounded(idx<T> &inp, T c, idx<T> &out) { 00468 idx_aloopf2(pinp, inp, T, pout, out, T, { 00469 *pout = saturate(*pinp + c, T); 00470 }); 00471 } 00472 00474 // idx_subc_bounded 00475 00476 template<class T, class T2> 00477 void idx_subc_bounded(idx<T> &inp, T2 c, idx<T> &out) { 00478 idx_subc_bounded(inp, saturate(c, T), out); 00479 } 00480 00481 00482 template<class T> void idx_subc_bounded(idx<T> &inp, T c, idx<T> &out) { 00483 idx_aloopf2(pinp, inp, T, pout, out, T, { 00484 *pout = saturate(*pinp - c, T); 00485 }); 00486 } 00487 00489 // idx_addcacc 00490 00491 template<class T, class T2> 00492 void idx_addcacc(idx<T> & inp, T2 c, idx<T> & out) { 00493 idx_addcacc(inp, saturate(c, T), out); 00494 } 00495 00496 template<class T> void idx_addcacc(idx<T> &inp, T c, idx<T> &out) { 00497 idx_aloopf2(pinp, inp, T, pout, out, T, { *pout += *pinp + c; }); 00498 } 00499 00501 // idx_dotc 00502 00503 template<class T, class T2> void idx_dotc(idx<T> &inp, T2 c, idx<T> &out) { 00504 idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = (T)(*pinp * c); }); 00505 } 00506 00508 // idx_dotc_bounded 00509 00510 template<class T, class T2> 00511 void idx_dotc_bounded(idx<T> &inp, T2 c, idx<T> &out) { 00512 idx_aloopf2(pinp, inp, T, pout, out, T, { 00513 *pout = saturate(*pinp * c, T); 00514 }); 00515 } 00516 00518 // idx_dotcacc 00519 00520 template<class T, class T2> void idx_dotcacc(idx<T> &inp, T2 c, idx<T> &out) { 00521 idx_aloopf2(pinp, inp, T, pout, out, T, { *pout += (T)(*pinp * c); }); 00522 } 00523 00525 // idx_signdotc 00526 00527 template<class T, class T2> 00528 void idx_signdotc(idx<T> &inp, T2 c, idx<T> &out) { 00529 idx_digndotc(inp, saturate(c, T), out); 00530 } 00531 00532 template<class T> void idx_signdotc(idx<T> &inp, T c, idx<T> &out) { 00533 idx_aloopf2(pinp, inp, T, pout, out, T, { 00534 *pout = (*pinp < 0) ? -c : c; 00535 }); 00536 } 00537 00539 // idx_signdotcacc 00540 00541 template<class T, class T2> 00542 void idx_signdotcacc(idx<T> &inp, T2 c, idx<T> &out) { 00543 idx_digndotcacc(inp, saturate(c, T), out); 00544 } 00545 00546 template<class T> void idx_signdotcacc(idx<T> &inp, T c, idx<T> &out) { 00547 idx_aloopf2(pinp, inp, T, pout, out, T, { 00548 *pout += (*pinp < 0) ? -c : c; 00549 }); 00550 } 00551 00553 // idx_subsquare 00554 00555 template<class T> void idx_subsquare(idx<T> &i1, idx<T> &i2, idx<T> &out) { 00556 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { 00557 T d = *pi1 - *pi2; 00558 *pout = d*d; 00559 }); 00560 } 00561 00562 template<class T> void idx_subsquareacc(idx<T> &i1, idx<T> &i2, idx<T> &out) { 00563 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { 00564 T d = *pi1 - *pi2; 00565 *pout += d*d; 00566 }); 00567 } 00568 00569 // idx_lincom //////////////////////////////////////////////////////////////// 00570 00571 // not very efficient. There must be a more parallel way of doing this 00572 template<class T> 00573 void idx_lincomb(idx<T> &i1, T k1, idx<T> &i2, T k2, idx<T> &out) { 00574 idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { 00575 *pout = k1 * (*pi1) + k2 * (*pi2); 00576 }); 00577 } 00578 00579 // idx_tanh ////////////////////////////////////////////////////////////////// 00580 00581 template<class T> void idx_tanh(idx<T> &inp, idx<T> &out) { 00582 idx_checknelems2_all(inp, out); 00583 #ifdef __OPENMP__ 00584 T* inptr = inp.idx_ptr(); 00585 T* outptr = out.idx_ptr(); 00586 #pragma omp parallel for 00587 for(int i=0; i< inp.nelements(); ++i) { 00588 outptr[i] = (T)(tanh((double)inptr[i])); 00589 } 00590 #else 00591 idx_aloopf2(pinp, inp, T, pout, out, T, { 00592 *pout = (T)(tanh((double)*pinp)); 00593 }); 00594 #endif 00595 } 00596 00597 // idx_dtanh ///////////////////////////////////////////////////////////////// 00598 00599 template<class T> void idx_dtanh(idx<T> &inp, idx<T> &out) { 00600 idx_checknelems2_all(inp, out); 00601 idx_aloopf2(pinp, inp, T, pout, out, T, { 00602 *pout = (T) (dtanh((double)*pinp)); 00603 }); 00604 } 00605 00606 // idx_stdsigmoid //////////////////////////////////////////////////////////// 00607 00608 template<class T> void idx_stdsigmoid(idx<T> &inp, idx<T> &out) { 00609 idx_checknelems2_all(inp, out); 00610 idx_aloopf2(pinp, inp, T, pout, out, T, { 00611 *pout = (T) (stdsigmoid((double)*pinp)); 00612 }); 00613 } 00614 00615 // idx_dstdsigmoid /////////////////////////////////////////////////////////// 00616 00617 template<class T> void idx_dstdsigmoid(idx<T> &inp, idx<T> &out) { 00618 idx_checknelems2_all(inp, out); 00619 idx_aloopf2(pinp, inp, T, pout, out, T, { 00620 *pout = (T) (dstdsigmoid((double)*pinp)); 00621 }); 00622 } 00623 00624 // idx_abs /////////////////////////////////////////////////////////////////// 00625 00626 template<class T> inline T abs2 (T a) { 00627 return (T)abs((float64)a); 00628 } 00629 template<> inline float abs2 (float a) { 00630 return fabs(a); 00631 } 00632 template<> inline byte abs2 (byte a) { 00633 if (a >= 0) 00634 return a; 00635 return (a == -128) ? 127 : -a; 00636 } 00637 template<> inline int16 abs2 (int16 a) { 00638 if (a >= 0) 00639 return a; 00640 return (a == -32768) ? 32767 : -a; 00641 } 00642 00643 template<class T> void idx_abs(idx<T>& inp, idx<T>& out) { 00644 idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = abs2<T>(*pinp); }); 00645 } 00646 00647 00649 // idx_threshold_acc 00650 00651 template<class T, class T2, class T3> 00652 void idx_thresdotc_acc(idx<T>& in, T2 c, T3 th, idx<T>& out) { 00653 idx_thresdotc_acc(in, saturate(c, T), saturate(th, T), out); 00654 } 00655 00656 template<class T> void idx_thresdotc_acc(idx<T>& in, T c, T th, idx<T>& out) { 00657 idx_aloopf2(pin, in, T, pout, out, T, { 00658 *pout += (*pin < -th) ? -c : (*pin > th) ? c : 0; 00659 }); 00660 } 00661 00663 // idx_threshold (in-place) 00664 00665 template<class T, class T2> void idx_threshold(idx<T> & in, T2 th) { 00666 idx_threshold(in, saturate(th, T)); 00667 } 00668 00669 template<class T> void idx_threshold(idx<T>& in, T th) { 00670 idx_aloopf1(pin, in, T, { if (*pin < th) *pin = th; }); 00671 } 00672 00673 template<class T> void idx_threshold2(idx<T>& in, T th) { 00674 idx_aloopf1(pin, in, T, { if (*pin > th) *pin = th; }); 00675 } 00676 00678 // idx_threshold (not-in-place) 00679 00680 template<class T, class T2> 00681 void idx_threshold(idx<T> & in, T2 th, idx<T> & out) { 00682 idx_threshold(in, saturate(th, T), out); 00683 } 00684 00685 template<class T> void idx_threshold(idx<T>& in, T th, idx<T>& out) { 00686 idx_checknelems2_all(in, out); 00687 idx_aloopf2(pin, in, T, pout, out, T, { 00688 if (*pin < th) 00689 *pout = th; 00690 else 00691 *pout = *pin; 00692 }); 00693 } 00694 00696 // idx_threshold (in-place, with value) 00697 00698 template<class T, class T2, class T3> 00699 void idx_threshold(idx<T> & in, T2 th, T3 value) { 00700 idx_threshold(in, saturate(th, T), saturate(value, T)); 00701 } 00702 00703 template<class T> void idx_threshold(idx<T>& in, T th, T value) { 00704 idx_aloopf1(pin, in, T, { if (*pin < th) *pin = value; }); 00705 } 00706 00708 // idx_threshold (not-in-place, with value) 00709 00710 template<class T, class T2, class T3> 00711 void idx_threshold(idx<T> & in, T2 th, T3 value, idx<T> & out) { 00712 idx_threshold(in, saturate(th, T), saturate(value, T), out); 00713 } 00714 00715 00716 template<class T> void idx_threshold(idx<T>& in, T th, T value, idx<T>& out) { 00717 idx_aloopf2(pin, in, T, pout, out, T, { 00718 if (*pin < th) 00719 *pout = value; 00720 else 00721 *pout = *pin; 00722 }); 00723 } 00724 00726 // idx_threshold (with above and below) 00727 00728 template<class T, class T2> 00729 void idx_threshold(idx<T>& in, T th, T2 below, T2 above, idx<T2>& out) { 00730 idx_checknelems2_all(in, out); 00731 idx_aloopf2(pin, in, T, pout, out, T2, { 00732 *pout = (*pin < th) ? below : above; 00733 }); 00734 } 00735 00737 // idx_sqrt 00738 00739 template<class T> void idx_sqrt(idx<T>& in, idx<T>& out) { 00740 idx_checknelems2_all(in, out); 00741 idx_aloopf2(pin, in, T, pout, out, T, { 00742 *pout = (T)sqrt((float64)(*pin)); 00743 }); 00744 } 00745 00747 // idx_power 00748 00749 template<class T, class T2> void idx_power(idx<T> & in, T2 p, idx<T> & out) { 00750 idx_checknelems2_all(in, out); 00751 idx_power(in, saturate(p, T), out); 00752 } 00753 00754 template<class T> void idx_power(idx<T>& in, T p, idx<T>& out) { 00755 idx_checknelems2_all(in, out); 00756 idx_aloopf2(pin, in, T, pout, out, T, { 00757 *pout = saturate(pow((double)(*pin), (double)p), T); 00758 }); 00759 } 00760 00762 // idx_sum 00763 //#ifndef __OPENMP__ 00764 00765 // there is a much faster and parallel way 00766 // of doing this using a tree. 00767 template<typename Tout, typename T> Tout idx_sum(idx<T> &inp) { 00768 Tout z = 0; 00769 idx_aloopf1(pinp, inp, T, { z += (Tout)(*pinp); }); 00770 return z; 00771 } 00772 00773 // there is a much faster and parallel way 00774 // of doing this using a tree. 00775 template<class T> T idx_sum(idx<T> &inp, T *out) { 00776 T z = idx_sum(inp); 00777 if (out != NULL) 00778 *out = saturate(z, T); 00779 return z; 00780 } 00781 00782 // there is a much faster and parallel way 00783 // of doing this using a tree. 00784 template<class T> T idx_sum(idx<T> &inp) { 00785 T z = 0; 00786 idx_aloopf1(pinp, inp, T, { z += (T)(*pinp); }); 00787 return z; 00788 } 00789 00790 // #endif /* ifndef __OPENMP__ */ 00791 00793 // idx_sumabs 00794 00795 // there is a much faster and parallel way 00796 // of doing this using a tree. 00797 template<class T> float64 idx_sumabs(idx<T> &inp, T *out) { 00798 float64 z = 0; 00799 idx_aloopf1(pinp, inp, T, { z += abs((float64)(*pinp)); }); 00800 if (out != NULL) { 00801 *out = saturate(z, T); 00802 return *out; 00803 } 00804 return z; 00805 } 00806 00808 // idx_sumsqr 00809 00810 // there is a much faster and parallel way 00811 // of doing this using a tree. 00812 template<class T> float64 idx_sumsqr(idx<T> &inp) { 00813 float64 z = 0; 00814 idx_aloopf1(pinp, inp, T, { z += ((float64)(*pinp))*((float64)(*pinp)); }); 00815 return z; 00816 } 00817 00819 // idx_sumacc 00820 00821 template<class T> float64 idx_sumacc(idx<T> &inp, idx<T> &acc) { 00822 // acc must be of order 0. 00823 if (acc.order() != 0) 00824 eblerror("expecting an idx0 as output"); 00825 float64 sum = (float64)acc.get() + idx_sum(inp); 00826 acc.set(saturate(sum, T)); 00827 return sum; 00828 //return idx_sum(inp, acc.ptr()); 00829 } 00830 00832 // idx_sumabs (with acc) 00833 00834 template<class T> float64 idx_sumabs(idx<T> &inp, idx<T> &acc) { 00835 // acc must be of order 0. 00836 if (acc.order() != 0) 00837 eblerror("expecting an idx0 as output"); 00838 float64 sum = (float64)acc.get() + idx_sumabs(inp); 00839 acc.set(saturate(sum, T)); 00840 return sum; 00841 //return idx_sumabs(inp, acc.ptr()); 00842 } 00843 00845 // idx_l1 00846 00847 template<class T> float64 idx_l1(idx<T> &m1, idx<T> &m2) { 00848 idx_checknelems2_all(m1, m2); 00849 float64 z = 0; 00850 idx_aloopf2(pm1, m1, T, pm2, m2, T, 00851 { z += (float64) fabs((float64) (*pm1 - *pm2)); }); 00852 return z; 00853 } 00854 00856 // idx_l2norm 00857 00858 template<class T> float64 idx_l2norm(idx<T> &in) { 00859 return sqrt((T) idx_sumsqr(in)); 00860 } 00861 00862 template<class T> float64 idx_l2(idx<T> &in) { 00863 return sqrt((T) idx_sumsqr(in)); 00864 } 00865 00866 template<class T> float64 idx_l2(idx<T> &m1, idx<T> &m2) { 00867 idx_checknelems2_all(m1, m2); 00868 float64 z = 0; 00869 float64 tmp; 00870 idx_aloopf2(pm1, m1, T, pm2, m2, T, 00871 { tmp = *pm1 - (float64) *pm2; 00872 z += tmp * tmp; }); 00873 return sqrt(z); 00874 } 00875 00876 template<class T> float64 idx_l2(midx<T> &m1, midx<T> &m2) { 00877 idx_checknelems2_all(m1, m2); 00878 float64 z = 0; 00879 idx_aloopf2(pe1, ((idx<idx<T>*>&) m1), idx<T>*, 00880 pe2, ((idx<idx<T>*>&) m2), idx<T>*, { 00881 idx<T> *e1 = *pe1; idx<T> *e2 = *pe2; 00882 if (e1 && e2) 00883 z += idx_l2(*e1, *e2); 00884 else if (e1 != e2) 00885 eblerror("expected both matrices to be empty in " << m1 00886 << " and " << m2); 00887 }); 00888 return sqrt(z); 00889 } 00890 00891 template<class T> float64 idx_l2common(midx<T> &m1, midx<T> &m2) { 00892 idx_checknelems2_all(m1, m2); 00893 float64 z = 0; 00894 idx_aloopf2(pe1, ((idx<idx<T>*>&) m1), idx<T>*, 00895 pe2, ((idx<idx<T>*>&) m2), idx<T>*, { 00896 idx<T> *e11 = *pe1; idx<T> *e22 = *pe2; 00897 if (e11 && e22) { 00898 idx<T> e1 = *e11; idx<T> e2 = *e22; 00899 idx_checkorder2_all(e1, e2); 00900 for (intg i = 0; i < e1.order(); ++i) { 00901 e1 = e1.narrow(i, std::min(e1.dim(i), e2.dim(i)), 0); 00902 e2 = e2.narrow(i, std::min(e1.dim(i), e2.dim(i)), 0); 00903 } 00904 z += idx_l2(e1, e2); 00905 } 00906 }); 00907 return sqrt(z); 00908 } 00909 00911 // idx_mean 00912 00913 template<class T> T idx_mean(idx<T> &in, T *out) { 00914 if (out != NULL) { 00915 *out = (T)(idx_sum(in) / (float64)in.nelements()); 00916 return *out; 00917 } 00918 return (T)(idx_sum(in) / (float64)in.nelements()); 00919 } 00920 00922 // idx_std_normalize 00923 00924 template<typename T> 00925 void idx_std_normalize(idx<T> &in, idx<T> &out, T *mean_) { 00926 T mean = mean_ ? *mean_ : idx_mean(in); 00927 idx_addc(in, (T)-mean, out); // remove mean 00928 // std deviation 00929 #ifdef __WINDOWS__ 00930 T coeff = (T) sqrt((double) (idx_sumsqr(out) / out.nelements())); 00931 #else 00932 T coeff = (T) sqrt((T) (idx_sumsqr(out) / out.nelements())); 00933 #endif 00934 if (coeff != 0) 00935 idx_dotc(out, 1 / coeff, out); 00936 } 00937 00939 // idx_flip 00940 00941 template <class T> idx<T> idx_flip(idx<T> &m, uint n, idx<T> *m2) { 00942 if (m2 == NULL) { 00943 idx<T> flipped(m.get_idxdim()); 00944 return idx_flip(m, n, &flipped); 00945 } else { 00946 if (!m.same_dim(m2->get_idxdim())) { 00947 eblerror("expected same dim idx in idx_flip, but got " << m 00948 << " and " << *m2); 00949 } else { 00950 if (n == 0) { 00951 // we reached the dimension we want to flip, flip it 00952 for (intg i = 0; i < m.dim(0); ++i) { 00953 idx<T> a = m.select(0, i); 00954 idx<T> b = m2->select(0, m.dim(0) - i - 1); 00955 idx_copy(a, b); 00956 } 00957 } else { 00958 // we didn't reach the dimension to flip, call another recursion 00959 idx_bloop2(mm, m, T, mm2, *m2, T) { 00960 idx_flip(mm, n - 1, &mm2); 00961 } 00962 } 00963 } 00964 return *m2; 00965 } 00966 } 00967 00968 template <class T> midx<T> idx_flip(midx<T> &m, uint n, midx<T> *m2) { 00969 if (m2 == NULL) { 00970 midx<T> flipped(m.get_idxdim()); 00971 // allocate all elements 00972 flipped.resize(m); 00973 return idx_flip(m, n, &flipped); 00974 } else { 00975 if (!m.same_dim(*m2)) { 00976 eblerror("expected same dim idx in idx_flip, but got " << m 00977 << " and " << *m2); 00978 } else { 00979 // loop on each object of m and flip it 00980 if (m.order() == 1) { 00981 for (intg i = 0; i < m.dim(0); ++i) { 00982 idx<T> tmp = m.get(i); 00983 tmp = idx_flip(tmp, n); 00984 m2->set(tmp, i); 00985 } 00986 } else if (m.order() == 2) { 00987 for (intg i = 0; i < m.dim(0); ++i) 00988 for (intg j = 0; j < m.dim(1); ++j) { 00989 idx<T> tmp = m.get(i, j); 00990 tmp = idx_flip(tmp, n); 00991 m2->set(tmp, i, j); 00992 } 00993 } 00994 } 00995 return *m2; 00996 } 00997 } 00998 00999 template<class T> void rev_idx2 (idx<T> &m) { 01000 idx_check_contiguous1(m); 01001 if (m.order() != 2) 01002 idx_compatibility_error1(m, "expecting idx of order 2"); 01003 T tmp, *p = m.idx_ptr(); 01004 intg size = m.dim(0) * m.dim(1); 01005 intg i; 01006 01007 for (i = 0; i < size/2; i++) { 01008 tmp = p[i]; 01009 p[i] = p[size-i-1]; 01010 p[size-i-1] = tmp; 01011 } 01012 } 01013 01014 template<class T> void rev_idx2_tr (idx<T> &m, idx<T> &n) { 01015 idx_check_contiguous2(m, n); 01016 if ((m.order() != 2) || (n.order() != 2)) 01017 idx_compatibility_error2(m, n, "expecting idx of order 2"); 01018 intg size = m.dim(0) * m.dim(1); 01019 T *p = m.idx_ptr() + size - 1; 01020 T *q = n.idx_ptr(); 01021 for (int i = 0; i < size; i++) { 01022 *(q++) = *(p--); 01023 } 01024 } 01025 01027 // idx's products 01028 01029 /* multiply M2 by M1, result in M1 */ 01030 /* matrix - vector product */ 01031 template<class T> 01032 void idx_m2dotm1(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01033 T *c1, *c2, *c1_0, *ker; 01034 intg c1_m1 = i1.mod(1), c2_m0 = i2.mod(0); 01035 intg j, jmax = i2.dim(0); 01036 intg c1_m0 = i1.mod(0), d1_m0 = o1.mod(0); 01037 T *d1, f; 01038 intg i, imax = o1.dim(0); 01039 c1_0 = i1.idx_ptr(); 01040 ker = i2.idx_ptr(); 01041 d1 = o1.idx_ptr(); 01042 for (i=0; i<imax; i++){ 01043 f = 0; 01044 c1 = c1_0; 01045 c2 = ker; 01046 if(c1_m1==1 && c2_m0==1) 01047 for (j=0; j<jmax; j++) 01048 f += (*c1++)*(*c2++); 01049 else 01050 for (j=0; j<jmax; j++) { 01051 f += (*c1)*(*c2); 01052 c1 += c1_m1; 01053 c2 += c2_m0; 01054 } 01055 *d1 = f; 01056 d1 += d1_m0; 01057 c1_0 += c1_m0; 01058 } 01059 } 01060 01061 /* multiply M2 by M1, result in M1 */ 01062 template <class T> 01063 void idx_m2dotm1acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01064 T *c1, *c2, *c1_0, *ker; 01065 intg c1_m1 = i1.mod(1), c2_m0 = i2.mod(0); 01066 intg j, jmax = i2.dim(0); 01067 intg c1_m0 = i1.mod(0), d1_m0 = o1.mod(0); 01068 T *d1, f; 01069 intg i, imax = o1.dim(0); 01070 c1_0 = i1.idx_ptr(); 01071 ker = i2.idx_ptr(); 01072 d1 = o1.idx_ptr(); 01073 for (i=0; i<imax; i++){ 01074 f = *d1; 01075 c1 = c1_0; 01076 c2 = ker; 01077 for (j=0; j<jmax; j++) { 01078 f += (*c1)*(*c2); 01079 c1 += c1_m1; 01080 c2 += c2_m0; 01081 } 01082 *d1 = f; 01083 d1 += d1_m0; 01084 c1_0 += c1_m0; 01085 } 01086 } 01087 01088 template <class T> 01089 void idx_m2dotm3(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01090 idx_bloop2(ii2, i2, T, oo1, o1, T) { 01091 idx_bloop2(iii2, ii2, T, ooo1, oo1, T) { 01092 idx_m2dotm1(i1, iii2, ooo1); 01093 } 01094 } 01095 } 01096 01097 // m2dotm2 /////////////////////////////////////////////////////////////////// 01098 01099 template <class T> 01100 void idx_m2dotm2(idx<T> &i1, idx<T> &i2, idx<T> &o) { 01101 idx_checkorder3(i1, 2, i2, 2, o, 2); 01102 if (i1.dim(1) != i2.dim(0) || i1.dim(0) != o.dim(0) || 01103 o.dim(1) != i2.dim(1)) 01104 eblerror("incompatible dimensions for matrix-matrix multiplication of " 01105 << i1 << " . " << i2 << " -> " << o); 01106 T *c1, *c2, *c1_0, *ker; 01107 intg c1_m1 = i1.mod(1), c2_m0 = i2.mod(0); 01108 intg j, jmax = i2.dim(0); 01109 intg c1_m0 = i1.mod(0), d1_m0 = o.mod(0); 01110 T *d1, f; 01111 intg i, imax = o.dim(0); 01112 intg k, kmax = o.dim(1); 01113 // loop on o.dim(1) 01114 for (k = 0; k < kmax; ++k) { 01115 c1_0 = i1.idx_ptr(); 01116 d1 = o.idx_ptr() + k; 01117 ker = i2.idx_ptr() + k; 01118 // loop on o.dim(0) 01119 for (i=0; i<imax; i++){ 01120 f = 0; 01121 c1 = c1_0; 01122 c2 = ker; 01123 // loop on 01124 if(c1_m1==1 && c2_m0==1) 01125 for (j=0; j<jmax; j++) 01126 f += (*c1++)*(*c2++); 01127 else 01128 for (j=0; j<jmax; j++) { 01129 f += (*c1)*(*c2); 01130 c1 += c1_m1; 01131 c2 += c2_m0; 01132 } 01133 *d1 = f; 01134 d1 += d1_m0; 01135 c1_0 += c1_m0; 01136 } 01137 } 01138 } 01139 01140 // m4dotm2 /////////////////////////////////////////////////////////////////// 01141 01142 // TODO-0 write specialized blas version in cpp 01143 template<class T> void idx_m4dotm2(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01144 idx_checkorder3(i1, 4, i2, 2, o1, 2); // check for compatible orders 01145 if ((i1.dim(0) != o1.dim(0)) || (i1.dim(1) != o1.dim(1)) 01146 || (i1.dim(2) != i2.dim(0)) || (i1.dim(3) != i2.dim(1))) 01147 idx_compatibility_error3(i1, i2, o1, "incompatible dimensions"); 01148 T *c1, *c1_2; 01149 T *c2, *c2_0; 01150 T *c1_0, *c1_1; 01151 T *ker; 01152 intg c1_m2 = (i1).mod(2), c2_m0 = (i2).mod(0); 01153 intg c1_m3 = (i1).mod(3), c2_m1 = (i2).mod(1); 01154 intg k,l, kmax = (i2).dim(0), lmax = (i2).dim(1); 01155 T *d1_0, *d1, f; 01156 intg c1_m0 = (i1).mod(0), d1_m0 = (o1).mod(0); 01157 intg c1_m1 = (i1).mod(1), d1_m1 = (o1).mod(1); 01158 intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1); 01159 c1_0 = i1.idx_ptr(); 01160 ker = i2.idx_ptr(); 01161 d1_0 = o1.idx_ptr(); 01162 for (i=0; i<imax; i++) { 01163 c1_1 = c1_0; 01164 d1 = d1_0; 01165 for (j=0; j<jmax; j++) { 01166 f = 0; 01167 c1_2 = c1_1; 01168 c2_0 = ker; 01169 for (k=0; k<kmax; k++) { 01170 c1 = c1_2; 01171 c2 = c2_0; 01172 for (l=0; l<lmax; l++) { 01173 f += (*c1)*(*c2); 01174 c1 += c1_m3; 01175 c2 += c2_m1; 01176 } 01177 c1_2 += c1_m2; 01178 c2_0 += c2_m0; 01179 } 01180 *d1 = f; 01181 d1 += d1_m1; 01182 c1_1 += c1_m1; 01183 } 01184 d1_0 += d1_m0; 01185 c1_0 += c1_m0; 01186 } 01187 } 01188 01189 // TODO-0 write specialized blas version in cpp 01190 template<class T> void idx_m4dotm2acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01191 idx_checkorder3(i1, 4, i2, 2, o1, 2); // check for compatible orders 01192 if ((i1.dim(0) != o1.dim(0)) || (i1.dim(1) != o1.dim(1)) 01193 || (i1.dim(2) != i2.dim(0)) || (i1.dim(3) != i2.dim(1))) 01194 idx_compatibility_error3(i1, i2, o1, "incompatible dimensions"); 01195 T *c1, *c1_2; 01196 T *c2, *c2_0; 01197 T *c1_0, *c1_1; 01198 T *ker; 01199 intg c1_m2 = (i1).mod(2), c2_m0 = (i2).mod(0); 01200 intg c1_m3 = (i1).mod(3), c2_m1 = (i2).mod(1); 01201 intg k,l, kmax = (i2).dim(0), lmax = (i2).dim(1); 01202 T *d1_0, *d1, f; 01203 intg c1_m0 = (i1).mod(0), d1_m0 = (o1).mod(0); 01204 intg c1_m1 = (i1).mod(1), d1_m1 = (o1).mod(1); 01205 intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1); 01206 c1_0 = i1.idx_ptr(); 01207 ker = i2.idx_ptr(); 01208 d1_0 = o1.idx_ptr(); 01209 for (i=0; i<imax; i++) { 01210 c1_1 = c1_0; 01211 d1 = d1_0; 01212 for (j=0; j<jmax; j++) { 01213 f = *d1; 01214 c1_2 = c1_1; 01215 c2_0 = ker; 01216 for (k=0; k<kmax; k++) { 01217 c1 = c1_2; 01218 c2 = c2_0; 01219 for (l=0; l<lmax; l++) { 01220 f += (*c1)*(*c2); 01221 c1 += c1_m3; 01222 c2 += c2_m1; 01223 } 01224 c1_2 += c1_m2; 01225 c2_0 += c2_m0; 01226 } 01227 *d1 = f; 01228 d1 += d1_m1; 01229 c1_1 += c1_m1; 01230 } 01231 d1_0 += d1_m0; 01232 c1_0 += c1_m0; 01233 } 01234 } 01235 01236 template<class T> void idx_m4squdotm2acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01237 idx_checkorder3(i1, 4, i2, 2, o1, 2); 01238 T *c1, *c1_2; 01239 T *c2, *c2_0; 01240 T *c1_0, *c1_1; 01241 T *ker; 01242 intg c1_m2 = (i1).mod(2), c2_m0 = (i2).mod(0); 01243 intg c1_m3 = (i1).mod(3), c2_m1 = (i2).mod(1); 01244 intg k,l, kmax = (i2).dim(0), lmax = (i2).dim(1); 01245 T *d1_0, *d1, f; 01246 intg c1_m0 = (i1).mod(0), d1_m0 = (o1).mod(0); 01247 intg c1_m1 = (i1).mod(1), d1_m1 = (o1).mod(1); 01248 intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1); 01249 c1_0 = i1.idx_ptr(); 01250 ker = i2.idx_ptr(); 01251 d1_0 = o1.idx_ptr(); 01252 for (i=0; i<imax; i++) { 01253 c1_1 = c1_0; 01254 d1 = d1_0; 01255 for (j=0; j<jmax; j++) { 01256 f = *d1; 01257 c1_2 = c1_1; 01258 c2_0 = ker; 01259 for (k=0; k<kmax; k++) { 01260 c1 = c1_2; 01261 c2 = c2_0; 01262 for (l=0; l<lmax; l++) { 01263 f += (*c1)*(*c1)*(*c2); 01264 c1 += c1_m3; 01265 c2 += c2_m1; 01266 } 01267 c1_2 += c1_m2; 01268 c2_0 += c2_m0; 01269 } 01270 *d1 = f; 01271 d1 += d1_m1; 01272 c1_1 += c1_m1; 01273 } 01274 d1_0 += d1_m0; 01275 c1_0 += c1_m0; 01276 } 01277 } 01278 01279 template<class T> void idx_m2extm2(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01280 idx_checkorder3(i1, 2, i2, 2, o1, 4); 01281 T *c2_0, *c2_1; 01282 T *d1_2, *d1_3; 01283 T *d1_0, *d1_1; 01284 T *c1_0, *c1_1; 01285 T *ker; 01286 intg c2_m0 = (i2).mod(0), c2_m1 = (i2).mod(1); 01287 intg d1_m2 = (o1).mod(2), d1_m3 = (o1).mod(3); 01288 intg c1_m0 = (i1).mod(0), c1_m1 = (i1).mod(1); 01289 intg d1_m0 = (o1).mod(0), d1_m1 = (o1).mod(1); 01290 intg k,l, lmax = (o1).dim(3), kmax = (o1).dim(2); 01291 intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1); 01292 c1_0 = i1.idx_ptr(); 01293 ker = i2.idx_ptr(); 01294 d1_0 = o1.idx_ptr(); 01295 for (i=0; i<imax; i++) { 01296 d1_1 = d1_0; 01297 c1_1 = c1_0; 01298 for (j=0; j<jmax; j++) { 01299 d1_2 = d1_1; 01300 c2_0 = ker; 01301 for (k=0; k<kmax; k++) { 01302 d1_3 = d1_2; 01303 c2_1 = c2_0; 01304 for (l=0; l<lmax; l++) { 01305 *d1_3 = (*c1_1)*(*c2_1); 01306 d1_3 += d1_m3; 01307 c2_1 += c2_m1; 01308 } 01309 d1_2 += d1_m2; 01310 c2_0 += c2_m0; 01311 } 01312 d1_1 += d1_m1; 01313 c1_1 += c1_m1; 01314 } 01315 d1_0 += d1_m0; 01316 c1_0 += c1_m0; 01317 } 01318 } 01319 01320 template<class T> void idx_m2extm2acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01321 idx_checkorder3(i1, 2, i2, 2, o1, 4); 01322 T *c2_0, *c2_1; 01323 T *d1_2, *d1_3; 01324 T *d1_0, *d1_1; 01325 T *c1_0, *c1_1; 01326 T *ker; 01327 intg c2_m0 = (i2).mod(0), c2_m1 = (i2).mod(1); 01328 intg d1_m2 = (o1).mod(2), d1_m3 = (o1).mod(3); 01329 intg c1_m0 = (i1).mod(0), c1_m1 = (i1).mod(1); 01330 intg d1_m0 = (o1).mod(0), d1_m1 = (o1).mod(1); 01331 intg k,l, lmax = (o1).dim(3), kmax = (o1).dim(2); 01332 intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1); 01333 c1_0 = i1.idx_ptr(); 01334 ker = i2.idx_ptr(); 01335 d1_0 = o1.idx_ptr(); 01336 for (i=0; i<imax; i++) { 01337 d1_1 = d1_0; 01338 c1_1 = c1_0; 01339 for (j=0; j<jmax; j++) { 01340 d1_2 = d1_1; 01341 c2_0 = ker; 01342 for (k=0; k<kmax; k++) { 01343 d1_3 = d1_2; 01344 c2_1 = c2_0; 01345 for (l=0; l<lmax; l++) { 01346 *d1_3 += (*c1_1)*(*c2_1); 01347 d1_3 += d1_m3; 01348 c2_1 += c2_m1; 01349 } 01350 d1_2 += d1_m2; 01351 c2_0 += c2_m0; 01352 } 01353 d1_1 += d1_m1; 01354 c1_1 += c1_m1; 01355 } 01356 d1_0 += d1_m0; 01357 c1_0 += c1_m0; 01358 } 01359 } 01360 01361 template<class T> void idx_m2squextm2acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01362 idx_checkorder3(i1, 2, i2, 2, o1, 4); 01363 T *c2_0, *c2_1; 01364 T *d1_2, *d1_3; 01365 T *d1_0, *d1_1; 01366 T *c1_0, *c1_1; 01367 T *ker; 01368 intg c2_m0 = (i2).mod(0), c2_m1 = (i2).mod(1); 01369 intg d1_m2 = (o1).mod(2), d1_m3 = (o1).mod(3); 01370 intg c1_m0 = (i1).mod(0), c1_m1 = (i1).mod(1); 01371 intg d1_m0 = (o1).mod(0), d1_m1 = (o1).mod(1); 01372 intg k,l, lmax = (o1).dim(3), kmax = (o1).dim(2); 01373 intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1); 01374 c1_0 = i1.idx_ptr(); 01375 ker = i2.idx_ptr(); 01376 d1_0 = o1.idx_ptr(); 01377 for (i=0; i<imax; i++) { 01378 d1_1 = d1_0; 01379 c1_1 = c1_0; 01380 for (j=0; j<jmax; j++) { 01381 d1_2 = d1_1; 01382 c2_0 = ker; 01383 for (k=0; k<kmax; k++) { 01384 d1_3 = d1_2; 01385 c2_1 = c2_0; 01386 for (l=0; l<lmax; l++) { 01387 *d1_3 += (*c1_1)*(*c2_1)*(*c2_1); 01388 d1_3 += d1_m3; 01389 c2_1 += c2_m1; 01390 } 01391 d1_2 += d1_m2; 01392 c2_0 += c2_m0; 01393 } 01394 d1_1 += d1_m1; 01395 c1_1 += c1_m1; 01396 } 01397 d1_0 += d1_m0; 01398 c1_0 += c1_m0; 01399 } 01400 } 01401 01402 // Fu Jie Huang, May 20, 2008 01403 template<typename T> void idx_m2squdotm2(idx<T>& i1, idx<T>& i2, idx<T>& o) { 01404 idx_checkorder3(i1, 2, i2, 2, o, 0); 01405 idx_checkdim2(i1, 0, i2.dim(0), i1, 1, i2.dim(1)); 01406 intg imax = i1.dim(0), jmax = i2.dim(1); 01407 intg c1_m0 = i1.mod(0), c2_m0 = i2.mod(0); 01408 intg c1_m1 = i1.mod(1), c2_m1 = i2.mod(1); 01409 01410 T *c1_0 = i1.idx_ptr(); 01411 T *c2_0 = i2.idx_ptr(); 01412 T *d1 = o.idx_ptr(); 01413 T *c1, *c2; 01414 01415 T f = 0; 01416 for(int i=0; i<imax; ++i) { 01417 c1 = c1_0; 01418 c2 = c2_0; 01419 for(int j=0; j<jmax; ++j) { 01420 f += (*c1) * (*c1) * (*c2); // only difference 01421 c1 += c1_m1; 01422 c2 += c2_m1; 01423 } 01424 c1_0 += c1_m0; 01425 c2_0 += c2_m0; 01426 } 01427 *d1 = f; 01428 } 01429 01430 // Fu Jie Huang, May 20, 2008 01431 template<typename T> 01432 void idx_m2squdotm2acc(idx<T>& i1, idx<T>& i2, idx<T>& o) { 01433 idx_checkorder3(i1, 2, i2, 2, o, 0); 01434 idx_checkdim2(i1, 0, i2.dim(0), i1, 1, i2.dim(1)); 01435 intg imax = i1.dim(0), jmax = i2.dim(1); 01436 intg c1_m0 = i1.mod(0), c2_m0 = i2.mod(0); 01437 intg c1_m1 = i1.mod(1), c2_m1 = i2.mod(1); 01438 01439 T *c1_0 = i1.idx_ptr(); 01440 T *c2_0 = i2.idx_ptr(); 01441 T *d1 = o.idx_ptr(); 01442 T *c1, *c2; 01443 01444 T f = *d1; // only difference: accumulate 01445 for(int i=0; i<imax; ++i) { 01446 c1 = c1_0; 01447 c2 = c2_0; 01448 for(int j=0; j<jmax; ++j) { 01449 f += (*c1) * (*c1) * (*c2); 01450 c1 += c1_m1; 01451 c2 += c2_m1; 01452 } 01453 c1_0 += c1_m0; 01454 c2_0 += c2_m0; 01455 } 01456 *d1 = f; 01457 } 01458 01460 // idx_m2oversample 01461 01462 // Fu Jie Huang, May 20, 2008 01463 template<typename T> 01464 void idx_m2oversample(idx<T>& small, intg nlin, intg ncol, idx<T>& big) { 01465 idx<T> uin = big.unfold(0, nlin, nlin); 01466 idx<T> uuin = uin.unfold(1, ncol, ncol); 01467 idx_eloop1(z1, uuin, T) { 01468 idx_eloop1(z2, z1, T) { 01469 idx_copy(small, z2); 01470 } 01471 } 01472 } 01473 01474 template<typename T> 01475 void idx_m2oversampleacc(idx<T>& small, intg nlin, intg ncol, idx<T>& big) { 01476 idx<T> uin = big.unfold(0, nlin, nlin); 01477 idx<T> uuin = uin.unfold(1, ncol, ncol); 01478 idx_eloop1(z1, uuin, T) { 01479 idx_eloop1(z2, z1, T) { 01480 idx_add(small, z2, z2); 01481 } 01482 } 01483 } 01484 01486 // idx_max 01487 01488 template <class T> T idx_max(idx<T> &m) { 01489 T v = *(m.idx_ptr()); 01490 idx_aloopf1(i, m, T, { if (*i > v) v = *i; }); 01491 return v; 01492 } 01493 01494 // idx_max (between 2 idx's, in-place) 01495 template <class T> void idx_max(idx<T> &in1, idx<T> &in2) { 01496 idx_aloopf2(i1, in1, T, i2, in2, T, { *i2 = std::max(*i1, *i2); }); 01497 } 01498 01499 // idx_max (between 2 idx's, not-in-place) 01500 template <class T> void idx_max(idx<T> &in1, idx<T> &in2, idx<T> &out) { 01501 idx_aloopf3(i1, in1, T, i2, in2, T, o, out, T, { *o = std::max(*i1, *i2);}); 01502 } 01503 01505 // idx_min 01506 01507 template <class T> T idx_min(idx<T> &m) { 01508 T v = *(m.idx_ptr()); 01509 idx_aloopf1(i, m, T, { if (*i < v) v = *i; }); 01510 return v; 01511 } 01512 01513 // idx_min (between 2 idx's, in-place) 01514 template <class T> void idx_min(idx<T> &in1, idx<T> &in2) { 01515 idx_aloopf2(i1, in1, T, i2, in2, T, { *i2 = std::min(*i1, *i2); }); 01516 } 01517 01519 // idx_indexmax 01520 01521 template<class T> intg idx_indexmax(idx<T> &m) { 01522 intg i = 0, imax = 0; 01523 T v = *(m.idx_ptr()); 01524 idx_aloopf1(me, m, T, { 01525 if (*me > v) { 01526 v = *me; 01527 imax = i; 01528 } 01529 i++; 01530 }); 01531 return imax; 01532 } 01533 01535 // idx_indexmin 01536 01537 template<class T> intg idx_indexmin(idx<T> &m) { 01538 intg i = 0, imin = 0; 01539 T v = *(m.idx_ptr()); 01540 idx_aloopf1(me, m, T, { 01541 if (*me < v) { 01542 v = *me; 01543 imin = i; 01544 } 01545 i++; 01546 }); 01547 return imin; 01548 } 01549 01551 // idx_sortdown 01552 01553 template<class T1, class T2> void idx_sortdown(idx<T1> &m, idx<T2> &p) { 01554 idx_checkorder2(m, 1, p, 1); 01555 if (m.mod(0) != 1) eblerror("idx_sortdown: vector is not contiguous"); 01556 if (p.mod(0) != 1) eblerror("idx_sortdown: vector is not contiguous"); 01557 intg n = m.dim(0); 01558 intg z = p.dim(0); 01559 if (n != z) eblerror("idx_sortdown: vectors have different sizes"); 01560 if (n > 1) { 01561 int l,j,ir,i; 01562 T1 *ra, rra; 01563 T2 *rb, rrb; 01564 01565 ra = (T1*)m.idx_ptr() -1; 01566 rb = (T2*)p.idx_ptr() -1; 01567 01568 l = (n >> 1) + 1; 01569 ir = n; 01570 for (;;) { 01571 if (l > 1) { 01572 rra=ra[--l]; 01573 rrb=rb[l]; 01574 } else { 01575 rra=ra[ir]; 01576 rrb=rb[ir]; 01577 ra[ir]=ra[1]; 01578 rb[ir]=rb[1]; 01579 if (--ir == 1) { 01580 ra[1]=rra; 01581 rb[1]=rrb; 01582 return ; } } 01583 i=l; 01584 j=l << 1; 01585 while (j <= ir) { 01586 if (j < ir && ra[j] > ra[j+1]) ++j; 01587 if (rra > ra[j]) { 01588 ra[i]=ra[j]; 01589 rb[i]=rb[j]; 01590 j += (i=j); 01591 } else j=ir+1; } 01592 ra[i]=rra; 01593 rb[i]=rrb; 01594 } 01595 } 01596 } 01597 01600 template<class T> void idx_sortup(idx<T> &m) { 01601 idx_checkorder1(m, 1); 01602 idx_check_contiguous1(m); 01603 intg n = m.dim(0); 01604 if (n > 1) { 01605 int l,j,ir,i; 01606 T *ra, rra; 01607 01608 ra = m.idx_ptr() - 1; 01609 l=(n >> 1)+1; 01610 ir= n; 01611 for (;;) { 01612 if (l > 1) 01613 rra=ra[--l]; 01614 else { 01615 rra=ra[ir]; 01616 ra[ir]=ra[1]; 01617 if (--ir == 1) { 01618 ra[1]=rra; 01619 goto end; 01620 } 01621 } 01622 i=l; 01623 j=l << 1; 01624 while (j <= ir) { 01625 if (j < ir && ra[j] < ra[j+1]) ++j; 01626 if (rra < ra[j]) { 01627 ra[i]=ra[j]; 01628 j += (i=j); 01629 } else j=ir+1; 01630 } 01631 ra[i]=rra; 01632 } 01633 } 01634 end:; 01635 } 01636 01639 template<class T1, class T2> void idx_sortup(idx<T1> &m, idx<T2> &m2) { 01640 idx_checkorder2(m, 1, m2, 1); 01641 idx_check_contiguous2(m, m2); 01642 idx_checknelems2_all(m, m2); // they must have same number of elements 01643 intg n = m.dim(0); 01644 if (n > 1) { 01645 int l,j,ir,i; 01646 T1 *ra, rra; 01647 T2 *ra2, rra2; 01648 01649 ra = m.idx_ptr() - 1; 01650 ra2 = m2.idx_ptr() - 1; 01651 l=(n >> 1)+1; 01652 ir= n; 01653 for (;;) { 01654 if (l > 1) { 01655 rra=ra[--l]; 01656 rra2=ra2[l]; 01657 } else { 01658 rra=ra[ir]; 01659 rra2=ra2[ir]; 01660 ra[ir]=ra[1]; 01661 ra2[ir]=ra2[1]; 01662 if (--ir == 1) { 01663 ra[1]=rra; 01664 ra2[1]=rra2; 01665 goto end; 01666 } 01667 } 01668 i=l; 01669 j=l << 1; 01670 while (j <= ir) { 01671 if (j < ir && ra[j] < ra[j+1]) ++j; 01672 if (rra < ra[j]) { 01673 ra[i]=ra[j]; 01674 ra2[i]=ra2[j]; 01675 j += (i=j); 01676 } else j=ir+1; 01677 } 01678 ra[i]=rra; 01679 ra2[i]=rra2; 01680 } 01681 } 01682 end:; 01683 } 01684 01687 template<class T1, class T2, class T3> void idx_sortup(idx<T1> &m, idx<T2> &m2, 01688 idx<T3> &m3) { 01689 idx_checkorder3(m, 1, m2, 1, m3, 1); 01690 idx_check_contiguous3(m, m2, m3); 01691 idx_checknelems3_all(m, m2, m3); // they must have same number of elements 01692 intg n = m.dim(0); 01693 if (n > 1) { 01694 int l,j,ir,i; 01695 T1 *ra, rra; 01696 T2 *ra2, rra2; 01697 T3 *ra3, rra3; 01698 01699 ra = m.idx_ptr() - 1; 01700 ra2 = m2.idx_ptr() - 1; 01701 ra3 = m3.idx_ptr() - 1; 01702 l=(n >> 1)+1; 01703 ir = n; 01704 for (;;) { 01705 if (l > 1) { 01706 rra=ra[--l]; 01707 rra2=ra2[l]; 01708 rra3=ra3[l]; 01709 } else { 01710 rra=ra[ir]; 01711 rra2=ra2[ir]; 01712 rra3=ra3[ir]; 01713 ra[ir]=ra[1]; 01714 ra2[ir]=ra2[1]; 01715 ra3[ir]=ra3[1]; 01716 if (--ir == 1) { 01717 ra[1]=rra; 01718 ra2[1]=rra2; 01719 ra3[1]=rra3; 01720 goto end; 01721 } 01722 } 01723 i=l; 01724 j=l << 1; 01725 while (j <= ir) { 01726 if (j < ir && ra[j] < ra[j+1]) ++j; 01727 if (rra < ra[j]) { 01728 ra[i]=ra[j]; 01729 ra2[i]=ra2[j]; 01730 ra3[i]=ra3[j]; 01731 j += (i=j); 01732 } else j=ir+1; 01733 } 01734 ra[i]=rra; 01735 ra2[i]=rra2; 01736 ra3[i]=rra3; 01737 } 01738 } 01739 end:; 01740 } 01741 01742 template<class T> void idx_sortdown(idx<T> &m) { 01743 idx_checkorder1(m, 1); 01744 idx_check_contiguous1(m); 01745 intg n = m.dim(0); 01746 if (n > 1) { 01747 int l,j,ir,i; 01748 T *ra, rra; 01749 01750 ra = m.idx_ptr() - 1; 01751 l=(n >> 1)+1; 01752 ir= n; 01753 for (;;) { 01754 if (l > 1) 01755 rra=ra[--l]; 01756 else { 01757 rra=ra[ir]; 01758 ra[ir]=ra[1]; 01759 if (--ir == 1) { 01760 ra[1]=rra; 01761 goto end; }} 01762 i=l; 01763 j=l << 1; 01764 while (j <= ir) { 01765 if (j < ir && ra[j] > ra[j+1]) ++j; 01766 if (rra > ra[j]) { 01767 ra[i]=ra[j]; 01768 j += (i=j); 01769 } else j=ir+1; } 01770 ra[i]=rra; 01771 } 01772 } 01773 end:; 01774 } 01775 01777 // idx_sqrdist 01778 01779 template<class T> float64 idx_sqrdist(idx<T> &i1, idx<T> &i2) { 01780 idx_checknelems2_all(i1, i2); 01781 float64 z = 0; 01782 float64 tmp; 01783 idx_aloopf2(pi1, i1, T, pi2, i2, T, { 01784 tmp = (float64)(*pi1) - (float64)(*pi2); 01785 z += tmp * tmp; 01786 }); 01787 return z; 01788 } 01789 01791 // idx_sqrdist (with idx out) 01792 01793 template<class T> void idx_sqrdist(idx<T> &i1, idx<T> &i2, idx<T> &out) { 01794 idx_checknelems2_all(i1, i2); 01795 if (out.order() != 0) eblerror("idx_sqrdist: expecting an idx of order 0"); 01796 out.set((T)idx_sqrdist(i1, i2)); 01797 } 01798 01800 // idx_spherical_sqrdist 01801 01802 template<class T> float64 idx_spherical_sqrdist(idx<T> &i1, idx<T> &i2) { 01803 idx_checknelems2_all(i1, i2); 01804 float64 z = 0; 01805 float64 tmp, tmpabs, tmpabs2; 01806 idx_aloopf2(pi1, i1, T, pi2, i2, T, { 01807 tmp = fmod((float64)(*pi1) - (float64)(*pi2), TWOPI); 01808 tmpabs = fabs(tmp); 01809 tmpabs2 = TWOPI - tmpabs; 01810 if (tmpabs > tmpabs2) 01811 tmp = tmpabs2; 01812 z += tmp * tmp; 01813 }); 01814 return z; 01815 } 01816 01818 // idx_gaussian 01819 01820 template <typename T> 01821 void idx_gaussian(idx<T> &in, double m, double sigma, idx<T> &out) { 01822 idx_aloopf2(i, in, T, o, out, T, { 01823 *o = (T) gaussian((double) (*i), m, sigma); 01824 }); 01825 } 01826 01828 // idx_modulo 01829 01830 template <class T> void idx_modulo(idx<T> &m, T mod) { 01831 idx_aloopf1(e, m, T, { *e = *e % mod; }); 01832 } 01833 01835 // idx_exp 01836 01837 template <class T> void idx_exp(idx<T> &m) { 01838 // note: we use float32 instead of 64 because float64 has its own 01839 // specialized template, therefore, calls to this generic template 01840 // will be with types of lower precision than float32, no need for 01841 // float64 precision. 01842 idx_aloopf1(i, m, T, { *i = saturate(exp((float32)*i), T); }); 01843 } 01844 01846 // idx_log 01847 01848 template <class T> void idx_log(idx<T> &m) { 01849 #if USING_FAST_ITERS == 0 01850 idx_aloop1(i, m, T) { 01851 *i = log(*i); 01852 }; 01853 #else 01854 idx_aloopf1(i, m, T, { *i = (T)log(*i); }); 01855 #endif 01856 } 01857 01859 // idx_dot 01860 01861 template <class T> T idx_dot(idx<T> & in1, idx<T> & in2) { 01862 T z = 0; 01863 #if USING_FAST_ITERS == 0 01864 { idx_aloop2(pi1, in1, T, pi2, in2, T) { 01865 z += *pi1 * *pi2; 01866 } 01867 } 01868 #else 01869 idx_aloopf2(pi1, in1, T, pi2, in2, T, { 01870 z += *pi1 * *pi2; 01871 }); 01872 #endif 01873 return z; 01874 } 01875 01876 // TODO: this doesn't compile on newest gcc 01877 // template <class T> float64 idx_dot(idx<T> & in1, idx<T> & in2) { 01878 // float64 z = 0; 01879 // #if USING_FAST_ITERS == 0 01880 // { idx_aloop2(pi1, in1, T, pi2, in2, T) { 01881 // z += ((float64)(*pi1)) * ((float64)(*pi2)); 01882 // } 01883 // } 01884 // #else 01885 // idx_aloopf2(pi1, in1, T, pi2, in2, T, { 01886 // z += ((float64)(*pi1)) * ((float64)(*pi2)); 01887 // }); 01888 // #endif 01889 // return z; 01890 // } 01891 01893 // idx_dotacc 01894 01895 template<class T> void idx_dotacc(idx<T> &i1, idx<T> &i2, idx<T> &o) { 01896 if (o.order() != 0) 01897 eblerror("output idx should have order 0"); 01898 o.set(o.get() + (T) idx_dot(i1, i2)); 01899 } 01900 01902 // idx_clip 01903 01904 template<class T> void idx_clip(idx<T> &i1, T m, idx<T> &o1) { 01905 idx_checknelems2_all(i1, o1); 01906 #if USING_FAST_ITERS == 0 01907 { idx_aloop2(i, i1, T, o, o1, T) { 01908 *o = (std::max)(m, *i); 01909 }} 01910 #else 01911 idx_aloopf2(i, i1, T, o, o1, T, { *o = std::max(m, *i); }); 01912 #endif 01913 } 01914 01916 // idx_2dconvol 01917 01918 template<class T> void idx_2dconvol(idx<T> &in, idx<T> &kernel, idx<T> &out) { 01919 idx_checkorder3(in, 2, kernel, 2, out, 2); 01920 idx<T> uin(in.unfold(0, kernel.dim(0), 1)); 01921 uin = uin.unfold(1, kernel.dim(1), 1); 01922 idx_m4dotm2(uin, kernel, out); 01923 } 01924 01925 template<class T> void idx_3dconvol(idx<T> &in, idx<T> &kernel, idx<T> &out) { 01926 idx_bloop2(i, in, T, o, out, T) { 01927 idx_2dconvol(i, kernel, o); 01928 } 01929 } 01930 01931 template <typename T> 01932 void idx_m1extm1(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01933 T *c2, *d1, *c1, *c2_0, *d1_0; 01934 intg c2_m0 = i2.mod(0), d1_m1 = o1.mod(1); 01935 intg c1_m0 = i1.mod(0), d1_m0 = o1.mod(0); 01936 intg j, jmax = o1.dim(1); 01937 intg i, imax = o1.dim(0); 01938 c1 = i1.idx_ptr(); 01939 c2_0 = i2.idx_ptr(); 01940 d1_0 = o1.idx_ptr(); 01941 for (i=0; i<imax; i++) { 01942 d1 = d1_0; 01943 c2 = c2_0; 01944 for (j=0; j<jmax; j++) { 01945 *d1 = (*c1)*(*c2); 01946 d1 += d1_m1; 01947 c2 += c2_m0; 01948 } 01949 d1_0 += d1_m0; 01950 c1 += c1_m0; 01951 } 01952 } 01953 01954 template <typename T> 01955 void idx_m1extm1acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) { 01956 T *c2, *d1, *c1, *c2_0, *d1_0; 01957 intg c2_m0 = i2.mod(0), d1_m1 = o1.mod(1); 01958 intg c1_m0 = i1.mod(0), d1_m0 = o1.mod(0); 01959 intg j, jmax = o1.dim(1); 01960 intg i, imax = o1.dim(0); 01961 c1 = i1.idx_ptr(); 01962 c2_0 = i2.idx_ptr(); 01963 d1_0 = o1.idx_ptr(); 01964 for (i=0; i<imax; i++) { 01965 d1 = d1_0; 01966 c2 = c2_0; 01967 for (j=0; j<jmax; j++) { 01968 *d1 += (*c1)*(*c2); 01969 d1 += d1_m1; 01970 c2 += c2_m0; 01971 } 01972 d1_0 += d1_m0; 01973 c1 += c1_m0; 01974 } 01975 } 01976 01977 template <typename T> 01978 void idx_m2squdotm1(idx<T> &a, idx<T> &x, idx<T> &y) { 01979 eblerror("not implemented"); 01980 } 01981 01982 template <typename T> 01983 void idx_m2squdotm1acc(idx<T> &a, idx<T> &x, idx<T> &y) { 01984 eblerror("not implemented"); 01985 } 01986 01987 template <typename T> 01988 void idx_m1squextm1(idx<T> &x, idx<T> &y, idx<T> &a) { 01989 eblerror("not implemented"); 01990 } 01991 01992 template <typename T> 01993 void idx_m1squextm1acc(idx<T> &x, idx<T> &y, idx<T> &a) { 01994 eblerror("not implemented"); 01995 } 01996 01997 template <typename T> 01998 T idx_trace(idx<T> &m2) { 01999 idx_checkorder1(m2, 2); // must be 2D 02000 if (m2.dim(0) != m2.dim(1)) 02001 eblerror("Expected a square matrix, but found: " << m2); 02002 T sum = 0; 02003 for (uint i = 0; i < m2.dim(0); ++i) 02004 sum += m2.get(i, i); 02005 return sum; 02006 } 02007 02009 // concatenation 02010 02011 template <typename T> 02012 idx<T> idx_concat(idx<T> &m1, idx<T> &m2, intg dim) { 02013 idxdim d(m1); 02014 // m1 and m2 must have the same order 02015 idx_checkorder2(m1, d.order(), m2, d.order()); 02016 // check that all dimensions have the same size except for 02017 for (intg i = 0; i < d.order(); ++i) { 02018 if (i != dim && m1.dim(i) != m2.dim(i)) 02019 eblerror("expected same dimensions except for dim " << dim 02020 << " but found differences: " << m1 << " and " << m2); 02021 } 02022 // allocate and copy 02023 d.setdim(dim, m1.dim(dim) + m2.dim(dim)); 02024 idx<T> m3(d); 02025 idx<T> tmp = m3.narrow(dim, m1.dim(dim), 0); 02026 idx_copy(m1, tmp); 02027 tmp = m3.narrow(dim, m2.dim(dim), m1.dim(dim)); 02028 idx_copy(m2, tmp); 02029 return m3; 02030 } 02031 02032 // randomization //////////////////////////////////////////////////////////// 02033 02034 template <typename T> 02035 void idx_random(idx<T> &m, double v0, double v1) { 02036 idx_aloopf1(mm, m, T, { *mm = (T) drand(v0, v1); }); 02037 } 02038 02039 template <typename T> 02040 void idx_random(idx<T> &m, double v) { 02041 idx_aloopf1(mm, m, T, { *mm = (T) drand(v); }); 02042 } 02043 02044 } // end namespace ebl 02045 02046 /* 02047 #ifdef __OPENMP__ 02048 #include "idxops_openmp.hpp" 02049 #endif 02050 */ 02051 02052 #endif /* IDXOPS_HPP */