libidx
/home/rex/ebltrunk/core/libidx/include/idxops.hpp
00001 /***************************************************************************
00002  *   Copyright (C) 2008 by Yann LeCun and Pierre Sermanet *
00003  *   yann@cs.nyu.edu, pierre.sermanet@gmail.com *
00004  *   All rights reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions are met:
00008  *     * Redistributions of source code must retain the above copyright
00009  *       notice, this list of conditions and the following disclaimer.
00010  *     * Redistributions in binary form must reproduce the above copyright
00011  *       notice, this list of conditions and the following disclaimer in the
00012  *       documentation and/or other materials provided with the distribution.
00013  *     * Redistribution under a license not approved by the Open Source
00014  *       Initiative (http://www.opensource.org) must display the
00015  *       following acknowledgement in all advertising material:
00016  *        This product includes software developed at the Courant
00017  *        Institute of Mathematical Sciences (http://cims.nyu.edu).
00018  *     * The names of the authors may not be used to endorse or promote products
00019  *       derived from this software without specific prior written permission.
00020  *
00021  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
00022  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
00023  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00024  * DISCLAIMED. IN NO EVENT SHALL ThE AUTHORS BE LIABLE FOR ANY
00025  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00026  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00027  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00028  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00029  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00030  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00031  ***************************************************************************/
00032 
00033 #ifndef IDXOPS_HPP
00034 #define IDXOPS_HPP
00035 
00036 using namespace std;
00037 
00038 namespace ebl {
00039 
00041   // idx_copy
00042 
00043   // generic copy for two different types.
00044   //template<class T1, class T2> void idx_copy(idx<T1> &src, idx<T2> &dst) {
00048   //  {idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); }}
00049   //}
00050 
00052   //template<class T1, class T2> void idx_copy(idx<T1> &src, idx<T2> &dst) {
00056   //  {idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); }}
00057   //}
00058 
00059 
00060   // generic copy for the same type.
00061   template<class T> void idx_copy(const idx<T> &src, idx<T> &dst) {
00062     idx_checknelems2_all(src, dst);
00063     // loop and copy
00064     if (src.order() == 0 && dst.order() == 0)
00065       *(dst.idx_ptr()) = *(src.idx_ptr());
00066     else if (src.contiguousp() && dst.contiguousp()) {
00067       /* they are both contiguous: call the stride 1 routine */
00068       memcpy(dst.idx_ptr(), src.idx_ptr(), src.nelements() * sizeof(T));
00069     } else {
00070       // else, they don't have the same structure: do it "by hand".
00071       // This is slower
00072       {idx_aloop2(isrc, src, T, idst, dst, T) { *idst = *isrc; }}
00073     }
00074   }
00075 
00076   template<class T1, class T2> void idx_copy(const idx<T1> &src, idx<T2> &dst){
00077     idx_checknelems2_all(src, dst);
00078     // loop and copy
00079     idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); }
00080   }
00081 
00082   template<class T1, class T2> idx<T1> idx_copy(const idx<T2> &src){
00083     idx<T1> dst(src.get_idxdim());
00084     idx_copy(src, dst);
00085     return dst;
00086   }
00087 
00088   template<class T> idx<T> idx_copy(const idx<T> &src){
00089     idx<T> dst(src.get_idxdim());
00090     idx_copy(src, dst);
00091     return dst;
00092   }
00093 
00094   template<class T1, class T2>
00095   void idx_copy_clip(const idx<T1> &src, idx<T2> &dst) {
00096     idx_checknelems2_all(src, dst);
00097     // loop and copy
00098     idx_aloop2(isrc, src, T1, idst, dst, T2) {
00099       *idst = saturate(*isrc, T2);
00100     }
00101   }
00102 
00104   // idx_clear
00105 
00106   template<class T> void idx_clear(idx<T> &inp) {
00107     if (inp.contiguousp())
00108       memset(inp.idx_ptr(), 0, inp.nelements() * sizeof (T));
00109     else
00110       idx_aloopf1(pinp, inp, T, { *pinp = 0; });
00111   }
00112 
00114   // idx_fill
00115 
00116   template<class T, class T2> void idx_fill(idx<T> & inp, T2 v) {
00117     idx_fill(inp, saturate(v, T));
00118   }
00119 
00120   template<class T> void idx_fill(idx<T> &inp, T v) {
00121     idx_aloopf1(it, inp, T, {*it = v;})
00122   }
00123 
00124   template<class T> void idx_fill_index(idx<T> &inp) {
00125     T i = 0;
00126     idx_aloopf1(it, inp, T, {*it = i++;})
00127   }
00128 
00130   // idx_shuffle_*
00131 
00132   // TODO: can n random swaps be as random? (it would be more efficient)
00133   template<class T> void idx_shuffle(idx<T> &in_, intg d, idx<T> *out_) {
00134     if (!drand_ini)
00135       cerr << "Warning: random not initialized, call dynamic_init_drand()"
00136            << endl;
00137     // if out exists, use it for output, otherwise create a temporary buffer
00138     // and put output back into in.
00139     idx<T> in, out;
00140     if (out_) { // use in_ as input and out_ as output
00141       // check that in_ and out_ are different
00142       if (&in_ == out_)
00143         eblerror("input and output idx should be different");
00144       idx_checknelems2_all(in_, *out_);
00145       in = in_;
00146       out = *out_;
00147     } else { // otherwise, use in_ as output and a copy of _in as input
00148       idxdim indims(in_);
00149       in = idx<T>(indims);
00150       idx_copy(in_, in);
00151       out = in_;
00152     }
00153     // for each element of in, put it randomly in out.
00154     // if there is a collision, loop until the next available slot
00155     idx<T> tmpi, tmpo;
00156     idx<bool> assigned(in.dim(d));
00157     idx_fill(assigned, false);
00158     intg pos;
00159     for (intg i = 0; i < in.dim(d); ++i) {
00160       pos = (intg) drand(0, in.dim(d) - 1);
00161       if (assigned.get(pos)) { // if already assigned, loop until free slot
00162         for (intg j = pos + 1; j != pos; ++j) {
00163           if (j >= in.dim(d)) j = 0;
00164           if (j == pos) eblerror("idx_shuffle: no available slot");
00165           if (!assigned.get(j)) {
00166             pos = j;
00167             break ;
00168           }
00169         }
00170       }
00171       // copy ith element of in into pos^th element of out
00172       tmpi = in.select(d, i);
00173       tmpo = out.select(d, pos);
00174       idx_copy(tmpi, tmpo);
00175       assigned.set(true, pos);
00176     }
00177   }
00178 
00179   // TODO: can n random swaps be as random? (it would be more efficient)
00180   template<class T1, class T2>
00181   void idx_shuffle_together(idx<T1> &in1_, idx<T2> &in2_, intg d,
00182                             idx<T1> *out1_, idx<T2> *out2_) {
00183     if (!drand_ini)
00184       cerr << "Warning: random not initialized, call dynamic_init_drand()"
00185            << endl;
00186     idx_checkdim2_all(in1_, in2_, d); // size of dim d must match of in1 and in2
00187     // if out exists, use it for output, otherwise create a temporary buffer
00188     // and put output back into in.
00189     idx<T1> in1, out1;
00190     idx<T2> in2, out2;
00191     if (out1_) { // use in_ as input and out_ as output
00192       // check that in_ and out_ are different
00193       if (&in1_ == out1_)
00194         eblerror("input and output idx should be different");
00195       idx_checknelems2_all(in1_, *out1_);
00196       in1 = in1_;
00197       out1 = *out1_;
00198     } else { // otherwise, use in_ as output and a copy of _in as input
00199       idxdim indims(in1_);
00200       in1 = idx<T1>(indims);
00201       idx_copy(in1_, in1);
00202       out1 = in1_;
00203     }
00204     if (out2_) { // use in_ as input and out_ as output
00205       if (&in2_ == out2_)
00206         eblerror("input and output idx should be different");
00207       idx_checknelems2_all(in2_, *out2_);
00208       in2 = in2_;
00209       out2 = *out2_;
00210     } else { // otherwise, use in_ as output and a copy of _in as input
00211       idxdim indims(in2_);
00212       in2 = idx<T2>(indims);
00213       idx_copy(in2_, in2);
00214       out2 = in2_;
00215     }
00216     // for each element of in, put it randomly in out.
00217     // if there is a collision, loop until the next available slot
00218     idx<T1> tmpi1, tmpo1;
00219     idx<T2> tmpi2, tmpo2;
00220     idx<bool> assigned(in1.dim(d));
00221     idx_fill(assigned, false);
00222     intg pos;
00223     for (intg i = 0; i < in1.dim(d); ++i) {
00224       pos = (intg) drand(0, in1.dim(d) - 1);
00225       if (assigned.get(pos)) { // if already assigned, loop until free slot
00226         for (intg j = pos + 1; j != pos; ++j) {
00227           if (j >= in1.dim(d)) j = 0;
00228           if (j == pos) eblerror("no available slot");
00229           if (!assigned.get(j)) {
00230             pos = j;
00231             break ;
00232           }
00233         }
00234       }
00235       // copy ith element of in into pos^th element of out
00236       tmpi1 = in1.select(d, i);
00237       tmpo1 = out1.select(d, pos);
00238       idx_copy(tmpi1, tmpo1);
00239       tmpi2 = in2.select(d, i);
00240       tmpo2 = out2.select(d, pos);
00241       idx_copy(tmpi2, tmpo2);
00242       assigned.set(true, pos);
00243     }
00244   }
00245 
00246   // TODO: can n random swaps be as random? (it would be more efficient)
00247   template<class T1, class T2, class T3>
00248   void idx_shuffle_together(idx<T1> &in1_, idx<T2> &in2_, idx<T3> &in3_,
00249                             intg d,
00250                             idx<T1> *out1_, idx<T2> *out2_, idx<T3> *out3_) {
00251     if (!drand_ini)
00252       cerr << "Warning: random not initialized, call dynamic_init_drand()"
00253            << endl;
00254     // size of dim d must match of in1 and in2 and in3
00255     idx_checkdim3_all(in1_, in2_, in3_, d);
00256     // if out exists, use it for output, otherwise create a temporary buffer
00257     // and put output back into in.
00258     idx<T1> in1, out1;
00259     idx<T2> in2, out2;
00260     idx<T3> in3, out3;
00261     if (out1_) { // use in_ as input and out_ as output
00262       if (&in1_ == out1_)
00263         eblerror("input and output idx should be different");
00264       idx_checknelems2_all(in1_, *out1_);
00265       in1 = in1_;
00266       out1 = *out1_;
00267     } else { // otherwise, use in_ as output and a copy of _in as input
00268       idxdim indims(in1_);
00269       in1 = idx<T1>(indims);
00270       idx_copy(in1_, in1);
00271       out1 = in1_;
00272     }
00273     if (out2_) { // use in_ as input and out_ as output
00274       if (&in2_ == out2_)
00275         eblerror("input and output idx should be different");
00276       idx_checknelems2_all(in2_, *out2_);
00277       in2 = in2_;
00278       out2 = *out2_;
00279     } else { // otherwise, use in_ as output and a copy of _in as input
00280       idxdim indims(in2_);
00281       in2 = idx<T2>(indims);
00282       idx_copy(in2_, in2);
00283       out2 = in2_;
00284     }
00285     if (out3_) { // use in_ as input and out_ as output
00286       if (&in3_ == out3_)
00287         eblerror("input and output idx should be different");
00288       idx_checknelems2_all(in3_, *out3_);
00289       in3 = in3_;
00290       out3 = *out3_;
00291     } else { // otherwise, use in_ as output and a copy of _in as input
00292       idxdim indims(in3_);
00293       in3 = idx<T3>(indims);
00294       idx_copy(in3_, in3);
00295       out3 = in3_;
00296     }
00297     // for each element of in, put it randomly in out.
00298     // if there is a collision, loop until the next available slot
00299     idx<T1> tmpi1, tmpo1;
00300     idx<T2> tmpi2, tmpo2;
00301     idx<T3> tmpi3, tmpo3;
00302     idx<bool> assigned(in1.dim(d));
00303     idx_fill(assigned, false);
00304     intg pos;
00305     for (intg i = 0; i < in1.dim(d); ++i) {
00306       pos = (intg) drand(0, in1.dim(d) - 1);
00307       if (assigned.get(pos)) { // if already assigned, loop until free slot
00308         for (intg j = pos + 1; j != pos; ++j) {
00309           if (j >= in1.dim(d)) j = 0;
00310           if (j == pos) eblerror("no available slot");
00311           if (!assigned.get(j)) {
00312             pos = j;
00313             break ;
00314           }
00315         }
00316       }
00317       // copy ith element of in into pos^th element of out
00318       tmpi1 = in1.select(d, i);
00319       tmpo1 = out1.select(d, pos);
00320       idx_copy(tmpi1, tmpo1);
00321       tmpi2 = in2.select(d, i);
00322       tmpo2 = out2.select(d, pos);
00323       idx_copy(tmpi2, tmpo2);
00324       tmpi3 = in3.select(d, i);
00325       tmpo3 = out3.select(d, pos);
00326       idx_copy(tmpi3, tmpo3);
00327       assigned.set(true, pos);
00328     }
00329   }
00330 
00332   // idx_minus
00333 
00334   template<class T> void idx_minus(idx<T> &inp, idx<T> &out) {
00335     idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = - *pinp; });
00336   }
00337 
00339   // idx_minus_acc
00340 
00341   template<class T> void idx_minus_acc(idx<T> &inp, idx<T> &out) {
00342     idx_aloopf2(pinp, inp, T, pout, out, T, { *pout += - *pinp; });
00343   }
00344 
00346   // idx_inv
00347 
00348   template<class T> void idx_inv(idx<T> &inp, idx<T> &out) {
00349 #ifdef __DEBUG__
00350     idx_aloopf2(pinp, inp, T, pout, out, T, {
00351         if (*pinp == 0) eblerror("division by zero");
00352         *pout = 1 / *pinp;
00353       });
00354 #else
00355     idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = 1 / *pinp; });
00356 #endif
00357   }
00358 
00360   // idx_add
00361 
00362   template<class T> void idx_add(idx<T> &in, idx<T> &out) {
00363     idx_aloopf2 (pin, in, T, pout, out, T, { *pout = *pout + *pin; });
00364   }
00365 
00366   template<typename T> void idx_add(idx<T> &i1, idx<T> &i2, idx<T> &out) {
00367     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, { *pout = *pi1 + *pi2; });
00368   }
00369 
00371   // idx_sub
00372 
00373   template<class T> void idx_sub(idx<T> &i1, idx<T> &i2) {
00374     idx_aloopf2(pi1, i1, T, pi2, i2, T, { *pi1 = *pi1 - *pi2; });
00375   }
00376 
00377   template<class T> void idx_sub(idx<T> &i1, idx<T> &i2, idx<T> &out) {
00378     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {*pout = *pi1 - *pi2; });
00379   }
00380 
00381   template<class T> void idx_subacc(idx<T> &i1, idx<T> &i2, idx<T> &out) {
00382     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {*pout += *pi1 - *pi2; });
00383   }
00384 
00386   // idx_spherical_sub
00387 
00388   template<class T> void idx_spherical_sub(idx<T> &i1, idx<T> &i2, idx<T> &out){
00389     T tmp, tmpabs, tmpabs2;
00390     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {
00391         tmp = *pi1 - *pi2;
00392         tmpabs = abs(tmp);
00393         tmpabs2 = TWOPI - tmpabs;
00394         if (tmpabs > tmpabs2) {
00395           if (tmp < 0)
00396             tmp = tmpabs2;
00397           else
00398             tmp = -tmpabs2;
00399         }
00400         *pout = tmp;
00401       });
00402   }
00403 
00405   // idx_spherical_add
00406 
00407   template<class T> void idx_spherical_add(idx<T> &i1, idx<T> &i2, idx<T> &out){
00408     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {
00409         *pout = std::min(TWOPI - *pi1, *pi1) + std::min(TWOPI - *pi2, - *pi2);
00410       });
00411   }
00412 
00414   // idx_mul
00415 
00416   template<class T> void idx_mul(idx<T> &i1, idx<T> &i2, idx<T> &out) {
00417     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T,
00418                 {*pout = (*pi1) * (*pi2); });
00419   }
00420 
00421   template<class T> void idx_mulacc(idx<T> &i1, idx<T> &i2, idx<T> &out) {
00422     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T,
00423                 {*pout += (*pi1) * (*pi2); });
00424   }
00425 
00427   // idx_div
00428 
00429   template<class T> void idx_div(idx<T> &i1, idx<T> &i2, idx<T> &out) {
00430     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {
00431       *pout = (*pi1) / (*pi2);
00432     });
00433     // since we cannot put a ifdef in a macro, we repeat the code here
00434 #ifdef __DEBUG__
00435     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {
00436         if (*pi2 == 0)
00437           eblerror("division by zero");
00438       *pout = (*pi1) / (*pi2);
00439     });
00440 #endif
00441   }
00442 
00444   // idx_addc
00445 
00446   template<class T, class T2> void idx_addc(idx<T> & inp, T2 c, idx<T> & out) {
00447     idx_addc(inp, saturate(c, T), out);
00448   }
00449 
00450   template<class T> void idx_addc(idx<T> &inp, T c, idx<T> &out) {
00451     idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = *pinp + c; });
00452   }
00453 
00454   template<class T> void idx_addc(idx<T> &inp, T c) {
00455     idx_aloopf1(pinp, inp, T, { *pinp += c; });
00456   }
00457 
00459   // idx_addc_bounded
00460 
00461   template<class T, class T2>
00462   void idx_addc_bounded(idx<T> & inp, T2 c, idx<T> & out) {
00463     idx_addc_bounded(inp, saturate(c, T), out);
00464   }
00465 
00466 
00467   template<class T> void idx_addc_bounded(idx<T> &inp, T c, idx<T> &out) {
00468     idx_aloopf2(pinp, inp, T, pout, out, T, {
00469         *pout = saturate(*pinp + c, T);
00470       });
00471   }
00472 
00474   // idx_subc_bounded
00475 
00476   template<class T, class T2>
00477   void idx_subc_bounded(idx<T> &inp, T2 c, idx<T> &out) {
00478     idx_subc_bounded(inp, saturate(c, T), out);
00479   }
00480 
00481 
00482   template<class T> void idx_subc_bounded(idx<T> &inp, T c, idx<T> &out) {
00483     idx_aloopf2(pinp, inp, T, pout, out, T, {
00484         *pout = saturate(*pinp - c, T);
00485       });
00486   }
00487 
00489   // idx_addcacc
00490 
00491   template<class T, class T2>
00492   void idx_addcacc(idx<T> & inp, T2 c, idx<T> & out) {
00493     idx_addcacc(inp, saturate(c, T), out);
00494   }
00495 
00496   template<class T> void idx_addcacc(idx<T> &inp, T c, idx<T> &out) {
00497     idx_aloopf2(pinp, inp, T, pout, out, T, { *pout += *pinp + c; });
00498   }
00499 
00501   // idx_dotc
00502 
00503   template<class T, class T2> void idx_dotc(idx<T> &inp, T2 c, idx<T> &out) {
00504     idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = (T)(*pinp * c); });
00505   }
00506 
00508   // idx_dotc_bounded
00509 
00510   template<class T, class T2>
00511   void idx_dotc_bounded(idx<T> &inp, T2 c, idx<T> &out) {
00512     idx_aloopf2(pinp, inp, T, pout, out, T, {
00513         *pout = saturate(*pinp * c, T);
00514       });
00515   }
00516 
00518   // idx_dotcacc
00519 
00520   template<class T, class T2> void idx_dotcacc(idx<T> &inp, T2 c, idx<T> &out) {
00521     idx_aloopf2(pinp, inp, T, pout, out, T, { *pout += (T)(*pinp * c); });
00522   }
00523 
00525   // idx_signdotc
00526 
00527   template<class T, class T2>
00528   void idx_signdotc(idx<T> &inp, T2 c, idx<T> &out) {
00529     idx_digndotc(inp, saturate(c, T), out);
00530   }
00531 
00532   template<class T> void idx_signdotc(idx<T> &inp, T c, idx<T> &out) {
00533     idx_aloopf2(pinp, inp, T, pout, out, T, {
00534         *pout = (*pinp < 0) ? -c : c;
00535       });
00536   }
00537 
00539   // idx_signdotcacc
00540 
00541   template<class T, class T2>
00542   void idx_signdotcacc(idx<T> &inp, T2 c, idx<T> &out) {
00543     idx_digndotcacc(inp, saturate(c, T), out);
00544   }
00545 
00546   template<class T> void idx_signdotcacc(idx<T> &inp, T c, idx<T> &out) {
00547     idx_aloopf2(pinp, inp, T, pout, out, T, {
00548         *pout += (*pinp < 0) ? -c : c;
00549       });
00550   }
00551 
00553   // idx_subsquare
00554 
00555   template<class T> void idx_subsquare(idx<T> &i1, idx<T> &i2, idx<T> &out) {
00556     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {
00557         T d = *pi1 - *pi2;
00558         *pout = d*d;
00559       });
00560   }
00561 
00562   template<class T> void idx_subsquareacc(idx<T> &i1, idx<T> &i2, idx<T> &out) {
00563     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {
00564         T d = *pi1 - *pi2;
00565         *pout += d*d;
00566       });
00567   }
00568 
00569   // idx_lincom ////////////////////////////////////////////////////////////////
00570 
00571   // not very efficient. There must be a more parallel way of doing this
00572   template<class T>
00573   void idx_lincomb(idx<T> &i1, T k1, idx<T> &i2, T k2, idx<T> &out) {
00574     idx_aloopf3(pi1, i1, T, pi2, i2, T, pout, out, T, {
00575       *pout = k1 * (*pi1) + k2 * (*pi2);
00576     });
00577   }
00578 
00579   // idx_tanh //////////////////////////////////////////////////////////////////
00580 
00581   template<class T> void idx_tanh(idx<T> &inp, idx<T> &out) {
00582     idx_checknelems2_all(inp, out);
00583 #ifdef __OPENMP__
00584     T* inptr = inp.idx_ptr();
00585     T* outptr = out.idx_ptr();
00586 #pragma omp parallel for
00587     for(int i=0; i< inp.nelements(); ++i) {
00588       outptr[i] = (T)(tanh((double)inptr[i]));
00589     }
00590 #else
00591     idx_aloopf2(pinp, inp, T, pout, out, T, {
00592         *pout = (T)(tanh((double)*pinp));
00593       });
00594 #endif
00595   }
00596 
00597   // idx_dtanh /////////////////////////////////////////////////////////////////
00598 
00599   template<class T> void idx_dtanh(idx<T> &inp, idx<T> &out) {
00600     idx_checknelems2_all(inp, out);
00601     idx_aloopf2(pinp, inp, T, pout, out, T, {
00602         *pout = (T) (dtanh((double)*pinp));
00603       });
00604   }
00605 
00606   // idx_stdsigmoid ////////////////////////////////////////////////////////////
00607 
00608   template<class T> void idx_stdsigmoid(idx<T> &inp, idx<T> &out) {
00609     idx_checknelems2_all(inp, out);
00610     idx_aloopf2(pinp, inp, T, pout, out, T, {
00611         *pout = (T) (stdsigmoid((double)*pinp));
00612       });
00613   }
00614 
00615   // idx_dstdsigmoid ///////////////////////////////////////////////////////////
00616 
00617   template<class T> void idx_dstdsigmoid(idx<T> &inp, idx<T> &out) {
00618     idx_checknelems2_all(inp, out);
00619     idx_aloopf2(pinp, inp, T, pout, out, T, {
00620         *pout = (T) (dstdsigmoid((double)*pinp));
00621       });
00622   }
00623 
00624   // idx_abs ///////////////////////////////////////////////////////////////////
00625 
00626   template<class T> inline T abs2 (T a) {
00627     return (T)abs((float64)a);
00628   }
00629   template<> inline float abs2 (float a) {
00630     return fabs(a);
00631   }
00632   template<> inline byte abs2 (byte a) {
00633     if (a >= 0)
00634       return a;
00635     return (a == -128) ? 127 : -a;
00636   }
00637   template<> inline int16 abs2 (int16 a) {
00638     if (a >= 0)
00639       return a;
00640     return (a == -32768) ? 32767 : -a;
00641   }
00642 
00643   template<class T> void idx_abs(idx<T>& inp, idx<T>& out) {
00644     idx_aloopf2(pinp, inp, T, pout, out, T, { *pout = abs2<T>(*pinp); });
00645   }
00646 
00647 
00649   // idx_threshold_acc
00650 
00651   template<class T, class T2, class T3>
00652   void idx_thresdotc_acc(idx<T>& in, T2 c, T3 th, idx<T>& out) {
00653     idx_thresdotc_acc(in, saturate(c, T), saturate(th, T), out);
00654   }
00655 
00656   template<class T> void idx_thresdotc_acc(idx<T>& in, T c, T th, idx<T>& out) {
00657     idx_aloopf2(pin, in, T, pout, out, T, {
00658       *pout += (*pin < -th) ? -c : (*pin > th) ? c : 0;
00659     });
00660   }
00661 
00663   // idx_threshold (in-place)
00664 
00665   template<class T, class T2> void idx_threshold(idx<T> & in, T2 th) {
00666     idx_threshold(in, saturate(th, T));
00667   }
00668 
00669   template<class T> void idx_threshold(idx<T>& in, T th) {
00670     idx_aloopf1(pin, in, T, { if (*pin < th) *pin = th; });
00671   }
00672 
00673   template<class T> void idx_threshold2(idx<T>& in, T th) {
00674     idx_aloopf1(pin, in, T, { if (*pin > th) *pin = th; });
00675   }
00676 
00678   // idx_threshold (not-in-place)
00679 
00680   template<class T, class T2>
00681   void idx_threshold(idx<T> & in, T2 th, idx<T> & out) {
00682     idx_threshold(in, saturate(th, T), out);
00683   }
00684 
00685   template<class T> void idx_threshold(idx<T>& in, T th, idx<T>& out) {
00686     idx_checknelems2_all(in, out);
00687     idx_aloopf2(pin, in, T, pout, out, T, {
00688       if (*pin < th)
00689         *pout = th;
00690       else
00691         *pout = *pin;
00692     });
00693   }
00694 
00696   // idx_threshold (in-place, with value)
00697 
00698   template<class T, class T2, class T3>
00699   void idx_threshold(idx<T> & in, T2 th, T3 value) {
00700     idx_threshold(in, saturate(th, T), saturate(value, T));
00701   }
00702 
00703   template<class T> void idx_threshold(idx<T>& in, T th, T value) {
00704     idx_aloopf1(pin, in, T, { if (*pin < th) *pin = value; });
00705   }
00706 
00708   // idx_threshold (not-in-place, with value)
00709 
00710   template<class T, class T2, class T3>
00711   void idx_threshold(idx<T> & in, T2 th, T3 value, idx<T> & out) {
00712     idx_threshold(in, saturate(th, T), saturate(value, T), out);
00713   }
00714 
00715 
00716   template<class T> void idx_threshold(idx<T>& in, T th, T value, idx<T>& out) {
00717     idx_aloopf2(pin, in, T, pout, out, T, {
00718         if (*pin < th)
00719           *pout = value;
00720         else
00721           *pout = *pin;
00722       });
00723   }
00724 
00726   // idx_threshold (with above and below)
00727 
00728   template<class T, class T2>
00729   void idx_threshold(idx<T>& in, T th, T2 below, T2 above, idx<T2>& out) {
00730     idx_checknelems2_all(in, out);
00731     idx_aloopf2(pin, in, T, pout, out, T2, {
00732       *pout = (*pin < th) ? below : above;
00733     });
00734   }
00735 
00737   // idx_sqrt
00738 
00739   template<class T> void idx_sqrt(idx<T>& in, idx<T>& out) {
00740     idx_checknelems2_all(in, out);
00741     idx_aloopf2(pin, in, T, pout, out, T, {
00742         *pout = (T)sqrt((float64)(*pin));
00743       });
00744   }
00745 
00747   // idx_power
00748 
00749   template<class T, class T2> void idx_power(idx<T> & in, T2 p, idx<T> & out) {
00750     idx_checknelems2_all(in, out);
00751     idx_power(in, saturate(p, T), out);
00752   }
00753 
00754   template<class T> void idx_power(idx<T>& in, T p, idx<T>& out) {
00755     idx_checknelems2_all(in, out);
00756     idx_aloopf2(pin, in, T, pout, out, T, {
00757         *pout = saturate(pow((double)(*pin), (double)p), T);
00758     });
00759   }
00760 
00762   // idx_sum
00763 //#ifndef __OPENMP__
00764 
00765   // there is a much faster and parallel way
00766   // of doing this using a tree.
00767   template<typename Tout, typename T> Tout idx_sum(idx<T> &inp) {
00768     Tout z = 0;
00769     idx_aloopf1(pinp, inp, T, { z += (Tout)(*pinp); });
00770     return z;
00771   }
00772 
00773   // there is a much faster and parallel way
00774   // of doing this using a tree.
00775   template<class T> T idx_sum(idx<T> &inp, T *out) {
00776     T z = idx_sum(inp);
00777     if (out != NULL)
00778       *out = saturate(z, T);
00779     return z;
00780   }
00781 
00782   // there is a much faster and parallel way
00783   // of doing this using a tree.
00784   template<class T> T idx_sum(idx<T> &inp) {
00785     T z = 0;
00786     idx_aloopf1(pinp, inp, T, { z += (T)(*pinp); });
00787     return z;
00788   }
00789 
00790 // #endif /* ifndef __OPENMP__ */
00791 
00793   // idx_sumabs
00794 
00795   // there is a much faster and parallel way
00796   // of doing this using a tree.
00797   template<class T> float64 idx_sumabs(idx<T> &inp, T *out) {
00798     float64 z = 0;
00799     idx_aloopf1(pinp, inp, T, { z += abs((float64)(*pinp)); });
00800     if (out != NULL) {
00801       *out = saturate(z, T);
00802       return *out;
00803     }
00804     return z;
00805   }
00806 
00808   // idx_sumsqr
00809 
00810   // there is a much faster and parallel way
00811   // of doing this using a tree.
00812   template<class T> float64 idx_sumsqr(idx<T> &inp) {
00813     float64 z = 0;
00814     idx_aloopf1(pinp, inp, T, { z += ((float64)(*pinp))*((float64)(*pinp)); });
00815     return z;
00816   }
00817 
00819   // idx_sumacc
00820 
00821   template<class T> float64 idx_sumacc(idx<T> &inp, idx<T> &acc) {
00822     // acc must be of order 0.
00823     if (acc.order() != 0)
00824       eblerror("expecting an idx0 as output");
00825     float64 sum = (float64)acc.get() + idx_sum(inp);
00826     acc.set(saturate(sum, T));
00827     return sum;
00828     //return idx_sum(inp, acc.ptr());
00829   }
00830 
00832   // idx_sumabs (with acc)
00833 
00834   template<class T> float64 idx_sumabs(idx<T> &inp, idx<T> &acc) {
00835     // acc must be of order 0.
00836     if (acc.order() != 0)
00837       eblerror("expecting an idx0 as output");
00838     float64 sum = (float64)acc.get() + idx_sumabs(inp);
00839     acc.set(saturate(sum, T));
00840     return sum;
00841     //return idx_sumabs(inp, acc.ptr());
00842   }
00843 
00845   // idx_l1
00846 
00847   template<class T> float64 idx_l1(idx<T> &m1, idx<T> &m2) {
00848     idx_checknelems2_all(m1, m2);
00849     float64 z = 0;
00850     idx_aloopf2(pm1, m1, T, pm2, m2, T,
00851                 { z += (float64) fabs((float64) (*pm1 - *pm2)); });
00852     return z;
00853   }
00854 
00856   // idx_l2norm
00857 
00858   template<class T> float64 idx_l2norm(idx<T> &in) {
00859     return sqrt((T) idx_sumsqr(in));
00860   }
00861 
00862   template<class T> float64 idx_l2(idx<T> &in) {
00863     return sqrt((T) idx_sumsqr(in));
00864   }
00865 
00866   template<class T> float64 idx_l2(idx<T> &m1, idx<T> &m2) {
00867     idx_checknelems2_all(m1, m2);
00868     float64 z = 0;
00869     float64 tmp;
00870     idx_aloopf2(pm1, m1, T, pm2, m2, T,
00871                 { tmp = *pm1 - (float64) *pm2;
00872                   z += tmp * tmp; });
00873     return sqrt(z);
00874   }
00875 
00876   template<class T> float64 idx_l2(midx<T> &m1, midx<T> &m2) {
00877     idx_checknelems2_all(m1, m2);
00878     float64 z = 0;
00879     idx_aloopf2(pe1, ((idx<idx<T>*>&) m1), idx<T>*,
00880                 pe2, ((idx<idx<T>*>&) m2), idx<T>*, {
00881                   idx<T> *e1 = *pe1; idx<T> *e2 = *pe2;
00882                   if (e1 && e2)
00883                     z += idx_l2(*e1, *e2);
00884                   else if (e1 != e2)
00885                     eblerror("expected both matrices to be empty in " << m1
00886                              << " and " << m2);
00887                 });
00888     return sqrt(z);
00889   }
00890 
00891   template<class T> float64 idx_l2common(midx<T> &m1, midx<T> &m2) {
00892     idx_checknelems2_all(m1, m2);
00893     float64 z = 0;
00894     idx_aloopf2(pe1, ((idx<idx<T>*>&) m1), idx<T>*,
00895                 pe2, ((idx<idx<T>*>&) m2), idx<T>*, {
00896                   idx<T> *e11 = *pe1; idx<T> *e22 = *pe2;
00897                   if (e11 && e22) {
00898                     idx<T> e1 = *e11; idx<T> e2 = *e22;
00899                     idx_checkorder2_all(e1, e2);
00900                     for (intg i = 0; i < e1.order(); ++i) {
00901                       e1 = e1.narrow(i, std::min(e1.dim(i), e2.dim(i)), 0);
00902                       e2 = e2.narrow(i, std::min(e1.dim(i), e2.dim(i)), 0);
00903                     }
00904                     z += idx_l2(e1, e2);
00905                   }
00906                 });
00907     return sqrt(z);
00908   }
00909 
00911   // idx_mean
00912 
00913   template<class T> T idx_mean(idx<T> &in, T *out) {
00914     if (out != NULL) {
00915       *out = (T)(idx_sum(in) / (float64)in.nelements());
00916       return *out;
00917     }
00918     return (T)(idx_sum(in) / (float64)in.nelements());
00919   }
00920 
00922   // idx_std_normalize
00923 
00924   template<typename T>
00925   void idx_std_normalize(idx<T> &in, idx<T> &out, T *mean_) {
00926     T mean = mean_ ? *mean_ : idx_mean(in);
00927     idx_addc(in, (T)-mean, out); // remove mean
00928     // std deviation
00929 #ifdef __WINDOWS__
00930     T coeff = (T) sqrt((double) (idx_sumsqr(out) / out.nelements()));
00931 #else
00932     T coeff = (T) sqrt((T) (idx_sumsqr(out) / out.nelements()));
00933 #endif
00934     if (coeff != 0)
00935       idx_dotc(out, 1 / coeff, out);
00936   }
00937 
00939   // idx_flip
00940 
00941   template <class T> idx<T> idx_flip(idx<T> &m, uint n, idx<T> *m2) {
00942     if (m2 == NULL) {
00943       idx<T> flipped(m.get_idxdim());
00944       return idx_flip(m, n, &flipped);
00945     } else {
00946       if (!m.same_dim(m2->get_idxdim())) {
00947         eblerror("expected same dim idx in idx_flip, but got " << m
00948                  << " and " << *m2);
00949       } else {
00950         if (n == 0) {
00951           // we reached the dimension we want to flip, flip it
00952           for (intg i = 0; i < m.dim(0); ++i) {
00953             idx<T> a = m.select(0, i);
00954             idx<T> b = m2->select(0, m.dim(0) - i - 1);
00955             idx_copy(a, b);
00956           }
00957         } else {
00958           // we didn't reach the dimension to flip, call another recursion
00959           idx_bloop2(mm, m, T, mm2, *m2, T) {
00960             idx_flip(mm, n - 1, &mm2);
00961           }
00962         }
00963       }
00964       return *m2;
00965     }
00966   }
00967 
00968   template <class T> midx<T> idx_flip(midx<T> &m, uint n, midx<T> *m2) {
00969     if (m2 == NULL) {
00970       midx<T> flipped(m.get_idxdim());
00971       // allocate all elements
00972       flipped.resize(m);
00973       return idx_flip(m, n, &flipped);
00974     } else {
00975       if (!m.same_dim(*m2)) {
00976         eblerror("expected same dim idx in idx_flip, but got " << m
00977                  << " and " << *m2);
00978       } else {
00979         // loop on each object of m and flip it
00980         if (m.order() == 1) {
00981           for (intg i = 0; i < m.dim(0); ++i) {
00982             idx<T> tmp = m.get(i);
00983             tmp = idx_flip(tmp, n);
00984             m2->set(tmp, i);
00985           }
00986         } else if (m.order() == 2) {
00987           for (intg i = 0; i < m.dim(0); ++i)
00988             for (intg j = 0; j < m.dim(1); ++j) {
00989               idx<T> tmp = m.get(i, j);
00990               tmp = idx_flip(tmp, n);
00991               m2->set(tmp, i, j);
00992             }
00993         }
00994       }
00995       return *m2;
00996     }
00997   }
00998 
00999   template<class T> void rev_idx2 (idx<T> &m) {
01000     idx_check_contiguous1(m);
01001     if (m.order() != 2)
01002       idx_compatibility_error1(m, "expecting idx of order 2");
01003     T tmp, *p = m.idx_ptr();
01004     intg size = m.dim(0) * m.dim(1);
01005     intg i;
01006 
01007     for (i = 0; i < size/2; i++) {
01008       tmp = p[i];
01009       p[i] = p[size-i-1];
01010       p[size-i-1] = tmp;
01011     }
01012   }
01013 
01014   template<class T> void rev_idx2_tr (idx<T> &m, idx<T> &n) {
01015     idx_check_contiguous2(m, n);
01016     if ((m.order() != 2) || (n.order() != 2))
01017       idx_compatibility_error2(m, n, "expecting idx of order 2");
01018     intg size = m.dim(0) * m.dim(1);
01019     T *p = m.idx_ptr() + size - 1;
01020     T *q = n.idx_ptr();
01021     for (int i = 0; i < size; i++) {
01022       *(q++) = *(p--);
01023     }
01024   }
01025 
01027   // idx's products
01028 
01029   /* multiply M2 by M1, result in M1 */
01030   /* matrix - vector product */
01031   template<class T>
01032   void idx_m2dotm1(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01033     T *c1, *c2, *c1_0, *ker;
01034     intg c1_m1 = i1.mod(1), c2_m0 = i2.mod(0);
01035     intg j, jmax = i2.dim(0);
01036     intg c1_m0 = i1.mod(0), d1_m0 = o1.mod(0);
01037     T *d1, f;
01038     intg i, imax = o1.dim(0);
01039     c1_0 = i1.idx_ptr();
01040     ker = i2.idx_ptr();
01041     d1 = o1.idx_ptr();
01042     for (i=0; i<imax; i++){
01043       f = 0;
01044       c1 = c1_0;
01045       c2 = ker;
01046       if(c1_m1==1 && c2_m0==1)
01047         for (j=0; j<jmax; j++)
01048           f += (*c1++)*(*c2++);
01049       else
01050         for (j=0; j<jmax; j++) {
01051           f += (*c1)*(*c2);
01052           c1 += c1_m1;
01053           c2 += c2_m0;
01054         }
01055       *d1 = f;
01056       d1 += d1_m0;
01057       c1_0 += c1_m0;
01058     }
01059   }
01060 
01061   /* multiply M2 by M1, result in M1 */
01062   template <class T>
01063   void idx_m2dotm1acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01064     T *c1, *c2, *c1_0, *ker;
01065     intg c1_m1 = i1.mod(1), c2_m0 = i2.mod(0);
01066     intg j, jmax = i2.dim(0);
01067     intg c1_m0 = i1.mod(0), d1_m0 = o1.mod(0);
01068     T *d1, f;
01069     intg i, imax = o1.dim(0);
01070     c1_0 = i1.idx_ptr();
01071     ker = i2.idx_ptr();
01072     d1 = o1.idx_ptr();
01073     for (i=0; i<imax; i++){
01074       f = *d1;
01075       c1 = c1_0;
01076       c2 = ker;
01077       for (j=0; j<jmax; j++) {
01078         f += (*c1)*(*c2);
01079         c1 += c1_m1;
01080         c2 += c2_m0;
01081       }
01082       *d1 = f;
01083       d1 += d1_m0;
01084       c1_0 += c1_m0;
01085     }
01086   }
01087 
01088   template <class T>
01089   void idx_m2dotm3(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01090     idx_bloop2(ii2, i2, T, oo1, o1, T) {
01091       idx_bloop2(iii2, ii2, T, ooo1, oo1, T) {
01092         idx_m2dotm1(i1, iii2, ooo1);
01093       }
01094     }      
01095   }
01096   
01097   // m2dotm2 ///////////////////////////////////////////////////////////////////
01098 
01099   template <class T>
01100   void idx_m2dotm2(idx<T> &i1, idx<T> &i2, idx<T> &o) {
01101     idx_checkorder3(i1, 2, i2, 2, o, 2);
01102     if (i1.dim(1) != i2.dim(0) || i1.dim(0) != o.dim(0) ||
01103         o.dim(1) != i2.dim(1))
01104       eblerror("incompatible dimensions for matrix-matrix multiplication of "
01105                << i1 << " . " << i2 << " -> " << o);
01106     T *c1, *c2, *c1_0, *ker;
01107     intg c1_m1 = i1.mod(1), c2_m0 = i2.mod(0);
01108     intg j, jmax = i2.dim(0);
01109     intg c1_m0 = i1.mod(0), d1_m0 = o.mod(0);
01110     T *d1, f;
01111     intg i, imax = o.dim(0);
01112     intg k, kmax = o.dim(1);
01113     // loop on o.dim(1)
01114     for (k = 0; k < kmax; ++k) {
01115       c1_0 = i1.idx_ptr();
01116       d1 = o.idx_ptr() + k;
01117       ker = i2.idx_ptr() + k;
01118       // loop on o.dim(0)
01119       for (i=0; i<imax; i++){
01120         f = 0;
01121         c1 = c1_0;
01122         c2 = ker;
01123         // loop on
01124         if(c1_m1==1 && c2_m0==1)
01125           for (j=0; j<jmax; j++)
01126             f += (*c1++)*(*c2++);
01127         else
01128           for (j=0; j<jmax; j++) {
01129             f += (*c1)*(*c2);
01130             c1 += c1_m1;
01131             c2 += c2_m0;
01132           }
01133         *d1 = f;
01134         d1 += d1_m0;
01135         c1_0 += c1_m0;
01136       }
01137     }
01138   }
01139 
01140   // m4dotm2 ///////////////////////////////////////////////////////////////////
01141 
01142   // TODO-0 write specialized blas version in cpp
01143   template<class T> void idx_m4dotm2(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01144     idx_checkorder3(i1, 4, i2, 2, o1, 2); // check for compatible orders
01145     if ((i1.dim(0) != o1.dim(0)) || (i1.dim(1) != o1.dim(1))
01146         || (i1.dim(2) != i2.dim(0)) || (i1.dim(3) != i2.dim(1)))
01147       idx_compatibility_error3(i1, i2, o1, "incompatible dimensions");
01148     T *c1, *c1_2;
01149     T *c2, *c2_0;
01150     T *c1_0, *c1_1;
01151     T *ker;
01152     intg c1_m2 = (i1).mod(2), c2_m0 = (i2).mod(0);
01153     intg c1_m3 = (i1).mod(3), c2_m1 = (i2).mod(1);
01154     intg k,l, kmax = (i2).dim(0), lmax = (i2).dim(1);
01155     T *d1_0, *d1, f;
01156     intg c1_m0 = (i1).mod(0), d1_m0 = (o1).mod(0);
01157     intg c1_m1 = (i1).mod(1), d1_m1 = (o1).mod(1);
01158     intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1);
01159     c1_0 = i1.idx_ptr();
01160     ker = i2.idx_ptr();
01161     d1_0 = o1.idx_ptr();
01162     for (i=0; i<imax; i++) {
01163       c1_1 = c1_0;
01164       d1 = d1_0;
01165       for (j=0; j<jmax; j++) {
01166         f = 0;
01167         c1_2 = c1_1;
01168         c2_0 = ker;
01169         for (k=0; k<kmax; k++) {
01170           c1 = c1_2;
01171           c2 = c2_0;
01172           for (l=0; l<lmax; l++) {
01173             f += (*c1)*(*c2);
01174             c1 += c1_m3;
01175             c2 += c2_m1;
01176           }
01177           c1_2 += c1_m2;
01178           c2_0 += c2_m0;
01179         }
01180         *d1 = f;
01181         d1 += d1_m1;
01182         c1_1 += c1_m1;
01183       }
01184       d1_0 += d1_m0;
01185       c1_0 += c1_m0;
01186     }
01187   }
01188 
01189   // TODO-0 write specialized blas version in cpp
01190   template<class T> void idx_m4dotm2acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01191     idx_checkorder3(i1, 4, i2, 2, o1, 2); // check for compatible orders
01192     if ((i1.dim(0) != o1.dim(0)) || (i1.dim(1) != o1.dim(1))
01193         || (i1.dim(2) != i2.dim(0)) || (i1.dim(3) != i2.dim(1)))
01194       idx_compatibility_error3(i1, i2, o1, "incompatible dimensions");
01195     T *c1, *c1_2;
01196     T *c2, *c2_0;
01197     T *c1_0, *c1_1;
01198     T *ker;
01199     intg c1_m2 = (i1).mod(2), c2_m0 = (i2).mod(0);
01200     intg c1_m3 = (i1).mod(3), c2_m1 = (i2).mod(1);
01201     intg k,l, kmax = (i2).dim(0), lmax = (i2).dim(1);
01202     T *d1_0, *d1, f;
01203     intg c1_m0 = (i1).mod(0), d1_m0 = (o1).mod(0);
01204     intg c1_m1 = (i1).mod(1), d1_m1 = (o1).mod(1);
01205     intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1);
01206     c1_0 = i1.idx_ptr();
01207     ker = i2.idx_ptr();
01208     d1_0 = o1.idx_ptr();
01209     for (i=0; i<imax; i++) {
01210       c1_1 = c1_0;
01211       d1 = d1_0;
01212       for (j=0; j<jmax; j++) {
01213         f = *d1;
01214         c1_2 = c1_1;
01215         c2_0 = ker;
01216         for (k=0; k<kmax; k++) {
01217           c1 = c1_2;
01218           c2 = c2_0;
01219           for (l=0; l<lmax; l++) {
01220             f += (*c1)*(*c2);
01221             c1 += c1_m3;
01222             c2 += c2_m1;
01223           }
01224           c1_2 += c1_m2;
01225           c2_0 += c2_m0;
01226         }
01227         *d1 = f;
01228         d1 += d1_m1;
01229         c1_1 += c1_m1;
01230       }
01231       d1_0 += d1_m0;
01232       c1_0 += c1_m0;
01233     }
01234   }
01235 
01236   template<class T> void idx_m4squdotm2acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01237     idx_checkorder3(i1, 4, i2, 2, o1, 2);
01238     T *c1, *c1_2;
01239     T *c2, *c2_0;
01240     T *c1_0, *c1_1;
01241     T *ker;
01242     intg c1_m2 = (i1).mod(2), c2_m0 = (i2).mod(0);
01243     intg c1_m3 = (i1).mod(3), c2_m1 = (i2).mod(1);
01244     intg k,l, kmax = (i2).dim(0), lmax = (i2).dim(1);
01245     T *d1_0, *d1, f;
01246     intg c1_m0 = (i1).mod(0), d1_m0 = (o1).mod(0);
01247     intg c1_m1 = (i1).mod(1), d1_m1 = (o1).mod(1);
01248     intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1);
01249     c1_0 = i1.idx_ptr();
01250     ker = i2.idx_ptr();
01251     d1_0 = o1.idx_ptr();
01252     for (i=0; i<imax; i++) {
01253       c1_1 = c1_0;
01254       d1 = d1_0;
01255       for (j=0; j<jmax; j++) {
01256         f = *d1;
01257         c1_2 = c1_1;
01258         c2_0 = ker;
01259         for (k=0; k<kmax; k++) {
01260           c1 = c1_2;
01261           c2 = c2_0;
01262           for (l=0; l<lmax; l++) {
01263             f += (*c1)*(*c1)*(*c2);
01264             c1 += c1_m3;
01265             c2 += c2_m1;
01266           }
01267           c1_2 += c1_m2;
01268           c2_0 += c2_m0;
01269         }
01270         *d1 = f;
01271         d1 += d1_m1;
01272         c1_1 += c1_m1;
01273       }
01274       d1_0 += d1_m0;
01275       c1_0 += c1_m0;
01276     }
01277   }
01278 
01279   template<class T> void idx_m2extm2(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01280     idx_checkorder3(i1, 2, i2, 2, o1, 4);
01281     T *c2_0, *c2_1;
01282     T *d1_2, *d1_3;
01283     T *d1_0, *d1_1;
01284     T *c1_0, *c1_1;
01285     T *ker;
01286     intg c2_m0 = (i2).mod(0), c2_m1 = (i2).mod(1);
01287     intg d1_m2 = (o1).mod(2), d1_m3 = (o1).mod(3);
01288     intg c1_m0 = (i1).mod(0), c1_m1 = (i1).mod(1);
01289     intg d1_m0 = (o1).mod(0), d1_m1 = (o1).mod(1);
01290     intg k,l, lmax = (o1).dim(3), kmax = (o1).dim(2);
01291     intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1);
01292     c1_0 = i1.idx_ptr();
01293     ker = i2.idx_ptr();
01294     d1_0 = o1.idx_ptr();
01295     for (i=0; i<imax; i++) {
01296       d1_1 = d1_0;
01297       c1_1 = c1_0;
01298       for (j=0; j<jmax; j++) {
01299         d1_2 = d1_1;
01300         c2_0 = ker;
01301         for (k=0; k<kmax; k++) {
01302           d1_3 = d1_2;
01303           c2_1 = c2_0;
01304           for (l=0; l<lmax; l++) {
01305             *d1_3 = (*c1_1)*(*c2_1);
01306             d1_3 += d1_m3;
01307             c2_1 += c2_m1;
01308           }
01309           d1_2 += d1_m2;
01310           c2_0 += c2_m0;
01311         }
01312         d1_1 += d1_m1;
01313         c1_1 += c1_m1;
01314       }
01315       d1_0 += d1_m0;
01316       c1_0 += c1_m0;
01317     }
01318   }
01319 
01320   template<class T> void idx_m2extm2acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01321     idx_checkorder3(i1, 2, i2, 2, o1, 4);
01322     T *c2_0, *c2_1;
01323     T *d1_2, *d1_3;
01324     T *d1_0, *d1_1;
01325     T *c1_0, *c1_1;
01326     T *ker;
01327     intg c2_m0 = (i2).mod(0), c2_m1 = (i2).mod(1);
01328     intg d1_m2 = (o1).mod(2), d1_m3 = (o1).mod(3);
01329     intg c1_m0 = (i1).mod(0), c1_m1 = (i1).mod(1);
01330     intg d1_m0 = (o1).mod(0), d1_m1 = (o1).mod(1);
01331     intg k,l, lmax = (o1).dim(3), kmax = (o1).dim(2);
01332     intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1);
01333     c1_0 = i1.idx_ptr();
01334     ker = i2.idx_ptr();
01335     d1_0 = o1.idx_ptr();
01336     for (i=0; i<imax; i++) {
01337       d1_1 = d1_0;
01338       c1_1 = c1_0;
01339       for (j=0; j<jmax; j++) {
01340         d1_2 = d1_1;
01341         c2_0 = ker;
01342         for (k=0; k<kmax; k++) {
01343           d1_3 = d1_2;
01344           c2_1 = c2_0;
01345           for (l=0; l<lmax; l++) {
01346             *d1_3 += (*c1_1)*(*c2_1);
01347             d1_3 += d1_m3;
01348             c2_1 += c2_m1;
01349           }
01350           d1_2 += d1_m2;
01351           c2_0 += c2_m0;
01352         }
01353         d1_1 += d1_m1;
01354         c1_1 += c1_m1;
01355       }
01356       d1_0 += d1_m0;
01357       c1_0 += c1_m0;
01358     }
01359   }
01360 
01361   template<class T> void idx_m2squextm2acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01362     idx_checkorder3(i1, 2, i2, 2, o1, 4);
01363     T *c2_0, *c2_1;
01364     T *d1_2, *d1_3;
01365     T *d1_0, *d1_1;
01366     T *c1_0, *c1_1;
01367     T *ker;
01368     intg c2_m0 = (i2).mod(0), c2_m1 = (i2).mod(1);
01369     intg d1_m2 = (o1).mod(2), d1_m3 = (o1).mod(3);
01370     intg c1_m0 = (i1).mod(0), c1_m1 = (i1).mod(1);
01371     intg d1_m0 = (o1).mod(0), d1_m1 = (o1).mod(1);
01372     intg k,l, lmax = (o1).dim(3), kmax = (o1).dim(2);
01373     intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1);
01374     c1_0 = i1.idx_ptr();
01375     ker = i2.idx_ptr();
01376     d1_0 = o1.idx_ptr();
01377     for (i=0; i<imax; i++) {
01378       d1_1 = d1_0;
01379       c1_1 = c1_0;
01380       for (j=0; j<jmax; j++) {
01381         d1_2 = d1_1;
01382         c2_0 = ker;
01383         for (k=0; k<kmax; k++) {
01384           d1_3 = d1_2;
01385           c2_1 = c2_0;
01386           for (l=0; l<lmax; l++) {
01387             *d1_3 += (*c1_1)*(*c2_1)*(*c2_1);
01388             d1_3 += d1_m3;
01389             c2_1 += c2_m1;
01390           }
01391           d1_2 += d1_m2;
01392           c2_0 += c2_m0;
01393         }
01394         d1_1 += d1_m1;
01395         c1_1 += c1_m1;
01396       }
01397       d1_0 += d1_m0;
01398       c1_0 += c1_m0;
01399     }
01400   }
01401 
01402   // Fu Jie Huang, May 20, 2008
01403   template<typename T> void idx_m2squdotm2(idx<T>& i1, idx<T>& i2, idx<T>& o) {
01404     idx_checkorder3(i1, 2, i2, 2, o, 0);
01405     idx_checkdim2(i1, 0, i2.dim(0), i1, 1, i2.dim(1));
01406     intg imax = i1.dim(0), jmax = i2.dim(1);
01407     intg c1_m0 = i1.mod(0), c2_m0 = i2.mod(0);
01408     intg c1_m1 = i1.mod(1), c2_m1 = i2.mod(1);
01409 
01410     T *c1_0 = i1.idx_ptr();
01411     T *c2_0 = i2.idx_ptr();
01412     T *d1 = o.idx_ptr();
01413     T *c1, *c2;
01414 
01415     T f = 0;
01416     for(int i=0; i<imax; ++i) {
01417       c1 = c1_0;
01418       c2 = c2_0;
01419       for(int j=0; j<jmax; ++j) {
01420         f += (*c1) * (*c1) * (*c2);   // only difference
01421         c1 += c1_m1;
01422         c2 += c2_m1;
01423       }
01424       c1_0 += c1_m0;
01425       c2_0 += c2_m0;
01426     }
01427     *d1 = f;
01428   }
01429 
01430   // Fu Jie Huang, May 20, 2008
01431   template<typename T>
01432   void idx_m2squdotm2acc(idx<T>& i1, idx<T>& i2, idx<T>& o) {
01433     idx_checkorder3(i1, 2, i2, 2, o, 0);
01434     idx_checkdim2(i1, 0, i2.dim(0), i1, 1, i2.dim(1));
01435     intg imax = i1.dim(0), jmax = i2.dim(1);
01436     intg c1_m0 = i1.mod(0), c2_m0 = i2.mod(0);
01437     intg c1_m1 = i1.mod(1), c2_m1 = i2.mod(1);
01438 
01439     T *c1_0 = i1.idx_ptr();
01440     T *c2_0 = i2.idx_ptr();
01441     T *d1 = o.idx_ptr();
01442     T *c1, *c2;
01443 
01444     T f = *d1;          //   only difference: accumulate
01445     for(int i=0; i<imax; ++i) {
01446       c1 = c1_0;
01447       c2 = c2_0;
01448       for(int j=0; j<jmax; ++j) {
01449         f += (*c1) * (*c1) * (*c2);
01450         c1 += c1_m1;
01451         c2 += c2_m1;
01452       }
01453       c1_0 += c1_m0;
01454       c2_0 += c2_m0;
01455     }
01456     *d1 = f;
01457   }
01458 
01460   // idx_m2oversample
01461 
01462   // Fu Jie Huang, May 20, 2008
01463   template<typename T>
01464   void idx_m2oversample(idx<T>& small, intg nlin, intg ncol, idx<T>& big) {
01465     idx<T> uin  = big.unfold(0, nlin, nlin);
01466     idx<T> uuin = uin.unfold(1, ncol, ncol);
01467     idx_eloop1(z1, uuin, T) {
01468       idx_eloop1(z2, z1, T) {
01469         idx_copy(small, z2);
01470       }
01471     }
01472   }
01473 
01474   template<typename T>
01475   void idx_m2oversampleacc(idx<T>& small, intg nlin, intg ncol, idx<T>& big) {
01476     idx<T> uin  = big.unfold(0, nlin, nlin);
01477     idx<T> uuin = uin.unfold(1, ncol, ncol);
01478     idx_eloop1(z1, uuin, T) {
01479       idx_eloop1(z2, z1, T) {
01480         idx_add(small, z2, z2);
01481       }
01482     }
01483   }
01484 
01486   // idx_max
01487 
01488   template <class T> T idx_max(idx<T> &m) {
01489     T v = *(m.idx_ptr());
01490     idx_aloopf1(i, m, T, { if (*i > v) v = *i; });
01491     return v;
01492   }
01493 
01494   // idx_max (between 2 idx's, in-place)
01495   template <class T> void idx_max(idx<T> &in1, idx<T> &in2) {
01496     idx_aloopf2(i1, in1, T, i2, in2, T, { *i2 = std::max(*i1, *i2); });
01497   }
01498 
01499   // idx_max (between 2 idx's, not-in-place)
01500   template <class T> void idx_max(idx<T> &in1, idx<T> &in2, idx<T> &out) {
01501     idx_aloopf3(i1, in1, T, i2, in2, T, o, out, T, { *o = std::max(*i1, *i2);});
01502   }
01503 
01505   // idx_min
01506 
01507   template <class T> T idx_min(idx<T> &m) {
01508     T v = *(m.idx_ptr());
01509     idx_aloopf1(i, m, T, { if (*i < v) v = *i; });
01510     return v;
01511   }
01512 
01513   // idx_min (between 2 idx's, in-place)
01514   template <class T> void idx_min(idx<T> &in1, idx<T> &in2) {
01515     idx_aloopf2(i1, in1, T, i2, in2, T, { *i2 = std::min(*i1, *i2); });
01516   }
01517 
01519   // idx_indexmax
01520 
01521   template<class T> intg idx_indexmax(idx<T> &m) {
01522     intg i = 0, imax = 0;
01523     T v = *(m.idx_ptr());
01524     idx_aloopf1(me, m, T, {
01525         if (*me > v) {
01526           v = *me;
01527           imax = i;
01528         }
01529         i++;
01530       });
01531     return imax;
01532   }
01533 
01535   // idx_indexmin
01536 
01537   template<class T> intg idx_indexmin(idx<T> &m) {
01538     intg i = 0, imin = 0;
01539     T v = *(m.idx_ptr());
01540     idx_aloopf1(me, m, T, {
01541         if (*me < v) {
01542           v = *me;
01543           imin = i;
01544         }
01545         i++;
01546       });
01547     return imin;
01548   }
01549 
01551   // idx_sortdown
01552 
01553 template<class T1, class T2> void idx_sortdown(idx<T1> &m, idx<T2> &p) {
01554     idx_checkorder2(m, 1, p, 1);
01555     if (m.mod(0) != 1) eblerror("idx_sortdown: vector is not contiguous");
01556     if (p.mod(0) != 1) eblerror("idx_sortdown: vector is not contiguous");
01557     intg n = m.dim(0);
01558     intg z = p.dim(0);
01559     if (n != z) eblerror("idx_sortdown: vectors have different sizes");
01560     if (n > 1) {
01561       int l,j,ir,i;
01562       T1 *ra, rra;
01563       T2 *rb, rrb;
01564 
01565       ra = (T1*)m.idx_ptr() -1;
01566       rb = (T2*)p.idx_ptr() -1;
01567 
01568       l = (n >> 1) + 1;
01569       ir = n;
01570       for (;;) {
01571         if (l > 1) {
01572           rra=ra[--l];
01573           rrb=rb[l];
01574         } else {
01575           rra=ra[ir];
01576           rrb=rb[ir];
01577           ra[ir]=ra[1];
01578           rb[ir]=rb[1];
01579           if (--ir == 1) {
01580             ra[1]=rra;
01581             rb[1]=rrb;
01582             return ; } }
01583         i=l;
01584         j=l << 1;
01585         while (j <= ir) {
01586           if (j < ir && ra[j] > ra[j+1]) ++j;
01587           if (rra > ra[j]) {
01588             ra[i]=ra[j];
01589             rb[i]=rb[j];
01590             j += (i=j);
01591           } else j=ir+1; }
01592         ra[i]=rra;
01593         rb[i]=rrb;
01594       }
01595     }
01596   }
01597 
01600 template<class T> void idx_sortup(idx<T> &m) {
01601   idx_checkorder1(m, 1);
01602   idx_check_contiguous1(m);
01603   intg n = m.dim(0);
01604   if (n > 1) {
01605     int l,j,ir,i;
01606     T *ra, rra;
01607 
01608     ra = m.idx_ptr() - 1;
01609     l=(n >> 1)+1;
01610     ir= n;
01611     for (;;) {
01612       if (l > 1)
01613         rra=ra[--l];
01614       else {
01615         rra=ra[ir];
01616         ra[ir]=ra[1];
01617         if (--ir == 1) {
01618           ra[1]=rra;
01619           goto end;
01620         }
01621       }
01622       i=l;
01623       j=l << 1;
01624       while (j <= ir) {
01625         if (j < ir && ra[j] < ra[j+1]) ++j;
01626         if (rra < ra[j]) {
01627           ra[i]=ra[j];
01628           j += (i=j);
01629         } else j=ir+1;
01630       }
01631       ra[i]=rra;
01632     }
01633   }
01634  end:;
01635 }
01636 
01639 template<class T1, class T2> void idx_sortup(idx<T1> &m, idx<T2> &m2) {
01640   idx_checkorder2(m, 1, m2, 1);
01641   idx_check_contiguous2(m, m2);
01642   idx_checknelems2_all(m, m2); // they must have same number of elements
01643   intg n = m.dim(0);
01644   if (n > 1) {
01645     int l,j,ir,i;
01646     T1 *ra, rra;
01647     T2 *ra2, rra2;
01648 
01649     ra = m.idx_ptr() - 1;
01650     ra2 = m2.idx_ptr() - 1;
01651     l=(n >> 1)+1;
01652     ir= n;
01653     for (;;) {
01654       if (l > 1) {
01655         rra=ra[--l];
01656         rra2=ra2[l];
01657       } else {
01658         rra=ra[ir];
01659         rra2=ra2[ir];
01660         ra[ir]=ra[1];
01661         ra2[ir]=ra2[1];
01662         if (--ir == 1) {
01663           ra[1]=rra;
01664           ra2[1]=rra2;
01665           goto end;
01666         }
01667       }
01668       i=l;
01669       j=l << 1;
01670       while (j <= ir) {
01671         if (j < ir && ra[j] < ra[j+1]) ++j;
01672         if (rra < ra[j]) {
01673           ra[i]=ra[j];
01674           ra2[i]=ra2[j];
01675           j += (i=j);
01676         } else j=ir+1;
01677       }
01678       ra[i]=rra;
01679       ra2[i]=rra2;
01680     }
01681   }
01682  end:;
01683 }
01684 
01687 template<class T1, class T2, class T3> void idx_sortup(idx<T1> &m, idx<T2> &m2,
01688                                                         idx<T3> &m3) {
01689   idx_checkorder3(m, 1, m2, 1, m3, 1);
01690   idx_check_contiguous3(m, m2, m3);
01691   idx_checknelems3_all(m, m2, m3); // they must have same number of elements
01692   intg n = m.dim(0);
01693   if (n > 1) {
01694     int l,j,ir,i;
01695     T1 *ra, rra;
01696     T2 *ra2, rra2;
01697     T3 *ra3, rra3;
01698 
01699     ra = m.idx_ptr() - 1;
01700     ra2 = m2.idx_ptr() - 1;
01701     ra3 = m3.idx_ptr() - 1;
01702     l=(n >> 1)+1;
01703     ir = n;
01704     for (;;) {
01705       if (l > 1) {
01706         rra=ra[--l];
01707         rra2=ra2[l];
01708         rra3=ra3[l];
01709       } else {
01710         rra=ra[ir];
01711         rra2=ra2[ir];
01712         rra3=ra3[ir];
01713         ra[ir]=ra[1];
01714         ra2[ir]=ra2[1];
01715         ra3[ir]=ra3[1];
01716         if (--ir == 1) {
01717           ra[1]=rra;
01718           ra2[1]=rra2;
01719           ra3[1]=rra3;
01720           goto end;
01721         }
01722       }
01723       i=l;
01724       j=l << 1;
01725       while (j <= ir) {
01726         if (j < ir && ra[j] < ra[j+1]) ++j;
01727         if (rra < ra[j]) {
01728           ra[i]=ra[j];
01729           ra2[i]=ra2[j];
01730           ra3[i]=ra3[j];
01731           j += (i=j);
01732         } else j=ir+1;
01733       }
01734       ra[i]=rra;
01735       ra2[i]=rra2;
01736       ra3[i]=rra3;
01737     }
01738   }
01739  end:;
01740 }
01741 
01742 template<class T> void idx_sortdown(idx<T> &m) {
01743   idx_checkorder1(m, 1);
01744   idx_check_contiguous1(m);
01745   intg n = m.dim(0);
01746   if (n > 1) {
01747     int l,j,ir,i;
01748     T *ra, rra;
01749 
01750     ra = m.idx_ptr() - 1;
01751     l=(n >> 1)+1;
01752     ir= n;
01753     for (;;) {
01754       if (l > 1)
01755         rra=ra[--l];
01756       else {
01757         rra=ra[ir];
01758         ra[ir]=ra[1];
01759         if (--ir == 1) {
01760           ra[1]=rra;
01761           goto end; }}
01762       i=l;
01763       j=l << 1;
01764       while (j <= ir) {
01765         if (j < ir && ra[j] > ra[j+1]) ++j;
01766         if (rra > ra[j]) {
01767           ra[i]=ra[j];
01768           j += (i=j);
01769         } else j=ir+1; }
01770       ra[i]=rra;
01771     }
01772   }
01773  end:;
01774 }
01775 
01777   // idx_sqrdist
01778 
01779   template<class T> float64 idx_sqrdist(idx<T> &i1, idx<T> &i2) {
01780     idx_checknelems2_all(i1, i2);
01781     float64 z = 0;
01782     float64 tmp;
01783     idx_aloopf2(pi1, i1, T, pi2, i2, T, {
01784         tmp = (float64)(*pi1) - (float64)(*pi2);
01785         z += tmp * tmp;
01786       });
01787     return z;
01788   }
01789 
01791   // idx_sqrdist (with idx out)
01792 
01793   template<class T> void idx_sqrdist(idx<T> &i1, idx<T> &i2, idx<T> &out) {
01794     idx_checknelems2_all(i1, i2);
01795     if (out.order() != 0) eblerror("idx_sqrdist: expecting an idx of order 0");
01796     out.set((T)idx_sqrdist(i1, i2));
01797   }
01798 
01800   // idx_spherical_sqrdist
01801 
01802   template<class T> float64 idx_spherical_sqrdist(idx<T> &i1, idx<T> &i2) {
01803     idx_checknelems2_all(i1, i2);
01804     float64 z = 0;
01805     float64 tmp, tmpabs, tmpabs2;
01806     idx_aloopf2(pi1, i1, T, pi2, i2, T, {
01807         tmp = fmod((float64)(*pi1) - (float64)(*pi2), TWOPI);
01808         tmpabs = fabs(tmp);
01809         tmpabs2 = TWOPI - tmpabs;
01810         if (tmpabs > tmpabs2)
01811           tmp = tmpabs2;
01812         z += tmp * tmp;
01813       });
01814     return z;
01815   }
01816 
01818   // idx_gaussian
01819 
01820   template <typename T>
01821   void idx_gaussian(idx<T> &in, double m, double sigma, idx<T> &out) {
01822     idx_aloopf2(i, in, T, o, out, T, {
01823         *o = (T) gaussian((double) (*i), m, sigma);
01824       });
01825   }
01826 
01828   // idx_modulo
01829 
01830   template <class T> void idx_modulo(idx<T> &m, T mod) {
01831     idx_aloopf1(e, m, T, { *e = *e % mod; });
01832   }
01833 
01835   // idx_exp
01836 
01837   template <class T> void idx_exp(idx<T> &m) {
01838     // note: we use float32 instead of 64 because float64 has its own
01839     // specialized template, therefore, calls to this generic template
01840     // will be with types of lower precision than float32, no need for
01841     // float64 precision.
01842     idx_aloopf1(i, m, T, { *i = saturate(exp((float32)*i), T); });
01843   }
01844 
01846   // idx_log
01847 
01848   template <class T> void idx_log(idx<T> &m) {
01849 #if USING_FAST_ITERS == 0
01850     idx_aloop1(i, m, T) {
01851       *i = log(*i);
01852     };
01853 #else
01854     idx_aloopf1(i, m, T, { *i = (T)log(*i); });
01855 #endif
01856   }
01857 
01859   // idx_dot
01860 
01861   template <class T> T idx_dot(idx<T> & in1, idx<T> & in2) {
01862     T z = 0;
01863 #if USING_FAST_ITERS == 0
01864     { idx_aloop2(pi1, in1, T, pi2, in2, T) {
01865         z += *pi1 * *pi2;
01866       }
01867     }
01868 #else
01869     idx_aloopf2(pi1, in1, T, pi2, in2, T, {
01870         z += *pi1 * *pi2;
01871       });
01872 #endif
01873     return z;
01874   }
01875 
01876 // TODO: this doesn't compile on newest gcc
01877 //   template <class T> float64 idx_dot(idx<T> & in1, idx<T> & in2) {
01878 //     float64 z = 0;
01879 // #if USING_FAST_ITERS == 0
01880 //     { idx_aloop2(pi1, in1, T, pi2, in2, T) {
01881 //      z += ((float64)(*pi1)) * ((float64)(*pi2));
01882 //       }
01883 //     }
01884 // #else
01885 //     idx_aloopf2(pi1, in1, T, pi2, in2, T, {
01886 //      z += ((float64)(*pi1)) * ((float64)(*pi2));
01887 //       });
01888 // #endif
01889 //     return z;
01890 //   }
01891 
01893   // idx_dotacc
01894 
01895   template<class T> void idx_dotacc(idx<T> &i1, idx<T> &i2, idx<T> &o) {
01896     if (o.order() != 0)
01897       eblerror("output idx should have order 0");
01898     o.set(o.get() + (T) idx_dot(i1, i2));
01899   }
01900 
01902   // idx_clip
01903 
01904   template<class T> void idx_clip(idx<T> &i1, T m, idx<T> &o1) {
01905     idx_checknelems2_all(i1, o1);
01906 #if USING_FAST_ITERS == 0
01907     { idx_aloop2(i, i1, T, o, o1, T) {
01908         *o = (std::max)(m, *i);
01909       }}
01910 #else
01911     idx_aloopf2(i, i1, T, o, o1, T, { *o = std::max(m, *i); });
01912 #endif
01913   }
01914 
01916   // idx_2dconvol
01917 
01918   template<class T> void idx_2dconvol(idx<T> &in, idx<T> &kernel, idx<T> &out) {
01919     idx_checkorder3(in, 2, kernel, 2, out, 2);
01920     idx<T> uin(in.unfold(0, kernel.dim(0), 1));
01921     uin = uin.unfold(1, kernel.dim(1), 1);
01922     idx_m4dotm2(uin, kernel, out);
01923   }
01924 
01925   template<class T> void idx_3dconvol(idx<T> &in, idx<T> &kernel, idx<T> &out) {
01926     idx_bloop2(i, in, T, o, out, T) {
01927       idx_2dconvol(i, kernel, o);
01928     }
01929   }
01930 
01931   template <typename T>
01932   void idx_m1extm1(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01933     T *c2, *d1, *c1, *c2_0, *d1_0;
01934     intg c2_m0 = i2.mod(0), d1_m1 = o1.mod(1);
01935     intg c1_m0 = i1.mod(0), d1_m0 = o1.mod(0);
01936     intg j, jmax = o1.dim(1);
01937     intg i, imax = o1.dim(0);
01938     c1 = i1.idx_ptr();
01939     c2_0 = i2.idx_ptr();
01940     d1_0 = o1.idx_ptr();
01941     for (i=0; i<imax; i++) {
01942       d1 = d1_0;
01943       c2 = c2_0;
01944       for (j=0; j<jmax; j++) {
01945         *d1 = (*c1)*(*c2);
01946         d1 += d1_m1;
01947         c2 += c2_m0;
01948       }
01949       d1_0 += d1_m0;
01950       c1 += c1_m0;
01951     }
01952   }
01953 
01954   template <typename T>
01955   void idx_m1extm1acc(idx<T> &i1, idx<T> &i2, idx<T> &o1) {
01956     T *c2, *d1, *c1, *c2_0, *d1_0;
01957     intg c2_m0 = i2.mod(0), d1_m1 = o1.mod(1);
01958     intg c1_m0 = i1.mod(0), d1_m0 = o1.mod(0);
01959     intg j, jmax = o1.dim(1);
01960     intg i, imax = o1.dim(0);
01961     c1 = i1.idx_ptr();
01962     c2_0 = i2.idx_ptr();
01963     d1_0 = o1.idx_ptr();
01964     for (i=0; i<imax; i++) {
01965       d1 = d1_0;
01966       c2 = c2_0;
01967       for (j=0; j<jmax; j++) {
01968         *d1 += (*c1)*(*c2);
01969         d1 += d1_m1;
01970         c2 += c2_m0;
01971       }
01972       d1_0 += d1_m0;
01973       c1 += c1_m0;
01974     }
01975   }
01976 
01977   template <typename T>
01978   void idx_m2squdotm1(idx<T> &a, idx<T> &x, idx<T> &y) {
01979     eblerror("not implemented");
01980   }
01981 
01982   template <typename T>
01983   void idx_m2squdotm1acc(idx<T> &a, idx<T> &x, idx<T> &y) {
01984     eblerror("not implemented");
01985   }
01986 
01987   template <typename T>
01988   void idx_m1squextm1(idx<T> &x, idx<T> &y, idx<T> &a) {
01989     eblerror("not implemented");
01990   }
01991 
01992   template <typename T>
01993   void idx_m1squextm1acc(idx<T> &x, idx<T> &y, idx<T> &a) {
01994     eblerror("not implemented");
01995   }
01996 
01997   template <typename T>
01998   T idx_trace(idx<T> &m2) {
01999     idx_checkorder1(m2, 2); // must be 2D
02000     if (m2.dim(0) != m2.dim(1))
02001       eblerror("Expected a square matrix, but found: " << m2);
02002     T sum = 0;
02003     for (uint i = 0; i < m2.dim(0); ++i)
02004       sum += m2.get(i, i);
02005     return sum;
02006   }
02007 
02009   // concatenation
02010 
02011   template <typename T>
02012   idx<T> idx_concat(idx<T> &m1, idx<T> &m2, intg dim) {
02013     idxdim d(m1);
02014     // m1 and m2 must have the same order
02015     idx_checkorder2(m1, d.order(), m2, d.order());
02016     // check that all dimensions have the same size except for
02017     for (intg i = 0; i < d.order(); ++i) {
02018       if (i != dim && m1.dim(i) != m2.dim(i))
02019         eblerror("expected same dimensions except for dim " << dim
02020                  << " but found differences: " << m1 << " and " << m2);
02021     }
02022     // allocate and copy
02023     d.setdim(dim, m1.dim(dim) + m2.dim(dim));
02024     idx<T> m3(d);
02025     idx<T> tmp = m3.narrow(dim, m1.dim(dim), 0);
02026     idx_copy(m1, tmp);
02027     tmp = m3.narrow(dim, m2.dim(dim), m1.dim(dim));
02028     idx_copy(m2, tmp);
02029     return m3;
02030   }
02031 
02032   // randomization ////////////////////////////////////////////////////////////
02033 
02034   template <typename T>
02035   void idx_random(idx<T> &m, double v0, double v1) {
02036     idx_aloopf1(mm, m, T, { *mm = (T) drand(v0, v1); });
02037   }
02038 
02039   template <typename T>
02040   void idx_random(idx<T> &m, double v) {
02041     idx_aloopf1(mm, m, T, { *mm = (T) drand(v); });
02042   }
02043 
02044 } // end namespace ebl
02045 
02046 /*
02047 #ifdef __OPENMP__
02048 #include "idxops_openmp.hpp"
02049 #endif
02050 */
02051 
02052 #endif /* IDXOPS_HPP */