Update to 2.0.0 tree from current Fremantle build

[opencv] / src / cxcore / cxarithm.cpp
diff --git a/src/cxcore/cxarithm.cpp b/src/cxcore/cxarithm.cpp

new file mode 100644 (file)

index 0000000..426a347
--- /dev/null
+++ b/src/cxcore/cxarithm.cpp
@@ -0,0 +1,1716 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/* ////////////////////////////////////////////////////////////////////
+//
+//  Matrix arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
+//
+// */
+
+#include "_cxcore.h"
+
+namespace cv
+{
+
+#if CV_SSE2
+
+template<class Op8> struct VBinOp8
+{
+    int operator()(const uchar* src1, const uchar* src2, uchar* dst, int len) const
+    {
+        int x = 0;
+        for( ; x <= len - 32; x += 32 )
+        {
+            __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
+            __m128i r1 = _mm_loadu_si128((const __m128i*)(src1 + x + 16));
+            r0 = op(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
+            r1 = op(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 16)));
+            _mm_storeu_si128((__m128i*)(dst + x), r0);
+            _mm_storeu_si128((__m128i*)(dst + x + 16), r1);
+        }
+        for( ; x <= len - 8; x += 8 )
+        {
+            __m128i r0 = _mm_loadl_epi64((const __m128i*)(src1 + x));
+            r0 = op(r0,_mm_loadl_epi64((const __m128i*)(src2 + x)));
+            _mm_storel_epi64((__m128i*)(dst + x), r0);
+        }
+        return x;
+    }
+    Op8 op;
+};
+
+template<typename T, class Op16> struct VBinOp16
+{
+    int operator()(const T* src1, const T* src2, T* dst, int len) const
+    {
+        int x = 0;
+        for( ; x <= len - 16; x += 16 )
+        {
+            __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
+            __m128i r1 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
+            r0 = op(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
+            r1 = op(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 8)));
+            _mm_storeu_si128((__m128i*)(dst + x), r0);
+            _mm_storeu_si128((__m128i*)(dst + x + 8), r1);
+        }
+        for( ; x <= len - 4; x += 4 )
+        {
+            __m128i r0 = _mm_loadl_epi64((const __m128i*)(src1 + x));
+            r0 = op(r0,_mm_loadl_epi64((const __m128i*)(src2 + x)));
+            _mm_storel_epi64((__m128i*)(dst + x), r0);
+        }
+        return x;
+    }
+    Op16 op;
+};
+
+template<class Op32f> struct VBinOp32f
+{
+    int operator()(const float* src1, const float* src2, float* dst, int len) const
+    {
+        int x = 0;
+        if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
+            for( ; x <= len - 8; x += 8 )
+            {
+                __m128 r0 = _mm_load_ps(src1 + x);
+                __m128 r1 = _mm_load_ps(src1 + x + 4);
+                r0 = op(r0,_mm_load_ps(src2 + x));
+                r1 = op(r1,_mm_load_ps(src2 + x + 4));
+                _mm_store_ps(dst + x, r0);
+                _mm_store_ps(dst + x + 4, r1);
+            }
+        else
+            for( ; x <= len - 8; x += 8 )
+            {
+                __m128 r0 = _mm_loadu_ps(src1 + x);
+                __m128 r1 = _mm_loadu_ps(src1 + x + 4);
+                r0 = op(r0,_mm_loadu_ps(src2 + x));
+                r1 = op(r1,_mm_loadu_ps(src2 + x + 4));
+                _mm_storeu_ps(dst + x, r0);
+                _mm_storeu_ps(dst + x + 4, r1);
+            }
+        return x;
+    }
+    Op32f op;
+};
+
+struct _VAdd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epu8(a,b); }};
+struct _VSub8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epu8(a,b); }};
+struct _VMin8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }};
+struct _VMax8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epu8(a,b); }};
+struct _VCmpGT8u { __m128i operator()(const __m128i& a, const __m128i& b) const
+{
+    __m128i delta = _mm_set1_epi32(0x80808080);
+    return _mm_cmpgt_epi8(_mm_xor_si128(a,delta),_mm_xor_si128(b,delta));
+}};
+struct _VCmpEQ8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_cmpeq_epi8(a,b); }};
+struct _VAbsDiff8u
+{
+    __m128i operator()(const __m128i& a, const __m128i& b) const
+    { return _mm_add_epi8(_mm_subs_epu8(a,b),_mm_subs_epu8(b,a)); }
+};
+struct _VAdd16u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epu16(a,b); }};
+struct _VSub16u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epu16(a,b); }};
+struct _VMin16u
+{
+    __m128i operator()(const __m128i& a, const __m128i& b) const
+    { return _mm_subs_epu16(a,_mm_subs_epu16(a,b)); }
+};
+struct _VMax16u
+{
+    __m128i operator()(const __m128i& a, const __m128i& b) const
+    { return _mm_adds_epu16(_mm_subs_epu16(a,b),b); }
+};
+struct _VAbsDiff16u
+{
+    __m128i operator()(const __m128i& a, const __m128i& b) const
+    { return _mm_add_epi16(_mm_subs_epu16(a,b),_mm_subs_epu16(b,a)); }
+};
+struct _VAdd16s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epi16(a,b); }};
+struct _VSub16s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epi16(a,b); }};
+struct _VMin16s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epi16(a,b); }};
+struct _VMax16s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epi16(a,b); }};
+struct _VAbsDiff16s
+{
+    __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        __m128i M = _mm_max_epi16(a,b), m = _mm_min_epi16(a,b);
+        return _mm_subs_epi16(M, m);
+    }
+};
+struct _VAdd32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_add_ps(a,b); }};
+struct _VSub32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_sub_ps(a,b); }};
+struct _VMin32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_min_ps(a,b); }};
+struct _VMax32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_max_ps(a,b); }};
+static const __m128i v32f_absmask = _mm_set1_epi32(0x7fffffff);
+struct _VAbsDiff32f
+{
+    __m128 operator()(const __m128& a, const __m128& b) const
+    {
+        return _mm_and_ps(_mm_sub_ps(a,b), (__m128&)v32f_absmask);
+    }
+};
+
+struct _VAnd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_and_si128(a,b); }};
+struct _VOr8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_or_si128(a,b); }};
+struct _VXor8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_xor_si128(a,b); }};
+
+// Public names of the SSE2 kernels: each one binds a per-register operation
+// to the array driver for its element width.
+
+// 8-bit unsigned kernels
+typedef VBinOp8<_VAdd8u> VAdd8u;
+typedef VBinOp8<_VSub8u> VSub8u;
+typedef VBinOp8<_VMin8u> VMin8u;
+typedef VBinOp8<_VMax8u> VMax8u;
+typedef VBinOp8<_VAbsDiff8u> VAbsDiff8u;
+typedef VBinOp8<_VCmpEQ8u> VCmpEQ8u;
+typedef VBinOp8<_VCmpGT8u> VCmpGT8u;
+
+// 16-bit unsigned kernels
+typedef VBinOp16<ushort, _VAdd16u> VAdd16u;
+typedef VBinOp16<ushort, _VSub16u> VSub16u;
+typedef VBinOp16<ushort, _VMin16u> VMin16u;
+typedef VBinOp16<ushort, _VMax16u> VMax16u;
+typedef VBinOp16<ushort, _VAbsDiff16u> VAbsDiff16u;
+
+// 16-bit signed kernels
+typedef VBinOp16<short, _VAdd16s> VAdd16s;
+typedef VBinOp16<short, _VSub16s> VSub16s;
+typedef VBinOp16<short, _VMin16s> VMin16s;
+typedef VBinOp16<short, _VMax16s> VMax16s;
+typedef VBinOp16<short, _VAbsDiff16s> VAbsDiff16s;
+
+// single-precision float kernels
+typedef VBinOp32f<_VAdd32f> VAdd32f;
+typedef VBinOp32f<_VSub32f> VSub32f;
+typedef VBinOp32f<_VMin32f> VMin32f;
+typedef VBinOp32f<_VMax32f> VMax32f;
+typedef VBinOp32f<_VAbsDiff32f> VAbsDiff32f;
+
+// bitwise kernels (operate on raw bytes through the 8-bit driver)
+typedef VBinOp8<_VAnd8u> VAnd8u;
+typedef VBinOp8<_VOr8u> VOr8u;
+typedef VBinOp8<_VXor8u> VXor8u;
+
+#else
+
+// Build without CV_SSE2: every kernel name is an alias of NoVec, so the code
+// below can name the same types in both configurations.
+// NOTE(review): NoVec is defined outside this file; presumably it processes
+// nothing and reports 0 elements done - confirm against its definition.
+
+// 8-bit unsigned kernels
+typedef NoVec VAdd8u;
+typedef NoVec VSub8u;
+typedef NoVec VMin8u;
+typedef NoVec VMax8u;
+typedef NoVec VAbsDiff8u;
+typedef NoVec VCmpEQ8u;
+typedef NoVec VCmpGT8u;
+
+// 16-bit unsigned kernels
+typedef NoVec VAdd16u;
+typedef NoVec VSub16u;
+typedef NoVec VMin16u;
+typedef NoVec VMax16u;
+typedef NoVec VAbsDiff16u;
+
+// 16-bit signed kernels
+typedef NoVec VAdd16s;
+typedef NoVec VSub16s;
+typedef NoVec VMin16s;
+typedef NoVec VMax16s;
+typedef NoVec VAbsDiff16s;
+
+// single-precision float kernels
+typedef NoVec VAdd32f;
+typedef NoVec VSub32f;
+typedef NoVec VMin32f;
+typedef NoVec VMax32f;
+typedef NoVec VAbsDiff32f;
+
+// bitwise kernels
+typedef NoVec VAnd8u;
+typedef NoVec VOr8u;
+typedef NoVec VXor8u;
+
+#endif
+
+/****************************************************************************************\
+*                                   logical operations                                   *
+\****************************************************************************************/
+
+template<typename T> struct AndOp
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator()( T a, T b ) const { return a & b; }
+};
+
+template<typename T> struct OrOp
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator()( T a, T b ) const { return a | b; }
+};
+
+template<typename T> struct XorOp
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator()( T a, T b ) const { return a ^ b; }
+};
+
+template<class OPB, class OPI, class OPV> static void
+bitwiseOp_( const Mat& srcmat1, const Mat& srcmat2, Mat& dstmat )
+{
+    OPB opb; OPI opi; OPV opv;
+    const uchar* src1 = srcmat1.data;
+    const uchar* src2 = srcmat2.data;
+    uchar* dst = dstmat.data;
+    size_t step1 = srcmat1.step, step2 = srcmat2.step, step = dstmat.step;
+    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, (int)srcmat1.elemSize() );
+
+    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+    {
+        int i = opv(src1, src2, dst, size.width);
+
+        if( (((size_t)src1 | (size_t)src2 | (size_t)dst) & 3) == 0 )
+        {
+            for( ; i <= size.width - 16; i += 16 )
+            {
+                int t0 = opi(((const int*)(src1+i))[0], ((const int*)(src2+i))[0]);
+                int t1 = opi(((const int*)(src1+i))[1], ((const int*)(src2+i))[1]);
+
+                ((int*)(dst+i))[0] = t0;
+                ((int*)(dst+i))[1] = t1;
+
+                t0 = opi(((const int*)(src1+i))[2], ((const int*)(src2+i))[2]);
+                t1 = opi(((const int*)(src1+i))[3], ((const int*)(src2+i))[3]);
+
+                ((int*)(dst+i))[2] = t0;
+                ((int*)(dst+i))[3] = t1;
+            }
+
+            for( ; i <= size.width - 4; i += 4 )
+            {
+                int t = opi(*(const int*)(src1+i), *(const int*)(src2+i));
+                *(int*)(dst+i) = t;
+            }
+        }
+
+        for( ; i < size.width; i++ )
+            dst[i] = opb(src1[i], src2[i]);
+    }
+}
+
+
+// Applies a bitwise binary operation between every element of a matrix and a
+// scalar, treating every row as a run of raw bytes.
+//   OPB - operation on single bytes
+//   OPI - the same operation on 32-bit words
+//   OPV - vector kernel; returns how many leading bytes it handled
+// The scalar is expanded once into a `delta`-byte pattern buffer; each row is
+// then processed in `delta`-byte chunks against that buffer, followed by a
+// byte-wise tail of fewer than `delta` bytes.
+template<class OPB, class OPI, class OPV> static void
+bitwiseSOp_( const Mat& srcmat, Mat& dstmat, const Scalar& _scalar )
+{
+    OPB opb; OPI opi; OPV opv;
+    const uchar* src0 = srcmat.data;
+    uchar* dst0 = dstmat.data;
+    size_t step1 = srcmat.step, step = dstmat.step;
+    Size size = getContinuousSize( srcmat, dstmat, (int)srcmat.elemSize() );
+    // NOTE(review): 96 is presumably chosen so that a whole number of elements
+    // fits for every supported element size - confirm against scalarToRawData.
+    const int delta = 96;
+    uchar scalar[delta];
+    // fill the pattern buffer; the last argument is the buffer length expressed
+    // in single-channel elements
+    scalarToRawData(_scalar, scalar, srcmat.type(), (int)(delta/srcmat.elemSize1()) );
+
+    for( ; size.height--; src0 += step1, dst0 += step )
+    {
+        const uchar* src = (const uchar*)src0;
+        uchar* dst = dst0;
+        int i, len = size.width;
+
+        // word-wise path only when both row pointers are 4-byte aligned
+        if( (((size_t)src|(size_t)dst) & 3) == 0 )
+        {
+            // `len` is decremented before the test, so after the loop it holds
+            // (remaining bytes - delta); it is corrected before the tail loop
+            while( (len -= delta) >= 0 )
+            {
+                // vector kernel first, then 16 bytes (four words) per iteration
+                i = opv(src, scalar, dst, delta);
+                for( ; i < delta; i += 16 )
+                {
+                    int t0 = opi(((const int*)(src+i))[0], ((const int*)(scalar+i))[0]);
+                    int t1 = opi(((const int*)(src+i))[1], ((const int*)(scalar+i))[1]);
+                    ((int*)(dst+i))[0] = t0;
+                    ((int*)(dst+i))[1] = t1;
+
+                    t0 = opi(((const int*)(src+i))[2], ((const int*)(scalar+i))[2]);
+                    t1 = opi(((const int*)(src+i))[3], ((const int*)(scalar+i))[3]);
+                    ((int*)(dst+i))[2] = t0;
+                    ((int*)(dst+i))[3] = t1;
+                }
+                src += delta;
+                dst += delta;
+            }
+        }
+        else
+        {
+            // unaligned rows: byte operations, four per iteration
+            while( (len -= delta) >= 0 )
+            {
+                for( i = 0; i < delta; i += 4 )
+                {
+                    uchar t0 = opb(src[i], scalar[i]);
+                    uchar t1 = opb(src[i+1], scalar[i+1]);
+                    dst[i] = t0; dst[i+1] = t1;
+
+                    t0 = opb(src[i+2], scalar[i+2]);
+                    t1 = opb(src[i+3], scalar[i+3]);
+                    dst[i+2] = t0; dst[i+3] = t1;
+                }
+                src += delta;
+                dst += delta;
+            }
+        }
+
+        // undo the last decrement to get the tail length, then finish byte-wise
+        for( len += delta, i = 0; i < len; i++ )
+            dst[i] = opb(src[i],scalar[i]);
+    }
+}
+
+static void
+binaryMaskOp( const Mat& src1, const Mat& src2, Mat& dst,
+              const Mat& mask, BinaryFunc func )
+{
+    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() && func != 0 );
+    dst.create( src1.size(), src1.type() );
+
+    if( !mask.data )
+        func(src1, src2, dst);
+    else
+    {
+        AutoBuffer<uchar> buf;
+        size_t esz = dst.elemSize(), buf_step = dst.cols*esz;
+        CopyMaskFunc copym_func = getCopyMaskFunc((int)esz);
+        int y, dy;
+
+        CV_Assert(mask.type() == CV_8UC1 && mask.size() == dst.size());
+        dy = std::min(std::max((int)(CV_MAX_LOCAL_SIZE/buf_step), 1), dst.rows);
+        buf.allocate( buf_step*dy );
+
+        for( y = 0; y < dst.rows; y += dy )
+        {
+            dy = std::min(dy, dst.rows - y);
+            Mat dstpart = dst.rowRange(y, y + dy);
+            Mat temp(dy, dst.cols, dst.type(), (uchar*)buf );
+            func( src1.rowRange(y, y + dy), src2.rowRange(y, y + dy), temp );
+            copym_func( temp, dstpart, mask.rowRange(y, y + dy) );
+        }
+    }
+}
+
+
+static void
+binarySMaskOp( const Mat& src1, const Scalar& s, Mat& dst,
+               const Mat& mask, BinarySFuncCn func )
+{
+    CV_Assert( func != 0 );
+    dst.create( src1.size(), src1.type() );
+
+    if( !mask.data )
+        func(src1, dst, s);
+    else
+    {
+        AutoBuffer<uchar> buf;
+        size_t esz = dst.elemSize(), buf_step = dst.cols*esz;
+        CopyMaskFunc copym_func = getCopyMaskFunc((int)esz);
+        int y, dy;
+
+        CV_Assert(mask.type() == CV_8UC1 && mask.size() == dst.size());
+        dy = std::min(std::max((int)(CV_MAX_LOCAL_SIZE/buf_step), 1), dst.rows);
+        buf.allocate( buf_step*dy );
+
+        for( y = 0; y < dst.rows; y += dy )
+        {
+            dy = std::min(dy, dst.rows - y);
+            Mat dstpart = dst.rowRange(y, y + dy);
+            Mat temp(dy, dst.cols, dst.type(), (uchar*)buf);
+            func( src1.rowRange(y, y + dy), temp, s );
+            copym_func( temp, dstpart, mask.rowRange(y, y + dy) );
+        }
+    }
+}
+
+
+void bitwise_and(const Mat& a, const Mat& b, Mat& c, const Mat& mask)
+{
+    binaryMaskOp(a, b, c, mask, bitwiseOp_<AndOp<uchar>, AndOp<int>, VAnd8u>);
+}
+
+void bitwise_or(const Mat& a, const Mat& b, Mat& c, const Mat& mask)
+{
+    binaryMaskOp(a, b, c, mask, bitwiseOp_<OrOp<uchar>, OrOp<int>, VOr8u>);
+}
+
+void bitwise_xor(const Mat& a, const Mat& b, Mat& c, const Mat& mask)
+{
+    binaryMaskOp(a, b, c, mask, bitwiseOp_<XorOp<uchar>, XorOp<int>, VXor8u>);
+}
+
+void bitwise_and(const Mat& a, const Scalar& s, Mat& c, const Mat& mask)
+{
+    binarySMaskOp(a, s, c, mask,
+        bitwiseSOp_<AndOp<uchar>, AndOp<int>, VAnd8u>);
+}
+
+void bitwise_or(const Mat& a, const Scalar& s, Mat& c, const Mat& mask)
+{
+    binarySMaskOp(a, s, c, mask,
+        bitwiseSOp_<OrOp<uchar>, OrOp<int>, VOr8u>);
+}
+
+void bitwise_xor(const Mat& a, const Scalar& s, Mat& c, const Mat& mask)
+{
+    binarySMaskOp(a, s, c, mask,
+        bitwiseSOp_<XorOp<uchar>, XorOp<int>, VXor8u>);
+}
+
+
+void bitwise_not(const Mat& src, Mat& dst)
+{
+    const uchar* sptr = src.data;
+    uchar* dptr = dst.data;
+    dst.create( src.size(), src.type() );
+    Size size = getContinuousSize( src, dst, (int)src.elemSize() );
+
+    for( ; size.height--; sptr += src.step, dptr += dst.step )
+    {
+        int i = 0;
+        if( (((size_t)sptr | (size_t)dptr) & 3) == 0 )
+        {
+            for( ; i <= size.width - 16; i += 16 )
+            {
+                int t0 = ~((const int*)(sptr+i))[0];
+                int t1 = ~((const int*)(sptr+i))[1];
+
+                ((int*)(dptr+i))[0] = t0;
+                ((int*)(dptr+i))[1] = t1;
+
+                t0 = ~((const int*)(sptr+i))[2];
+                t1 = ~((const int*)(sptr+i))[3];
+
+                ((int*)(dptr+i))[2] = t0;
+                ((int*)(dptr+i))[3] = t1;
+            }
+
+            for( ; i <= size.width - 4; i += 4 )
+                *(int*)(dptr+i) = ~*(const int*)(sptr+i);
+        }
+
+        for( ; i < size.width; i++ )
+        {
+            dptr[i] = (uchar)(~sptr[i]);
+        }
+    }
+}
+
+/****************************************************************************************\
+*                                      add/subtract                                      *
+\****************************************************************************************/
+
+// 8-bit specializations: the operands are promoted to int by the arithmetic,
+// and the int result goes back to uchar through CV_FAST_CAST_8U.
+// NOTE(review): CV_FAST_CAST_8U is defined elsewhere; presumably it saturates
+// to [0,255] - confirm against its definition.
+template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
+{
+    return CV_FAST_CAST_8U(a + b);
+}
+
+template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
+{
+    return CV_FAST_CAST_8U(a - b);
+}
+
+// Element-wise addition kernels, indexed by matrix depth (see the
+// addTab[CV_MAT_DEPTH(type)] lookup in add()). A 0 entry marks a depth with
+// no kernel; callers assert on it.
+// NOTE(review): slot 1 is presumably the signed 8-bit depth and slot 7 an
+// unused depth code - confirm against the depth constants.
+static BinaryFunc addTab[] =
+{
+    binaryOpC1_<OpAdd<uchar>,VAdd8u>, 0,
+    binaryOpC1_<OpAdd<ushort>,VAdd16u>,
+    binaryOpC1_<OpAdd<short>,VAdd16s>,
+    binaryOpC1_<OpAdd<int>,NoVec>,
+    binaryOpC1_<OpAdd<float>,VAdd32f>,
+    binaryOpC1_<OpAdd<double>,NoVec>, 0
+};
+
+// Element-wise subtraction kernels, laid out exactly like addTab.
+static BinaryFunc subTab[] =
+{
+    binaryOpC1_<OpSub<uchar>,VSub8u>, 0,
+    binaryOpC1_<OpSub<ushort>,VSub16u>,
+    binaryOpC1_<OpSub<short>,VSub16s>,
+    binaryOpC1_<OpSub<int>,NoVec>,
+    binaryOpC1_<OpSub<float>,VSub32f>,
+    binaryOpC1_<OpSub<double>,NoVec>, 0
+};
+
+
+// dst = src1 + src2, element by element.
+// Both inputs must have the same size and type, and the depth must have an
+// entry in addTab; dst is created with that size and type.
+void add( const Mat& src1, const Mat& src2, Mat& dst )
+{
+    const int type = src1.type();
+    const Size size = src1.size();
+    BinaryFunc func = addTab[CV_MAT_DEPTH(type)];
+
+    CV_Assert( size == src2.size() && type == src2.type() && func != 0 );
+
+    dst.create( size, type );
+    func(src1, src2, dst);
+}
+
+// dst = src1 - src2, element by element.
+// Both inputs must have the same size and type, and the depth must have an
+// entry in subTab; dst is created with that size and type.
+void subtract( const Mat& src1, const Mat& src2, Mat& dst )
+{
+    const int type = src1.type();
+    const Size size = src1.size();
+    BinaryFunc func = subTab[CV_MAT_DEPTH(type)];
+
+    CV_Assert( size == src2.size() && type == src2.type() && func != 0 );
+
+    dst.create( size, type );
+    func(src1, src2, dst);
+}
+
+void subtract(const Mat& a, const Scalar& s, Mat& c, const Mat& mask)
+{
+    add(a, -s, c, mask);
+}
+
+void add(const Mat& src1, const Mat& src2, Mat& dst, const Mat& mask)
+{
+    binaryMaskOp(src1, src2, dst, mask, addTab[src1.depth()] );
+}
+
+void subtract(const Mat& src1, const Mat& src2, Mat& dst, const Mat& mask)
+{
+    binaryMaskOp(src1, src2, dst, mask, subTab[src1.depth()] );
+}
+
+void add(const Mat& src1, const Scalar& s, Mat& dst, const Mat& mask)
+{
+    static BinarySFuncCn addSTab[] =
+    {
+        binarySOpCn_<OpAdd<uchar, int, uchar> >, 0,
+        binarySOpCn_<OpAdd<ushort, int, ushort> >,
+        binarySOpCn_<OpAdd<short, int, short> >,
+        binarySOpCn_<OpAdd<int> >,
+        binarySOpCn_<OpAdd<float> >,
+        binarySOpCn_<OpAdd<double> >, 0
+    };
+    int depth = src1.depth();
+    binarySMaskOp(src1, s, dst, mask, addSTab[depth]);
+}
+
+void subtract(const Scalar& s, const Mat& src1, Mat& dst, const Mat& mask)
+{
+    static BinarySFuncCn rsubSTab[] =
+    {
+        binarySOpCn_<OpRSub<uchar, int, uchar> >, 0,
+        binarySOpCn_<OpRSub<ushort, int, ushort> >,
+        binarySOpCn_<OpRSub<short, int, short> >,
+        binarySOpCn_<OpRSub<int> >,
+        binarySOpCn_<OpRSub<float> >,
+        binarySOpCn_<OpRSub<double> >, 0
+    };
+    int depth = src1.depth();
+    binarySMaskOp(src1, s, dst, mask, rsubSTab[depth]);
+}
+
+/****************************************************************************************\
+*                                    multiply/divide                                     *
+\****************************************************************************************/
+
+// Element-wise scaled product: dst = saturate_cast<T>(scale * src1 * src2).
+//   T  - element type of all three matrices
+//   WT - type the scale factor is converted to for the scaled path
+// When _scale is within DBL_EPSILON of 1 the scale factor is skipped and the
+// product src1[i] * src2[i] is formed directly from the T operands.
+// Both paths handle four elements per iteration plus a scalar tail.
+template<typename T, typename WT> static void
+mul_( const Mat& srcmat1, const Mat& srcmat2, Mat& dstmat, double _scale )
+{
+    const T* src1 = (const T*)srcmat1.data;
+    const T* src2 = (const T*)srcmat2.data;
+    T* dst = (T*)dstmat.data;
+    // row strides converted from bytes to elements of T
+    size_t step1 = srcmat1.step/sizeof(src1[0]);
+    size_t step2 = srcmat2.step/sizeof(src2[0]);
+    size_t step = dstmat.step/sizeof(dst[0]);
+    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, dstmat.channels() );
+
+    if( fabs(_scale - 1.) < DBL_EPSILON )
+    {
+        // unscaled path
+        for( ; size.height--; src1+=step1, src2+=step2, dst+=step )
+        {
+            int i;
+            for( i = 0; i <= size.width - 4; i += 4 )
+            {
+                T t0 = saturate_cast<T>(src1[i] * src2[i]);
+                T t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
+                dst[i] = t0; dst[i+1] = t1;
+
+                t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
+                t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
+                dst[i+2] = t0; dst[i+3] = t1;
+            }
+
+            for( ; i < size.width; i++ )
+                dst[i] = saturate_cast<T>(src1[i] * src2[i]);
+        }
+    }
+    else
+    {
+        // scaled path: the first operand is converted to WT before multiplying
+        WT scale = (WT)_scale;
+        for( ; size.height--; src1+=step1, src2+=step2, dst+=step )
+        {
+            int i;
+            for( i = 0; i <= size.width - 4; i += 4 )
+            {
+                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
+                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
+                dst[i] = t0; dst[i+1] = t1;
+
+                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
+                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
+                dst[i+2] = t0; dst[i+3] = t1;
+            }
+
+            for( ; i < size.width; i++ )
+                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
+        }
+    }
+}
+
+typedef void (*MulDivFunc)( const Mat& src1, const Mat& src2,
+                            Mat& dst, double scale );
+
+void multiply(const Mat& src1, const Mat& src2, Mat& dst, double scale)
+{
+    static MulDivFunc tab[] =
+    {
+        mul_<uchar, float>, 0, mul_<ushort, float>, mul_<short, float>,
+        mul_<int, double>, mul_<float, float>, mul_<double, double>, 0
+    };
+
+    MulDivFunc func = tab[src1.depth()];
+    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() && func != 0 );
+    dst.create( src1.size(), src1.type() );
+    func( src1, src2, dst, scale );
+}
+
+
+template<typename T> static void
+div_( const Mat& srcmat1, const Mat& srcmat2, Mat& dstmat, double scale )
+{
+    const T* src1 = (const T*)srcmat1.data;
+    const T* src2 = (const T*)srcmat2.data;
+    T* dst = (T*)dstmat.data;
+    size_t step1 = srcmat1.step/sizeof(src1[0]);
+    size_t step2 = srcmat2.step/sizeof(src2[0]);
+    size_t step = dstmat.step/sizeof(dst[0]);
+    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, dstmat.channels() );
+
+    for( ; size.height--; src1+=step1, src2+=step2, dst+=step )
+    {
+        int i = 0;
+        for( ; i <= size.width - 4; i += 4 )
+        {
+            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
+            {
+                double a = (double)src2[i] * src2[i+1];
+                double b = (double)src2[i+2] * src2[i+3];
+                double d = scale/(a * b);
+                b *= d;
+                a *= d;
+
+                T z0 = saturate_cast<T>(src2[i+1] * src1[i] * b);
+                T z1 = saturate_cast<T>(src2[i] * src1[i+1] * b);
+                T z2 = saturate_cast<T>(src2[i+3] * src1[i+2] * a);
+                T z3 = saturate_cast<T>(src2[i+2] * src1[i+3] * a);
+
+                dst[i] = z0; dst[i+1] = z1;
+                dst[i+2] = z2; dst[i+3] = z3;
+            }
+            else
+            {
+                T z0 = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
+                T z1 = src2[i+1] != 0 ? saturate_cast<T>(src1[i+1]*scale/src2[i+1]) : 0;
+                T z2 = src2[i+2] != 0 ? saturate_cast<T>(src1[i+2]*scale/src2[i+2]) : 0;
+                T z3 = src2[i+3] != 0 ? saturate_cast<T>(src1[i+3]*scale/src2[i+3]) : 0;
+
+                dst[i] = z0; dst[i+1] = z1;
+                dst[i+2] = z2; dst[i+3] = z3;
+            }
+        }
+
+        for( ; i < size.width; i++ )
+            dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
+    }
+}
+
+
+void divide(const Mat& src1, const Mat& src2, Mat& dst, double scale)
+{
+    static MulDivFunc tab[] =
+    {
+        div_<uchar>, 0, div_<ushort>, div_<short>,
+        div_<int>, div_<float>, div_<double>, 0
+    };
+
+    MulDivFunc func = tab[src1.depth()];
+    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() && func != 0 );
+    dst.create( src1.size(), src1.type() );
+    func( src1, src2, dst, scale );
+}
+
+// Element-wise scaled reciprocal: dst = saturate_cast<T>(scale / src2),
+// with 0 written wherever src2 is 0.
+//
+// Four elements are handled per iteration. When all four are non-zero a
+// single floating-point division is shared between them:
+//   a = s0*s1, b = s2*s3, d = scale/(a*b)
+//   b*d = scale/(s0*s1)  ->  s1 * (b*d) = scale/s0, and so on.
+// If any element is zero the group falls back to one division per element.
+template<typename T> static void
+recip_( double scale, const Mat& srcmat2, Mat& dstmat )
+{
+    const T* src2 = (const T*)srcmat2.data;
+    T* dst = (T*)dstmat.data;
+    // row strides converted from bytes to elements of T
+    size_t step2 = srcmat2.step/sizeof(src2[0]);
+    size_t step = dstmat.step/sizeof(dst[0]);
+    Size size = getContinuousSize( srcmat2, dstmat, dstmat.channels() );
+
+    for( ; size.height--; src2+=step2, dst+=step )
+    {
+        int i = 0;
+        for( ; i <= size.width - 4; i += 4 )
+        {
+            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
+            {
+                // shared-division fast path
+                double a = (double)src2[i] * src2[i+1];
+                double b = (double)src2[i+2] * src2[i+3];
+                double d = scale/(a * b);
+                b *= d;
+                a *= d;
+
+                T z0 = saturate_cast<T>(src2[i+1] * b);
+                T z1 = saturate_cast<T>(src2[i] * b);
+                T z2 = saturate_cast<T>(src2[i+3] * a);
+                T z3 = saturate_cast<T>(src2[i+2] * a);
+
+                dst[i] = z0; dst[i+1] = z1;
+                dst[i+2] = z2; dst[i+3] = z3;
+            }
+            else
+            {
+                // at least one zero: divide individually, 0 for scale/0
+                T z0 = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
+                T z1 = src2[i+1] != 0 ? saturate_cast<T>(scale/src2[i+1]) : 0;
+                T z2 = src2[i+2] != 0 ? saturate_cast<T>(scale/src2[i+2]) : 0;
+                T z3 = src2[i+3] != 0 ? saturate_cast<T>(scale/src2[i+3]) : 0;
+
+                dst[i] = z0; dst[i+1] = z1;
+                dst[i+2] = z2; dst[i+3] = z3;
+            }
+        }
+
+        // scalar tail
+        for( ; i < size.width; i++ )
+            dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
+    }
+}
+
+typedef void (*RecipFunc)( double scale, const Mat& src, Mat& dst );
+
+void divide(double scale, const Mat& src, Mat& dst)
+{
+    static RecipFunc tab[] =
+    {
+        recip_<uchar>, 0, recip_<ushort>, recip_<short>,
+        recip_<int>, recip_<float>, recip_<double>, 0
+    };
+
+    RecipFunc func = tab[src.depth()];
+    CV_Assert( func != 0 );
+    dst.create( src.size(), src.type() );
+    func( scale, src, dst );
+}
+
+/****************************************************************************************\
+*                                      addWeighted                                       *
+\****************************************************************************************/
+
+template<typename T, typename WT> static void
+addWeighted_( const Mat& srcmat1, double _alpha, const Mat& srcmat2,
+              double _beta, double _gamma, Mat& dstmat )
+{
+    const T* src1 = (const T*)srcmat1.data;
+    const T* src2 = (const T*)srcmat2.data;
+    T* dst = (T*)dstmat.data;
+    size_t step1 = srcmat1.step/sizeof(src1[0]);
+    size_t step2 = srcmat2.step/sizeof(src2[0]);
+    size_t step = dstmat.step/sizeof(dst[0]);
+    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, dstmat.channels() );
+    WT alpha = (WT)_alpha, beta = (WT)_beta, gamma = (WT)_gamma;
+
+    for( ; size.height--; src1+=step1, src2+=step2, dst+=step )
+    {
+        int i = 0;
+        for( ; i <= size.width - 4; i += 4 )
+        {
+            T t0 = saturate_cast<T>(src1[i]*alpha + src2[i]*beta + gamma);
+            T t1 = saturate_cast<T>(src1[i+1]*alpha + src2[i+1]*beta + gamma);
+            dst[i] = t0; dst[i+1] = t1;
+
+            t0 = saturate_cast<T>(src1[i+2]*alpha + src2[i+2]*beta + gamma);
+            t1 = saturate_cast<T>(src1[i+3]*alpha + src2[i+3]*beta + gamma);
+            dst[i+2] = t0; dst[i+3] = t1;
+        }
+
+        for( ; i < size.width; i++ )
+            dst[i] = saturate_cast<T>(src1[i]*alpha + src2[i]*beta + gamma);
+    }
+}
+
+
+// 8-bit unsigned implementation of dst = src1*alpha + src2*beta + gamma.
+//
+// Inputs with at most 256 elements in total, or with large weights
+// (|alpha| > 256, |beta| > 256 or |gamma| > 256*256), are delegated to the
+// generic addWeighted_<uchar, float>.  Otherwise the sum is evaluated in
+// fixed point with `shift` fractional bits using two 256-entry lookup tables
+// indexed by the source pixel values.
+static void
+addWeighted8u( const Mat& srcmat1, double alpha,
+               const Mat& srcmat2, double beta,
+               double gamma, Mat& dstmat )
+{
+    // Number of fractional bits of the fixed-point representation.
+    const int shift = 14;
+    if( srcmat1.rows*srcmat1.cols*srcmat1.channels() <= 256 ||
+        fabs(alpha) > 256 || fabs(beta) > 256 || fabs(gamma) > 256*256 )
+    {
+        addWeighted_<uchar, float>(srcmat1, alpha, srcmat2, beta, gamma, dstmat);
+        return;
+    }
+    const uchar* src1 = srcmat1.data;
+    const uchar* src2 = srcmat2.data;
+    uchar* dst = dstmat.data;
+    size_t step1 = srcmat1.step;
+    size_t step2 = srcmat2.step;
+    size_t step = dstmat.step;
+    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, dstmat.channels() );
+
+    int tab1[256], tab2[256];
+    double t = 0;
+    int j, t0, t1, t2, t3;
+
+    // Scale the weights to fixed point.  gamma additionally receives half a
+    // unit (1 << (shift - 1)) so that the final ">> shift" rounds the result.
+    alpha *= 1 << shift;
+    gamma = gamma*(1 << shift) + (1 << (shift - 1));
+    beta *= 1 << shift;
+
+    // tab1[j] holds j*alpha and tab2[j] holds gamma + j*beta (scaled values),
+    // accumulated incrementally in double precision and rounded by cvRound.
+    // NOTE(review): with weights close to the accepted limits the int sum
+    // tab1[..] + tab2[..] used below may exceed INT_MAX - verify.
+    for( j = 0; j < 256; j++ )
+    {
+        tab1[j] = cvRound(t);
+        tab2[j] = cvRound(gamma);
+        t += alpha;
+        gamma += beta;
+    }
+
+    // Results for the four combinations of extreme input values (0 and 255
+    // for each source).
+    t0 = (tab1[0] + tab2[0]) >> shift;
+    t1 = (tab1[0] + tab2[255]) >> shift;
+    t2 = (tab1[255] + tab2[0]) >> shift;
+    t3 = (tab1[255] + tab2[255]) >> shift;
+
+    // The test (unsigned)(t+256) < 768 holds exactly when -256 <= t < 512.
+    // CV_FAST_CAST_8U is used only if all four extreme results lie in that
+    // interval; presumably the macro is valid only there (it is defined
+    // outside this file section - confirm).
+    if( (unsigned)(t0+256) < 768 && (unsigned)(t1+256) < 768 &&
+        (unsigned)(t2+256) < 768 && (unsigned)(t3+256) < 768 )
+    {
+        // use faster table-based conversion back to 8u
+        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+        {
+            int i;
+
+            // four elements per iteration
+            for( i = 0; i <= size.width - 4; i += 4 )
+            {
+                t0 = CV_FAST_CAST_8U((tab1[src1[i]] + tab2[src2[i]]) >> shift);
+                t1 = CV_FAST_CAST_8U((tab1[src1[i+1]] + tab2[src2[i+1]]) >> shift);
+
+                dst[i] = (uchar)t0;
+                dst[i+1] = (uchar)t1;
+
+                t0 = CV_FAST_CAST_8U((tab1[src1[i+2]] + tab2[src2[i+2]]) >> shift);
+                t1 = CV_FAST_CAST_8U((tab1[src1[i+3]] + tab2[src2[i+3]]) >> shift);
+
+                dst[i+2] = (uchar)t0;
+                dst[i+3] = (uchar)t1;
+            }
+
+            // remaining elements of the row
+            for( ; i < size.width; i++ )
+            {
+                t0 = CV_FAST_CAST_8U((tab1[src1[i]] + tab2[src2[i]]) >> shift);
+                dst[i] = (uchar)t0;
+            }
+        }
+    }
+    else
+    {
+        // use universal macro for conversion back to 8u
+        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
+        {
+            int i;
+
+            // four elements per iteration
+            for( i = 0; i <= size.width - 4; i += 4 )
+            {
+                t0 = (tab1[src1[i]] + tab2[src2[i]]) >> shift;
+                t1 = (tab1[src1[i+1]] + tab2[src2[i+1]]) >> shift;
+
+                dst[i] = CV_CAST_8U( t0 );
+                dst[i+1] = CV_CAST_8U( t1 );
+
+                t0 = (tab1[src1[i+2]] + tab2[src2[i+2]]) >> shift;
+                t1 = (tab1[src1[i+3]] + tab2[src2[i+3]]) >> shift;
+
+                dst[i+2] = CV_CAST_8U( t0 );
+                dst[i+3] = CV_CAST_8U( t1 );
+            }
+
+            // remaining elements of the row
+            for( ; i < size.width; i++ )
+            {
+                t0 = (tab1[src1[i]] + tab2[src2[i]]) >> shift;
+                dst[i] = CV_CAST_8U( t0 );
+            }
+        }
+    }
+}
+
+// Pointer to a per-depth weighted-sum implementation (addWeighted8u or an
+// addWeighted_<T, WT> instantiation); arguments are passed in the same order
+// as in addWeighted().
+typedef void (*AddWeightedFunc)( const Mat& src1, double alpha, const Mat& src2,
+                                 double beta, double gamma, Mat& dst );
+
+void addWeighted( const Mat& src1, double alpha, const Mat& src2,
+                  double beta, double gamma, Mat& dst )
+{
+    static AddWeightedFunc tab[]=
+    {
+        addWeighted8u, 0, addWeighted_<ushort, float>, addWeighted_<short, float>,
+        addWeighted_<int, double>, addWeighted_<float, float>, addWeighted_<double, double>, 0
+    };
+
+    AddWeightedFunc func = tab[src1.depth()];
+    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() && func != 0 );
+    dst.create( src1.size(), src1.type() );
+    func( src1, alpha, src2, beta, gamma, dst );
+}
+
+
+/****************************************************************************************\
+*                                      absdiff                                           *
+\****************************************************************************************/
+
+// Binary functor returning |a - b| cast back to T with a plain cast.
+// The nested typedefs name the operand types (type1, type2) and the result
+// type (rtype).
+template<typename T> struct OpAbsDiff
+{
+    typedef T rtype;
+    typedef T type1;
+    typedef T type2;
+
+    T operator()(T lhs, T rhs)
+    {
+        return (T)std::abs(lhs - rhs);
+    }
+};
+
+// short specialization: the difference is computed in int and can fall
+// outside the short range, so the result goes through saturate_cast<short>
+// instead of a plain cast.
+template<> inline short OpAbsDiff<short>::operator ()(short lhs, short rhs)
+{
+    return saturate_cast<short>(std::abs(lhs - rhs));
+}
+
+// Functor returning |a - b| for an element `a` of type T and a second operand
+// `b` of working type WT (T by default); the result is converted back to T
+// with saturate_cast.
+template<typename T, typename WT=T> struct OpAbsDiffS
+{
+    typedef T rtype;
+    typedef T type1;
+    typedef WT type2;
+
+    T operator()(T elem, WT scalar)
+    {
+        return saturate_cast<T>(std::abs(elem - scalar));
+    }
+};
+
+void absdiff( const Mat& src1, const Mat& src2, Mat& dst )
+{
+    static BinaryFunc tab[] =
+    {
+        binaryOpC1_<OpAbsDiff<uchar>,VAbsDiff8u>, 0,
+        binaryOpC1_<OpAbsDiff<ushort>,VAbsDiff16u>,
+        binaryOpC1_<OpAbsDiff<short>,VAbsDiff16s>,
+        binaryOpC1_<OpAbsDiff<int>,NoVec>,
+        binaryOpC1_<OpAbsDiff<float>,VAbsDiff32f>,
+        binaryOpC1_<OpAbsDiff<double>,NoVec>, 0
+    };
+
+    dst.create(src1.size(), src1.type());
+    BinaryFunc func = tab[src1.depth()];
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type() && func != 0);
+    func( src1, src2, dst );
+}
+
+
+void absdiff( const Mat& src1, const Scalar& s, Mat& dst )
+{
+    static BinarySFuncCn tab[] =
+    {
+        binarySOpCn_<OpAbsDiffS<uchar, int> >, 0,
+        binarySOpCn_<OpAbsDiffS<ushort, int> >,
+        binarySOpCn_<OpAbsDiffS<short, int> >,
+        binarySOpCn_<OpAbsDiffS<int> >,
+        binarySOpCn_<OpAbsDiffS<float> >,
+        binarySOpCn_<OpAbsDiffS<double> >, 0
+    };
+
+    dst.create(src1.size(), src1.type());
+    BinarySFuncCn func = tab[src1.depth()];
+    CV_Assert(src1.channels() <= 4 && func != 0);
+    func( src1, dst, s );
+}
+
+/****************************************************************************************\
+*                                      inRange[S]                                        *
+\****************************************************************************************/
+
+// Single-channel range test: yields 255 when a <= x < b (lower bound
+// inclusive, upper bound exclusive) and 0 otherwise.
+//   xtype - type of the tested element;
+//   btype - type of the two bounds.
+template<typename T, typename WT> struct InRangeC1
+{
+    typedef T xtype;
+    typedef WT btype;
+
+    uchar operator()(xtype x, btype a, btype b) const
+    {
+        const bool inside = a <= x && x < b;
+        return (uchar)(inside ? 255 : 0);
+    }
+};
+
+template<typename T, typename WT> struct InRangeC2
+{
+    typedef Vec<T,2> xtype;
+    typedef Vec<WT,2> btype;
+    uchar operator()(const xtype& x, const btype& a, const btype& b) const
+    {
+        return (uchar)-(a[0] <= x[0] && x[0] < b[0] &&
+                        a[1] <= x[1] && x[1] < b[1]);
+    }
+};
+
+template<typename T, typename WT> struct InRangeC3
+{
+    typedef Vec<T,3> xtype;
+    typedef Vec<WT,3> btype;
+    uchar operator()(const xtype& x, const btype& a, const btype& b) const
+    {
+        return (uchar)-(a[0] <= x[0] && x[0] < b[0] &&
+                        a[1] <= x[1] && x[1] < b[1] &&
+                        a[2] <= x[2] && x[2] < b[2]);
+    }
+};
+
+template<typename T, typename WT> struct InRangeC4
+{
+    typedef Vec<T,4> xtype;
+    typedef Vec<WT,4> btype;
+    uchar operator()(const xtype& x, const btype& a, const btype& b) const
+    {
+        return (uchar)-(a[0] <= x[0] && x[0] < b[0] &&
+                        a[1] <= x[1] && x[1] < b[1] &&
+                        a[2] <= x[2] && x[2] < b[2] &&
+                        a[3] <= x[3] && x[3] < b[3]);
+    }
+};
+
+template<class Op> static void
+inRange_( const Mat& srcmat1, const Mat& srcmat2, const Mat& srcmat3, Mat& dstmat )
+{
+    Op op;
+    uchar* dst = dstmat.data;
+    size_t dstep = dstmat.step;
+    Size size = getContinuousSize( srcmat1, srcmat2, srcmat3, dstmat );
+
+    for( int y = 0; y < size.height; y++, dst += dstep )
+    {
+        const typename Op::xtype* src1 = (const typename Op::xtype*)(srcmat1.data + srcmat1.step*y);
+        const typename Op::xtype* src2 = (const typename Op::xtype*)(srcmat2.data + srcmat2.step*y);
+        const typename Op::xtype* src3 = (const typename Op::xtype*)(srcmat3.data + srcmat3.step*y);
+        for( int x = 0; x < size.width; x++ )
+            dst[x] = op( src1[x], src2[x], src3[x] );
+    }
+}
+
+template<class Op> static void
+inRangeS_( const Mat& srcmat1, const Scalar& _a, const Scalar& _b, Mat& dstmat )
+{
+    Op op;
+    typedef typename Op::btype WT;
+    typedef typename DataType<WT>::channel_type WT1;
+    WT a, b;
+    uchar* dst = dstmat.data;
+    size_t dstep = dstmat.step;
+    Size size = getContinuousSize( srcmat1, dstmat );
+    int cn = srcmat1.channels();
+    _a.convertTo((WT1*)&a, cn);
+    _b.convertTo((WT1*)&b, cn);
+
+    for( int y = 0; y < size.height; y++, dst += dstep )
+    {
+        const typename Op::xtype* src1 = (const typename Op::xtype*)(srcmat1.data + srcmat1.step*y);
+        for( int x = 0; x < size.width; x++ )
+            dst[x] = op( src1[x], a, b );
+    }
+}
+
+// Pointer to an inRange_<Op> instantiation (array bounds: src1 is tested
+// against src2 and src3).
+typedef void (*InRangeFunc)( const Mat& src1, const Mat& src2, const Mat& src3, Mat& dst );
+// Pointer to an inRangeS_<Op> instantiation (scalar bounds a and b).
+typedef void (*InRangeSFunc)( const Mat& src1, const Scalar& a, const Scalar& b, Mat& dst );
+
+void inRange(const Mat& src, const Mat& lowerb,
+             const Mat& upperb, Mat& dst)
+{
+    static InRangeFunc tab[] =
+    {
+        inRange_<InRangeC1<uchar, uchar> >, 0,
+        inRange_<InRangeC1<ushort, ushort> >,
+        inRange_<InRangeC1<short, short> >,
+        inRange_<InRangeC1<int, int> >,
+        inRange_<InRangeC1<float, float> >,
+        inRange_<InRangeC1<double, double> >, 0,
+
+        inRange_<InRangeC2<uchar, uchar> >, 0,
+        inRange_<InRangeC2<ushort, ushort> >,
+        inRange_<InRangeC2<short, short> >,
+        inRange_<InRangeC2<int, int> >,
+        inRange_<InRangeC2<float, float> >,
+        inRange_<InRangeC2<double, double> >, 0,
+
+        inRange_<InRangeC3<uchar, uchar> >, 0,
+        inRange_<InRangeC3<ushort, ushort> >,
+        inRange_<InRangeC3<short, short> >,
+        inRange_<InRangeC3<int, int> >,
+        inRange_<InRangeC3<float, float> >,
+        inRange_<InRangeC3<double, double> >, 0,
+
+        inRange_<InRangeC4<uchar, uchar> >, 0,
+        inRange_<InRangeC4<ushort, ushort> >,
+        inRange_<InRangeC4<short, short> >,
+        inRange_<InRangeC4<int, int> >,
+        inRange_<InRangeC4<float, float> >,
+        inRange_<InRangeC4<double, double> >, 0
+    };
+
+    CV_Assert( src.size() == lowerb.size() && src.size() == upperb.size() &&
+        src.type() == lowerb.type() && src.type() == upperb.type() &&
+        src.channels() <= 4 );
+
+    InRangeFunc func = tab[src.type()];
+    CV_Assert( func != 0 );
+
+    dst.create(src.size(), CV_8U);
+    func( src, lowerb, upperb, dst );
+}
+
+void inRange(const Mat& src, const Scalar& lowerb,
+             const Scalar& upperb, Mat& dst)
+{
+    static InRangeSFunc tab[] =
+    {
+        inRangeS_<InRangeC1<uchar, int> >, 0,
+        inRangeS_<InRangeC1<ushort, int> >,
+        inRangeS_<InRangeC1<short, int> >,
+        inRangeS_<InRangeC1<int, int> >,
+        inRangeS_<InRangeC1<float, float> >,
+        inRangeS_<InRangeC1<double, double> >, 0,
+
+        inRangeS_<InRangeC2<uchar, int> >, 0,
+        inRangeS_<InRangeC2<ushort, int> >,
+        inRangeS_<InRangeC2<short, int> >,
+        inRangeS_<InRangeC2<int, int> >,
+        inRangeS_<InRangeC2<float, float> >,
+        inRangeS_<InRangeC2<double, double> >, 0,
+
+        inRangeS_<InRangeC3<uchar, int> >, 0,
+        inRangeS_<InRangeC3<ushort, int> >,
+        inRangeS_<InRangeC3<short, int> >,
+        inRangeS_<InRangeC3<int, int> >,
+        inRangeS_<InRangeC3<float, float> >,
+        inRangeS_<InRangeC3<double, double> >, 0,
+
+        inRangeS_<InRangeC4<uchar, int> >, 0,
+        inRangeS_<InRangeC4<ushort, int> >,
+        inRangeS_<InRangeC4<short, int> >,
+        inRangeS_<InRangeC4<int, int> >,
+        inRangeS_<InRangeC4<float, float> >,
+        inRangeS_<InRangeC4<double, double> >, 0
+    };
+
+    CV_Assert( src.channels() <= 4 );
+
+    InRangeSFunc func = tab[src.type()];
+    CV_Assert( func != 0 );
+
+    dst.create(src.size(), CV_8U);
+    func( src, lowerb, upperb, dst );
+}
+
+/****************************************************************************************\
+*                                          compare                                       *
+\****************************************************************************************/
+
+// Equality functor: yields 255 when a == b and 0 otherwise.
+// T is the type of the first operand, WT (T by default) of the second.
+template<typename T, typename WT=T> struct CmpEQ
+{
+    typedef uchar rtype;
+    typedef T type1;
+    typedef WT type2;
+
+    uchar operator()(T lhs, WT rhs) const
+    {
+        return (uchar)(lhs == rhs ? 255 : 0);
+    }
+};
+
+// "Greater than" functor: yields 255 when a > b and 0 otherwise.
+// T is the type of the first operand, WT (T by default) of the second.
+template<typename T, typename WT=T> struct CmpGT
+{
+    typedef uchar rtype;
+    typedef T type1;
+    typedef WT type2;
+
+    uchar operator()(T lhs, WT rhs) const
+    {
+        return (uchar)(lhs > rhs ? 255 : 0);
+    }
+};
+
+// "Greater than or equal" functor: yields 255 when a >= b and 0 otherwise.
+// T is the type of the first operand, WT (T by default) of the second.
+template<typename T, typename WT=T> struct CmpGE
+{
+    typedef uchar rtype;
+    typedef T type1;
+    typedef WT type2;
+
+    uchar operator()(T lhs, WT rhs) const
+    {
+        return (uchar)(lhs >= rhs ? 255 : 0);
+    }
+};
+
+void compare( const Mat& src1, const Mat& src2, Mat& dst, int cmpOp )
+{
+    static BinaryFunc tab[][8] =
+    {
+        {binaryOpC1_<CmpGT<uchar>,VCmpGT8u>, 0,
+        binaryOpC1_<CmpGT<ushort>,NoVec>,
+        binaryOpC1_<CmpGT<short>,NoVec>,
+        binaryOpC1_<CmpGT<int>,NoVec>,
+        binaryOpC1_<CmpGT<float>,NoVec>,
+        binaryOpC1_<CmpGT<double>,NoVec>, 0},
+
+        {binaryOpC1_<CmpEQ<uchar>,VCmpEQ8u>, 0,
+        binaryOpC1_<CmpEQ<ushort>,NoVec>,
+        binaryOpC1_<CmpEQ<ushort>,NoVec>, // same function as for ushort's
+        binaryOpC1_<CmpEQ<int>,NoVec>,
+        binaryOpC1_<CmpEQ<float>,NoVec>,
+        binaryOpC1_<CmpEQ<double>,NoVec>, 0},
+    };
+
+    dst.create(src1.rows, src1.cols, CV_8U);
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type() && src1.channels() == 1);
+
+    int depth = src1.depth();
+    const Mat *psrc1 = &src1, *psrc2 = &src2;
+    bool invflag = false;
+
+    switch( cmpOp )
+    {
+    case CMP_GT:
+    case CMP_EQ:
+        break;
+    case CMP_GE:
+        std::swap( psrc1, psrc2 );
+        invflag = true;
+        break;
+    case CMP_LT:
+        std::swap( psrc1, psrc2 );
+        break;
+    case CMP_LE:
+        invflag = true;
+        break;
+    case CMP_NE:
+        cmpOp = CMP_EQ;
+        invflag = true;
+        break;
+    default:
+        CV_Error(CV_StsBadArg, "Unknown comparison method");
+    }
+
+    BinaryFunc func = tab[cmpOp == CMP_EQ][depth];
+    CV_Assert( func != 0 );
+    func( *psrc1, *psrc2, dst );
+    if( invflag )
+        bitwise_not(dst, dst);
+}
+
+
+void compare( const Mat& src1, double value, Mat& dst, int cmpOp )
+{
+    static BinarySFuncC1 tab[][8] =
+    {
+        {binarySOpC1_<CmpEQ<uchar, int> >, 0,
+        binarySOpC1_<CmpEQ<ushort, int> >,
+        binarySOpC1_<CmpEQ<short, int> >,
+        binarySOpC1_<CmpEQ<int> >,
+        binarySOpC1_<CmpEQ<float> >,
+        binarySOpC1_<CmpEQ<double> >, 0},
+
+        {binarySOpC1_<CmpGT<uchar, int> >, 0,
+        binarySOpC1_<CmpGT<ushort, int> >,
+        binarySOpC1_<CmpGT<short, int> >,
+        binarySOpC1_<CmpGT<int> >,
+        binarySOpC1_<CmpGT<float> >,
+        binarySOpC1_<CmpGT<double> >, 0},
+
+        {binarySOpC1_<CmpGE<uchar, int> >, 0,
+        binarySOpC1_<CmpGE<ushort, int> >,
+        binarySOpC1_<CmpGE<short, int> >,
+        binarySOpC1_<CmpGE<int> >,
+        binarySOpC1_<CmpGE<float> >,
+        binarySOpC1_<CmpGE<double> >, 0},
+    };
+
+    dst.create(src1.rows, src1.cols, CV_8U);
+    CV_Assert(src1.channels() == 1);
+    int depth = src1.depth();
+    bool invflag = false;
+
+    switch( cmpOp )
+    {
+    case CMP_GT:
+    case CMP_EQ:
+    case CMP_GE:
+        break;
+    case CMP_LT:
+        invflag = true;
+        cmpOp = CMP_GE;
+        break;
+    case CMP_LE:
+        invflag = true;
+        cmpOp = CMP_GT;
+        break;
+    case CMP_NE:
+        invflag = true;
+        cmpOp = CMP_EQ;
+        break;
+    default:
+        CV_Error(CV_StsBadArg, "Unknown comparison method");
+    }
+
+    BinarySFuncC1 func = tab[cmpOp == CMP_EQ ? 0 : cmpOp == CMP_GT ? 1 : 2][depth];
+    CV_Assert( func != 0 );
+    func( src1, dst, value );
+    if( invflag )
+        bitwise_not(dst, dst);
+}
+
+/****************************************************************************************\
+*                                       min/max                                          *
+\****************************************************************************************/
+
+// Binary functor returning the smaller of its two arguments (std::min).
+template<typename T> struct MinOp
+{
+    typedef T rtype;
+    typedef T type1;
+    typedef T type2;
+
+    T operator ()(T lhs, T rhs) const
+    {
+        return std::min(lhs, rhs);
+    }
+};
+
+// Binary functor returning the larger of its two arguments (std::max).
+template<typename T> struct MaxOp
+{
+    typedef T rtype;
+    typedef T type1;
+    typedef T type2;
+
+    T operator ()(T lhs, T rhs) const
+    {
+        return std::max(lhs, rhs);
+    }
+};
+
+// uchar specialization of MinOp: uses the CV_MIN_8U macro in place of std::min.
+template<> inline uchar MinOp<uchar>::operator ()(uchar lhs, uchar rhs) const
+{
+    return CV_MIN_8U(lhs, rhs);
+}
+// uchar specialization of MaxOp: uses the CV_MAX_8U macro in place of std::max.
+template<> inline uchar MaxOp<uchar>::operator ()(uchar lhs, uchar rhs) const
+{
+    return CV_MAX_8U(lhs, rhs);
+}
+
+void min( const Mat& src1, const Mat& src2, Mat& dst )
+{
+    static BinaryFunc tab[] =
+    {
+        binaryOpC1_<MinOp<uchar>,VMin8u>, 0, binaryOpC1_<MinOp<ushort>,VMin16u>,
+        binaryOpC1_<MinOp<short>,VMin16s>, binaryOpC1_<MinOp<int>,NoVec>,
+        binaryOpC1_<MinOp<float>,VMin32f>, binaryOpC1_<MinOp<double>,NoVec>, 0
+    };
+
+    BinaryFunc func = tab[src1.depth()];
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type() && func != 0);
+    dst.create(src1.size(), src1.type());
+
+    return func( src1, src2, dst );
+}
+
+void max( const Mat& src1, const Mat& src2, Mat& dst )
+{
+    static BinaryFunc tab[] =
+    {
+        binaryOpC1_<MaxOp<uchar>,VMax8u>, 0, binaryOpC1_<MaxOp<ushort>,VMax16u>,
+        binaryOpC1_<MaxOp<short>,VMax16s>, binaryOpC1_<MaxOp<int>,NoVec>,
+        binaryOpC1_<MaxOp<float>,VMax32f>, binaryOpC1_<MaxOp<double>,NoVec>, 0
+    };
+
+    BinaryFunc func = tab[src1.depth()];
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type() && func != 0);
+    dst.create(src1.size(), src1.type());
+
+    return func( src1, src2, dst );
+}
+
+void min( const Mat& src1, double value, Mat& dst )
+{
+    static BinarySFuncC1 tab[] =
+    {
+        binarySOpC1_<MinOp<uchar> >, 0,
+        binarySOpC1_<MinOp<ushort> >,
+        binarySOpC1_<MinOp<short> >,
+        binarySOpC1_<MinOp<int> >,
+        binarySOpC1_<MinOp<float> >,
+        binarySOpC1_<MinOp<double> >, 0
+    };
+
+    BinarySFuncC1 func = tab[src1.depth()];
+    CV_Assert(func != 0);
+    dst.create(src1.size(), src1.type());
+    return func( src1, dst, value );
+}
+
+void max( const Mat& src1, double value, Mat& dst )
+{
+    static BinarySFuncC1 tab[] =
+    {
+        binarySOpC1_<MaxOp<uchar> >, 0,
+        binarySOpC1_<MaxOp<ushort> >,
+        binarySOpC1_<MaxOp<short> >,
+        binarySOpC1_<MaxOp<int> >,
+        binarySOpC1_<MaxOp<float> >,
+        binarySOpC1_<MaxOp<double> >, 0
+    };
+
+    BinarySFuncC1 func = tab[src1.depth()];
+    CV_Assert(func != 0);
+    dst.create(src1.size(), src1.type());
+    return func( src1, dst, value );
+}
+
+}
+
+/****************************************************************************************\
+*                                Earlier API: cvAdd etc.                                 *
+\****************************************************************************************/
+
+// C API wrapper around cv::bitwise_not.
+// Both arrays are wrapped with cv::cvarrToMat; the destination must already
+// have the size and type of the source.
+CV_IMPL void
+cvNot( const CvArr* srcarr, CvArr* dstarr )
+{
+    cv::Mat src = cv::cvarrToMat(srcarr);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
+
+    cv::bitwise_not( src, dst );
+}
+
+
+CV_IMPL void
+cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
+        dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::bitwise_and( src1, src2, dst, mask );
+}
+
+CV_IMPL void
+cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
+        dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::bitwise_or( src1, src2, dst, mask );
+}
+
+
+CV_IMPL void
+cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
+        dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::bitwise_xor( src1, src2, dst, mask );
+}
+
+
+CV_IMPL void
+cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::bitwise_and( src, s, dst, mask );
+}
+
+
+CV_IMPL void
+cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::bitwise_or( src, s, dst, mask );
+}
+
+
+CV_IMPL void
+cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::bitwise_xor( src, s, dst, mask );
+}
+
+CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
+        dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::add( src1, src2, dst, mask );
+}
+
+CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
+        dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::subtract( src1, src2, dst, mask );
+}
+
+CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1),
+        dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::add( src1, value, dst, mask );
+}
+
+CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1),
+        dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    if( maskarr )
+        mask = cv::cvarrToMat(maskarr);
+    cv::subtract( value, src1, dst, mask );
+}
+
+CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
+                    CvArr* dstarr, double scale )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
+        dst = cv::cvarrToMat(dstarr);
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    cv::multiply( src1, src2, dst, scale );
+}
+
+CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
+                    CvArr* dstarr, double scale )
+{
+    cv::Mat src2 = cv::cvarrToMat(srcarr2),
+        dst = cv::cvarrToMat(dstarr), mask;
+    CV_Assert( src2.size() == dst.size() && src2.type() == dst.type() );
+
+    if( srcarr1 )
+        cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale );
+    else
+        cv::divide( scale, src2, dst );
+}
+
+
+CV_IMPL void
+cvAddWeighted( const CvArr* srcarr1, double alpha,
+               const CvArr* srcarr2, double beta,
+               double gamma, CvArr* dstarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
+        dst = cv::cvarrToMat(dstarr);
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+    cv::addWeighted( src1, alpha, src2, beta, gamma, dst );
+}
+
+
+// C API wrapper around cv::absdiff for two arrays.
+// The destination must already have the size and type of the first source;
+// the second source is wrapped only after that check has passed.
+CV_IMPL  void
+cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+
+    cv::Mat second = cv::cvarrToMat(srcarr2);
+    cv::absdiff( src1, second, dst );
+}
+
+
+// C API wrapper around cv::absdiff for an array and a scalar.
+// The destination must already have the size and type of the source.
+CV_IMPL void
+cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+
+    cv::absdiff( src1, scalar, dst );
+}
+
+// C API wrapper around cv::inRange with array bounds (srcarr2 and srcarr3
+// are passed as the lower and upper bound arguments).
+// The destination must already be a CV_8U array of the same size as the
+// source; the bound arrays are wrapped only after that check has passed.
+CV_IMPL void
+cvInRange( const void* srcarr1, const void* srcarr2,
+           const void* srcarr3, void* dstarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && dst.type() == CV_8U );
+
+    cv::Mat lower = cv::cvarrToMat(srcarr2);
+    cv::Mat upper = cv::cvarrToMat(srcarr3);
+    cv::inRange( src1, lower, upper, dst );
+}
+
+// C API wrapper around cv::inRange with scalar bounds.
+// The destination must already be a CV_8U array of the same size as the
+// source.
+CV_IMPL void
+cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && dst.type() == CV_8U );
+
+    cv::inRange( src1, lowerb, upperb, dst );
+}
+
+
+// C API wrapper around cv::compare for two arrays; cmp_op is passed through.
+// The destination must already be a CV_8U array of the same size as the
+// first source; the second source is wrapped only after that check.
+CV_IMPL void
+cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && dst.type() == CV_8U );
+
+    cv::Mat second = cv::cvarrToMat(srcarr2);
+    cv::compare( src1, second, dst, cmp_op );
+}
+
+
+// C API wrapper around cv::compare for an array and a scalar; cmp_op is
+// passed through.
+// The destination must already be a CV_8U array of the same size as the
+// source.
+CV_IMPL void
+cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && dst.type() == CV_8U );
+
+    cv::compare( src1, value, dst, cmp_op );
+}
+
+
+// C API wrapper around cv::min for two arrays.
+// The destination must already have the size and type of the first source;
+// the second source is wrapped only after that check has passed.
+CV_IMPL void
+cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+
+    cv::Mat second = cv::cvarrToMat(srcarr2);
+    cv::min( src1, second, dst );
+}
+
+
+// C API wrapper around cv::max for two arrays.
+// The destination must already have the size and type of the first source;
+// the second source is wrapped only after that check has passed.
+CV_IMPL void
+cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+
+    cv::Mat second = cv::cvarrToMat(srcarr2);
+    cv::max( src1, second, dst );
+}
+
+// C API wrapper around cv::min for an array and a scalar.
+// The destination must already have the size and type of the source.
+CV_IMPL void
+cvMinS( const void* srcarr1, double value, void* dstarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+
+    cv::min( src1, value, dst );
+}
+
+
+// C API wrapper around cv::max for an array and a scalar.
+// The destination must already have the size and type of the source.
+CV_IMPL void
+cvMaxS( const void* srcarr1, double value, void* dstarr )
+{
+    cv::Mat src1 = cv::cvarrToMat(srcarr1);
+    cv::Mat dst = cv::cvarrToMat(dstarr);
+
+    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );
+
+    cv::max( src1, value, dst );
+}
+
+
+/* End of file. */