您的位置:首页 > 移动开发 > Android开发

在android开发板上测试neon加速实验

2017-11-04 15:29 197 查看
今天为了测试在mtk6757上面的neon加速性能,从网上找来一个测试例子保留了它的汇编文件和cpp文件,做了一些简单的修改,然后自己写了一个Android.mk和Application.mk

采用ndk来进行编译。

文件提取链接  http://pan.baidu.com/s/1o8KF8tO

下面贴出相关文件的结构和代码

archermind@flm:~$ tree work >tree.log

archermind@flm:~$ cat tree.log 

work

└── jni

    ├── Android.mk

    ├── Application.mk

    ├── cost_cal.s

    ├── cost_final.s

    ├── cost_init.s

    ├── l_cost.s

    ├── sgbm.cpp

    ├── test.cpp

    └── x_cost.s

1 directory, 9 files

archermind@flm:~$

archermind@flm:~/work/jni$ cat Android.mk

LOCAL_PATH:= $(call my-dir)  

include $(CLEAR_VARS)  

NDK_APP_DST_DIR := $(LOCAL_PATH)

include /home/archermind/OpenCV-android-sdk/sdk/native/jni/OpenCV.mk 

LOCAL_SRC_FILES := cost_cal.s  cost_final.s  cost_init.s  l_cost.s  x_cost.s test.cpp sgbm.cpp

LOCAL_CFLAGS := -D__cpusplus -g -mfloat-abi=soft -mfpu=neon -march=armv7-a -mtune=cortex-a53

TARGET_ARCH_ABI :=armeabi-v7a 

LOCAL_ARM_MODE := arm

ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)

LOCAL_ARM_NEON := true 

endif   

LOCAL_MODULE := test

include $(BUILD_EXECUTABLE)

archermind@flm:~/work/jni$ 

archermind@flm:~/work/jni$ cat Application.mk

APP_STL := gnustl_static

APP_CPPFLAGS := -frtti -fexceptions -std=c++11 

APP_ABI := armeabi-v7a

APP_PLATFORM := android-19

archermind@flm:~/work/jni$ 

archermind@flm:~/work/jni$ cat cost_cal.s
.arm
.text
.global cost_cal

cost_cal:
push {r4-r6}
ldr r12,[sp,#12]@hsumSub
ldr r4,[sp,#16]@C
ldr r5,[sp,#20]@Count
ldr r6,[sp,#24]@hsumAdd[x+d]
lsr r5,r5,#3@Count/8

.loop:
vld1.16
{q0},[r0]! @pixAdd[d]
vld1.16
{q1},[r1]! @pixSub[d]
vld1.16
{q2},[r2]! @hsumAdd[x-D+d]
vqadd.s16 q3,q0,q2
vqsub.s16 q3,q3,q1@hsumAdd[x+d]
vst1.16
{q3},[r6]! @hsumAdd[x+d]
vld1.16
{q8},[r3]! @Cprev[x+d]
vld1.16
{q9},[r12]! @hsumSub[x+d]
vqadd.s16 q3,q3,q8
vqsub.s16 q3,q3,q9
vst1.16
{q3},[r4]!
subs r5,r5,#1
bne .loop
pop           {r4-r6}
bx            lr
.end

archermind@flm:~/work/jni$ 

archermind@flm:~/work/jni$ cat cost_final.s
.arm
.text
.global cost_final

cost_final:

vdup.8
q0,r0 @u
lsr r3,r3,#4@D/16

.loop:
vld1.8
{q1},[r1]! @v
vabd.s8
q2,q0,q1 @abs(u-v)
vmovl.s8
q3,d4
vmovl.s8
q8,d5
vld1.16
{q1},[r2]
vqadd.s16
q1,q1,q3
vst1.16
{q1},[r2]!
vld1.16
{q2},[r2]
vqadd.s16
q2,q2,q8
vst1.16
{q2},[r2]!
subs r3,r3,#1
bne .loop

bx            lr
.end

archermind@flm:~/work/jni$ cat cost_init.s
.arm
.text
.global cost_init

cost_init:
push {r4-r7}
ldr r12,[sp,#16]@v
ldr r4,[sp,#20]@v0
ldr r5,[sp,#24]@v1
ldr r6,[sp,#28]@cost
ldr r7,[sp,#32]@D
lsr r7,r7,#4 
vdup.8
q0,r0 @u
vdup.8
q1,r1 @u0
vdup.8
q2,r2 @u1
vdup.16
q12,r3 @(-1)*diff_scale

.loop:
vld1.8
{q3},[r12]! @v
vld1.8
{q8},[r4]! @v0
vld1.8
{q9},[r5]! @v1
vqsub.s8
q9,q0,q9 @u-v1
vqsub.s8
q8,q8,q0 @v0-u
vmax.s8
q8,q8,q9 @c0=max(u-v1,v0-u)
vqsub.s8
q10,q3,q2 @v-u1
vqsub.s8
q11,q1,q3 @u0-v
vmax.s8
q10,q10,q11 @c1=max(v-u1,u0-v)
vmin.s8
q3,q8,q10 @min(c0,c1)
vmovl.s8
q10,d6
vmovl.s8
q11,d7
vshl.s16
q10,q10,q12
vshl.s16
q11,q11,q12
vld1.16
{q3},[r6]
vqadd.s16
q10,q10,q3
vst1.16
{q10},[r6]!
vld1.16
{q3},[r6]
vqadd.s16
q11,q11,q3
vst1.16
{q11},[r6]!
subs r7,r7,#1
bne .loop

pop           {r4-r7}
bx            lr
.end

archermind@flm:~/work/jni$ cat l_cost.s

    .arm
.text
.global l_cost

l_cost:

        push     {r4-r9}
ldr r12,[sp,#24]  @lr_p0
ldr     r4,[sp,#28]   @lr_p1
ldr     r5,[sp,#32]   @Cp
ldr     r6,[sp,#36]   @Sp
ldr     r7,[sp,#40]   @minLr[0][xm]
ldr r8,[sp,#44] @D

vpush           {q4}

vdup.16       q0,r0       @delta0
vdup.16       q1,r1      @delta1
vdup.16       q2,r2      @delta2
vdup.16       q3,r3      @delta3
mov           r0,#200     
vdup.16       q4,r0       @p1
mov         r0,#255
lsl    r0,r0,#7
add         r0,r0,#127
vdup.16
q8,r0 @minL0
vdup.16
q9,r0 @minL1
vdup.16
q10,r0 @minL2
vdup.16
q11,r0 @minL3
add r1,r8,#16@D2
mov r9,#22
mul r9,r9,r1
sub r9,r9,#16@22*D2-16
lsl r2,r1,#3@NRD2
sub r3,r2,#1
lsl r3,r3,#1@2*(NRD2-1)
lsl r1,r1,#1@2*D2
add r0,r1,r3@2*(NRD2-1+D2)
lsl r2,r0,#1@4*(NRD2-1+D2)
sub r2,r2,#10@4*NRD2+4*D2-14
lsr r8,r8,#3@D/8

.loop:
vld1.16       {q12},[r5]!    @Cp[d]
vld1.16
{q13},[r6] @Sp[d] 

vld1.16   {q14},[r12]   @lr_p0[d]
sub          r12,r12,#2     
vld1.16      {q15},[r12]   @lr_p0[d-1]
vqadd.s16     q15,q15,q4   @lr_p0[d-1]+p1
vmin.s16
q14,q14,q15 @min(lr_p0[d],lr_p0[d-1]+p1)
add          r12,r12,#4    
vld1.16      {q15},[r12]  @lr_p0[d+1]
vqadd.s16
q15,q15,q4 @lr_p0[d+1]+p1
vmin.s16
q14,q14,q15 @min(lr_p0[d],lr_p0[d-1]+p1,lr_p0[d+1],p1)
vmin.s16      q14,q14,q0
vqsub.s16     q14,q14,q0
vqadd.s16     q14,q14,q12  @L0
add r12,r12,r3@Lr_p[d]
vst1.16
{q14},[r12] @Lr_p[d]
vmin.s16
q8,q8,q14 @min(minL0,L0)
vqadd.s16
q13,q13,q14 @Sp[d]+L0

vld1.16       {q14},[r4] @lr_p1[d]
sub           r4,r4,#2  
vld1.16       {q15},[r4]  @lr_p1[d-1]
vqadd.s16
 q15,q15,q4 @lr_p1[d-1]+p1
vmin.s16
q14,q14,q15 @min(lr_p1[d],lr_p1[d-1]+p1)
add           r4,r4,#4
vld1.16       {q15},[r4]  @lr_p1[d+1]
vqadd.s16     q15,q15,q4   @lr_p1[d+1]+p1
vmin.s16      q14,q14,q15 @min(lr_p1[d],lr_p1[d-1]+p1,lr_p1[d+1]+p1)
vmin.s16      q14,q14,q1 
vqsub.s16     q14,q14,q1
vqadd.s16     q14,q14,q12 @L1
add r12,r12,r1
vst1.16
{q14},[r12] @Lr_p[d+D2]
vmin.s16
q9,q9,q14 @min(minL1,L1)
vqadd.s16
q13,q13,q14 @Sp[d]+L0+L1

add r4,r4,r0  
vld1.16       {q14},[r4] @lr_p2[d]
sub           r4,r4,#2
vld1.16       {q15},[r4]  @lr_p2[d-1]
vqadd.s16
q15,q15,q4 @lr_p2[d-1]+p1
vmin.s16
q14,q14,q15 @min(lr_p2[d],lr_p2[d-1]+p1)
add           r4,r4,#4
vld1.16       {q15},[r4]  @lr_p2[d+1]
vqadd.s16     q15,q15,q4   @lr_p2[d+1]+p1
vmin.s16      q14,q14,q15  @min(lr_p2[d],lr_p2[d-1]+p1,lr_p2[d+1]+p1)
vmin.s16      q14,q14,q2
vqsub.s16     q14,q14,q2
vqadd.s16     q14,q14,q12 @L2
add r12,r12,r1
vst1.s16
{q14},[r12] @Lr_p[d+2*D2]
vmin.s16
q10,q10,q14 @min(minL2,L2)
vqadd.s16
q13,q13,q14 @Sp[d]+L0+L1+L2

add r4,r4,r0 
vld1.16       {q14},[r4] @lr_p3[d]
sub           r4,r4,#2
vld1.16       {q15},[r4]  @lr_p3[d-1]
vqadd.s16
q15,q15,q4 @lr_p3[d-1]+p1
vmin.s16
q14,q14,q15 @min(lr_p3[d],lr_p3[d-1]+p1)
add           r4,r4,#4
vld1.16       {q15},[r4]  @lr_p3[d+1]
vqadd.s16     q15,q15,q4   @lr_p3[d+1]+p1
vmin.s16      q14,q14,q15  @min(lr_p3[d],lr_p3[d-1]+p1,lr_p3[d+1]+p1)
vmin.s16      q14,q14,q3
vqsub.s16     q14,q14,q3
vqadd.s16     q14,q14,q12 @L3
add r12,r12,r1
vst1.16
{q14},[r12] @Lr_p[d+3*D2]
vmin.s16
q11,q11,q14 @min(minL3,L3)
vqadd.s16
q13,q13,q14 @Sp[d]+L0+L1+L2+L3
vst1.16
{q13},[r6]! @Sp[d]
sub r12,r12,r9@Lr_p[d+3*D2]->Lr_p0[d+8]
sub r4,r4,r2@Lr_p3[d+1]->Lr_p1[d+8]
subs r8,r8,#1
bne .loop

vtrn.16       q8,q9        @L0,L1
vtrn.16       q10,q11        @L2,L3
vmin.s16      q8,q8,q9     
vmin.s16      q10,q10,q11
vmin.s16      d0,d16,d17
vmin.s16      d1,d20,d21
vshr.s64      q1,q0,#32
vmin.s16
q0,q0,q1
vtrn.32
d0,d1 @minL0,minL1,minL2,minL3
vst1.16
{q0},[r7]

vpop           {q4}
pop           {r4-r9}
bx            lr
.end

archermind@flm:~/work/jni$ 

archermind@flm:~/work/jni$ cat sgbm.cpp

#include "opencv2/core/core.hpp"

#include "opencv2/highgui/highgui.hpp"

#include "opencv2/imgproc/imgproc.hpp"

#include <iostream>

using namespace cv;

using namespace std;

typedef uchar PixType;

typedef short CostType;

typedef short DispType;

typedef cv::Point_<short> Point2s;

enum { NR = 16, NR2 = NR / 2 };

extern "C" void x_cost(short a,short* b,short* c,short* d,short*e,short*f,int g,short*h); 

extern "C" void cost_cal(short* a,short* b,short* c,const short* d,const short* e,short *f,int g,short* h);

extern "C" void l_cost(short a,short b,short c,short d,short* e,short* f,const short* g,short* h,short* i,int j);

extern "C" void cost_init(int a,int b,int c,int d,PixType *e,PixType*f,PixType*g,short*h,int i);

extern "C" void cost_final(int a,PixType * b,short* c,int d);

struct StereoSGBMParams

{
StereoSGBMParams()
{
minDisparity = numDisparities = 0;
SADWindowSize = 0;
P1 = P2 = 0;
disp12MaxDiff = 0;
preFilterCap = 0;
uniquenessRatio = 0;
speckleWindowSize = 0;
speckleRange = 0;
mode = 0;
}

StereoSGBMParams(int _minDisparity, int _numDisparities, int _SADWindowSize,
int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
int _mode)
{
minDisparity = _minDisparity;
numDisparities = _numDisparities;
SADWindowSize = _SADWindowSize;
P1 = _P1;
P2 = _P2;
disp12MaxDiff = _disp12MaxDiff;
preFilterCap = _preFilterCap;
uniquenessRatio = _uniquenessRatio;
speckleWindowSize = _speckleWindowSize;
speckleRange = _speckleRange;
mode = _mode;
}

int minDisparity;
int numDisparities;
int SADWindowSize;
int preFilterCap;
int uniquenessRatio;
int P1;
int P2;
int speckleWindowSize;
int speckleRange;
int disp12MaxDiff;
int mode;

};

template <typename T>

void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)

{
using namespace cv;

int width = img.cols, height = img.rows, npixels = width*height;
size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
if (!_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize)
_buf.create(1, (int)bufSize, CV_8U);

uchar* buf = _buf.ptr();
int i, j, dstep = (int)(img.step / sizeof(T));
int* labels = (int*)buf;
buf += npixels*sizeof(labels[0]);
Point2s* wbuf = (Point2s*)buf;
buf += npixels*sizeof(wbuf[0]);
uchar* rtype = (uchar*)buf;
int curlabel = 0;

// clear out label assignments
memset(labels, 0, npixels*sizeof(labels[0]));

for (i = 0; i < height; i++)
{
T* ds = img.ptr<T>(i);
int* ls = labels + width*i;

for (j = 0; j < width; j++)
{
if (ds[j] != newVal)   // not a bad disparity
{
if (ls[j])     // has a label, check for bad label
{
if (rtype[ls[j]]) // small region, zero out disparity
ds[j] = (T)newVal;
}
// no label, assign and propagate
else
{
Point2s* ws = wbuf; // initialize wavefront
Point2s p((short)j, (short)i);  // current pixel
curlabel++; // next label
int count = 0;  // current region size
ls[j] = curlabel;

// wavefront propagation
while (ws >= wbuf) // wavefront not empty
{
count++;
// put neighbors onto wavefront
T* dpp = &img.at<T>(p.y, p.x);
T dp = *dpp;
int* lpp = labels + width*p.y + p.x;

if (p.y < height - 1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff)
{
lpp[+width] = curlabel;
*ws++ = Point2s(p.x, p.y + 1);
}

if (p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff)
{
lpp[-width] = curlabel;
*ws++ = Point2s(p.x, p.y - 1);
}

if (p.x < width - 1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff)
{
lpp[+1] = curlabel;
*ws++ = Point2s(p.x + 1, p.y);
}

if (p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff)
{
lpp[-1] = curlabel;
*ws++ = Point2s(p.x - 1, p.y);
}

// pop most recent and propagate
// NB: could try least recent, maybe better convergence
p = *--ws;
}

// assign label type
if (count <= maxSpeckleSize)   // speckle region
{
rtype[ls[j]] = 1;   // small region label
ds[j] = (T)newVal;
}
else
rtype[ls[j]] = 0;   // large region label
}
}
}
}

}

void filterSpeckles(InputOutputArray _img, double _newval, int maxSpeckleSize,
double _maxDiff, InputOutputArray __buf)

{
Mat img = _img.getMat();
int type = img.type();
Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
CV_Assert(type == CV_8UC1 || type == CV_16SC1);

int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);

#if IPP_VERSION_X100 >= 801
CV_IPP_CHECK()
{
Ipp32s bufsize = 0;
IppiSize roisize = { img.cols, img.rows };
IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;

if (!__buf.needed() && (type == CV_8UC1 || type == CV_16SC1))
{
IppStatus status = ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize);
Ipp8u * buffer = ippsMalloc_8u(bufsize);

if ((int)status >= 0)
{
if (type == CV_8UC1)
status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,
(Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer);
else
status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,
(Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer);
}

if (status >= 0)
{
CV_IMPL_ADD(CV_IMPL_IPP);
return;
}
setIppErrorStatus();
}
}

#endif

if (type == CV_8UC1)
filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
else
filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);

}

static void calcPixelCostBT(const Mat& img1, const Mat& img2, int y,
int minD, int maxD, CostType* cost,
PixType* buffer, const PixType* tab,
int tabOfs, int)

{
int x, c, width = img1.cols, cn = img1.channels();
int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
PixType *prow1 = buffer + width2 * 2, *prow2 = prow1 + width*cn * 2;

tab += tabOfs;

for (c = 0; c < cn * 2; c++)
{
prow1[width*c] = prow1[width*c + width - 1] =
prow2[width*c] = prow2[width*c + width - 1] = tab[0];
}

int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows - 1 ? (int)img1.step : 0;
int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows - 1 ? (int)img2.step : 0;

if (cn == 1)
{
for (x = 1; x < width - 1; x++)
{
prow1[x] = tab[(row1[x + 1] - row1[x - 1]) * 2 + row1[x + n1 + 1] - row1[x + n1 - 1] + row1[x + s1 + 1] - row1[x + s1 - 1]];
prow2[width - 1 - x] = tab[(row2[x + 1] - row2[x - 1]) * 2 + row2[x + n2 + 1] - row2[x + n2 - 1] + row2[x + s2 + 1] - row2[x + s2 - 1]];

prow1[x + width] = row1[x];
prow2[width - 1 - x + width] = row2[x];
}
}
else
{
for (x = 1; x < width - 1; x++)
{
prow1[x] = tab[(row1[x * 3 + 3] - row1[x * 3 - 3]) * 2 + row1[x * 3 + n1 + 3] - row1[x * 3 + n1 - 3] + row1[x * 3 + s1 + 3] - row1[x * 3 + s1 - 3]];
prow1[x + width] = tab[(row1[x * 3 + 4] - row1[x * 3 - 2]) * 2 + row1[x * 3 + n1 + 4] - row1[x * 3 + n1 - 2] + row1[x * 3 + s1 + 4] - row1[x * 3 + s1 - 2]];
prow1[x + width * 2] = tab[(row1[x * 3 + 5] - row1[x * 3 - 1]) * 2 + row1[x * 3 + n1 + 5] - row1[x * 3 + n1 - 1] + row1[x * 3 + s1 + 5] - row1[x * 3 + s1 - 1]];

prow2[width - 1 - x] = tab[(row2[x * 3 + 3] - row2[x * 3 - 3]) * 2 + row2[x * 3 + n2 + 3] - row2[x * 3 + n2 - 3] + row2[x * 3 + s2 + 3] - row2[x * 3 + s2 - 3]];
prow2[width - 1 - x + width] = tab[(row2[x * 3 + 4] - row2[x * 3 - 2]) * 2 + row2[x * 3 + n2 + 4] - row2[x * 3 + n2 - 2] + row2[x * 3 + s2 + 4] - row2[x * 3 + s2 - 2]];
prow2[width - 1 - x + width * 2] = tab[(row2[x * 3 + 5] - row2[x * 3 - 1]) * 2 + row2[x * 3 + n2 + 5] - row2[x * 3 + n2 - 1] + row2[x * 3 + s2 + 5] - row2[x * 3 + s2 - 1]];

prow1[x + width * 3] = row1[x * 3];
prow1[x + width * 4] = row1[x * 3 + 1];
prow1[x + width * 5] = row1[x * 3 + 2];

prow2[width - 1 - x + width * 3] = row2[x * 3];
prow2[width - 1 - x + width * 4] = row2[x * 3 + 1];
prow2[width - 1 - x + width * 5] = row2[x * 3 + 2];
}
}

memset(cost, 0, width1*D*sizeof(cost[0]));

buffer -= minX2;
cost -= minX1*D + minD; // simplify the cost indices inside the loop

#if 1
for (c = 0; c < cn * 2; c++, prow1 += width, prow2 += width)
{
int diff_scale = c < cn ? 0 : 2;

// precompute
//   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
//   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
for (x = minX2; x < maxX2; x++)
{
int v = prow2[x];
int vl = x > 0 ? (v + prow2[x - 1]) / 2 : v;
int vr = x < width - 1 ? (v + prow2[x + 1]) / 2 : v;
int v0 = std::min(vl, vr); v0 = std::min(v0, v);
int v1 = std::max(vl, vr); v1 = std::max(v1, v);
buffer[x] = (PixType)v0;
buffer[x + width2] = (PixType)v1;
}

for (x = minX1; x < maxX1; x++)
{
int u = prow1[x];
int ul = x > 0 ? (u + prow1[x - 1]) / 2 : u;
int ur = x < width - 1 ? (u + prow1[x + 1]) / 2 : u;
int u0 = std::min(ul, ur); u0 = std::min(u0, u);
int u1 = std::max(ul, ur); u1 = std::max(u1, u);
//assembly 
cost_init(u,u0,u1,-diff_scale,prow2+width - x - 1 + minD,buffer+width - x - 1 + minD,buffer+width - x - 1 + minD+width2,cost+x*D+minD,D);
/*{
for (int d = minD; d < maxD; d++)
{
int v = prow2[width - x - 1 + d];
int v0 = buffer[width - x - 1 + d];
int v1 = buffer[width - x - 1 + d + width2];
int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);

cost[x*D + d] = (CostType)(cost[x*D + d] + (std::min(c0, c1) >> diff_scale));
}
}*/
}
}

#else
for (c = 0; c < cn * 2; c++, prow1 += width, prow2 += width)
{
for (x = minX1; x < maxX1; x++)
{
int u = prow1[x];
//assembly
cost_final(u,prow2+width - 1 - x +minD ,cost+x*D+minD,D);
/*{
for (int d = minD; d < maxD; d++)
{
int v = prow2[width - 1 - x + d];
cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
}
}*/
}
}

#endif

}

static void computeDisparitySGBM(const Mat& img1, const Mat& img2,
Mat& disp1, const StereoSGBMParams& params,
Mat& buffer)

{
const int ALIGN = 16;
const int DISP_SHIFT = 4;
const int DISP_SCALE = (1 << DISP_SHIFT);
const CostType MAX_COST = SHRT_MAX;

int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
int ftzero = std::max(params.preFilterCap, 15) | 1;
int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1 + 1);
int k, width = disp1.cols, height = disp1.rows;
int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
int D = maxD - minD, width1 = maxX1 - minX1;
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
int SW2 = SADWindowSize.width / 2, SH2 = SADWindowSize.height / 2;
bool fullDP = params.mode == 1;
int npasses = fullDP ? 2 : 1;
const int TAB_OFS = 256 * 4, TAB_SIZE = 256 + TAB_OFS * 2;
PixType clipTab[TAB_SIZE];

short *minvalue=new short[12];
//for the use of x_cost.s
memset(minvalue,0,sizeof(short)*12);
short d8[8]={0,1,2,3,4,5,6,7};  

for (k = 0; k < TAB_SIZE; k++)
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);

if (minX1 >= maxX1)
{
disp1 = Scalar::all(INVALID_DISP_SCALED);
return;
}

CV_Assert(D % 16 == 0);

// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
// if you change NR, please, modify the loop as well.
int D2 = D + 16, NRD2 = NR2*D2;

// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
const int NLR = 2;
const int LrBorder = NLR - 1;

// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
size_t costBufSize = width1*D;
size_t CSBufSize = costBufSize*(fullDP ? height : 1);
size_t minLrSize = (width1 + LrBorder * 2)*NR2, LrSize = minLrSize*D2;
int hsumBufNRows = SH2 * 2 + 2;
size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
CSBufSize * 2 * sizeof(CostType) + // C, S
width * 16 * img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2

if (buffer.empty() || !buffer.isContinuous() ||
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize)
buffer.create(1, (int)totalBufSize, CV_8U);

// summary cost over different (nDirs) directions
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
CostType* Sbuf = Cbuf + CSBufSize;
CostType* hsumBuf = Sbuf + CSBufSize;
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;

CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
DispType* disp2ptr = (DispType*)(disp2cost + width);
PixType* tempBuf = (PixType*)(disp2ptr + width);

// add P2 to every C(x,y). it saves a few operations in the inner loops
for (k = 0; k < width1*D; k++)
Cbuf[k] = (CostType)P2;

for (int pass = 1; pass <= npasses; pass++)
{
int x1, y1, x2, y2, dx, dy;

if (pass == 1)
{
y1 = 0; y2 = height; dy = 1;
x1 = 0; x2 = width1; dx = 1;
}
else
{
y1 = height - 1; y2 = -1; dy = -1;
x1 = width1 - 1; x2 = -1; dx = -1;
}

CostType *Lr[NLR] = { 0 }, *minLr[NLR] = { 0 };

for (k = 0; k < NLR; k++)
{
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
// however, then the alignment will be imperfect, i.e. bad for SSE,
// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
memset(Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType));
minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
memset(minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType));
}

for (int y = y1; y != y2; y += dy)
{
int x, d;
DispType* disp1ptr = disp1.ptr<DispType>(y);
CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);

if (pass == 1) // compute C on the first pass, and reuse it on the second pass, if any.
{
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;

for (k = dy1; k <= dy2; k++)
{
CostType* hsumAdd = hsumBuf + (std::min(k, height - 1) % hsumBufNRows)*costBufSize;

if (k < height)
{
calcPixelCostBT(img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero);

memset(hsumAdd, 0, D*sizeof(CostType));
for (x = 0; x <= SW2*D; x += D)
{
int scale = x == 0 ? SW2 + 1 : 1;
for (d = 0; d < D; d++)
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d] * scale);
}

if (y > 0)
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
for (x=D;x<(SW2+1)*D;x+=D)
{
const CostType* pixAdd = pixDiff + x + SW2*D;
const CostType* pixSub = pixDiff ;  
for( d = 0; d < D; d++ )
{
int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
}
}
//assembly
cost_cal(pixDiff + (2*SW2+1)*D,pixDiff ,hsumAdd+SW2*D,Cprev+(SW2+1)*D,hsumSub+(SW2+1)*D,C+(SW2+1)*D,(width1-1-2*SW2)*D,hsumAdd+(SW2+1)*D);
/*for(x=(SW2+1)*D;x<=(width1-1-SW2)*D;x+=D)
{
const CostType* pixAdd = pixDiff + x + SW2*D;
const CostType* pixSub = pixDiff + x - (SW2+1)*D;
for( d = 0; d < D; d++ )
{
int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
}
}*/
for(x=(width1-SW2)*D;x<width1*D;x+=D)
{
const CostType* pixAdd = pixDiff + (width1-1)*D;
const CostType* pixSub = pixDiff + x - (SW2+1)*D;
for( d = 0; d < D; d++ )
{
int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
}
}

}
else
{
for (x = D; x < width1*D; x += D)
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1 - 1)*D);
const CostType* pixSub = pixDiff + std::max(x - (SW2 + 1)*D, 0);

for (d = 0; d < D; d++)
hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
}
}
}

if (y == 0)
{
int scale = k == 0 ? SH2 + 1 : 1;
for (x = 0; x < width1*D; x++)
C[x] = (CostType)(C[x] + hsumAdd[x] * scale);
}
}

// also, clear the S buffer
for (k = 0; k < width1*D; k++)
S[k] = 0;
}

// clear the left and the right borders
memset(Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType));
memset(Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType));
memset(minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType));
memset(minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType));

/*
[formula 13 in the paper]
compute L_r(p, d) = C(p, d) +
min(L_r(p-r, d),
L_r(p-r, d-1) + P1,
L_r(p-r, d+1) + P1,
min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
where p = (x,y), r is one of the directions.
we process all the directions at once:
0: r=(-dx, 0)
1: r=(-1, -dy)
2: r=(0, -dy)
3: r=(1, -dy)
4: r=(-2, -dy)
5: r=(-1, -dy*2)
6: r=(1, -dy*2)
7: r=(2, -dy)
*/
for (x = x1; x != x2; x += dx)
{
int xm = x*NR2, xd = xm*D2;

int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;

CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
CostType* Lr_p2 = Lr[1] + xd + D2 * 2;
CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2 * 3;

Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;

//CostType* Lr_p = Lr[0] + xd;
const CostType* Cp = C + x*D;
CostType* Sp = S + x*D;
//assembly
l_cost((short)delta0,(short)delta1,(short)delta2,(short)delta3,Lr_p0,Lr_p1,Cp,Sp,minLr[0]+xm,D);
/*int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
for (d = 0; d < D; d++)
{
int Cpd = Cp[d], L0, L1, L2, L3;

L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0;
L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d - 1] + P1, std::min(Lr_p1[d + 1] + P1, delta1))) - delta1;
L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d - 1] + P1, std::min(Lr_p2[d + 1] + P1, delta2))) - delta2;
L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d - 1] + P1, std::min(Lr_p3[d + 1] + P1, delta3))) - delta3;

Lr_p[d] = (CostType)L0;
minL0 = std::min(minL0, L0);

Lr_p[d + D2] = (CostType)L1;
minL1 = std::min(minL1, L1);

Lr_p[d + D2 * 2] = (CostType)L2;
minL2 = std::min(minL2, L2);

Lr_p[d + D2 * 3] = (CostType)L3;
minL3 = std::min(minL3, L3);

Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
}
minLr[0][xm] = (CostType)minL0;
minLr[0][xm + 1] = (CostType)minL1;
minLr[0][xm + 2] = (CostType)minL2;
minLr[0][xm + 3] = (CostType)minL3;*/

}

if (pass == npasses)
{
for (x = 0; x < width; x++)
{
disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
disp2cost[x] = MAX_COST;
}

for (x = width1 - 1; x >= 0; x--)
{
CostType* Sp = S + x*D;
int minS = MAX_COST, bestDisp = -1;

if (npasses == 1)
{
int xm = x*NR2, xd = xm*D2;
//int minL0 = MAX_COST;
int delta0 = minLr[0][xm + NR2] + P2;
CostType* Lr_p0 = Lr[0] + xd + NRD2;
Lr_p0[-1] = Lr_p0[D] = MAX_COST;
//CostType* Lr_p = Lr[0] + xd;
CostType* Cp = C + x*D;
//assembly
x_cost((short)delta0,Cp,Lr_p0,Lr[0]+xd,Sp,d8,D,minvalue);
minLr[0][xm]=minvalue[0];
minS=minvalue[1];
int i=4;
 
while(minvalue[i]==0&&i<11)
{
i++;
}
bestDisp=minvalue[i];
/*for (d = 0; d < D; d++)
{
int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0;
Lr_p[d] = (CostType)L0;
minL0 = std::min(minL0, L0);
int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
if (Sval < minS)
{
minS = Sval;
bestDisp = d;
}
}
minLr[0][xm] = (CostType)minL0;*/

}
else
{
for (d = 0; d < D; d++)
{
int Sval = Sp[d];
if (Sval < minS)
{
minS = Sval;
bestDisp = d;
}
}
}

for (d = 0; d < D; d++)
{
if (Sp[d] * (100 - uniquenessRatio) < minS * 100 && std::abs(bestDisp - d) > 1)
break;
}
if (d < D)
continue;
d = bestDisp;
int _x2 = x + minX1 - d - minD;
if (disp2cost[_x2] > minS)
{
disp2cost[_x2] = (CostType)minS;
disp2ptr[_x2] = (DispType)(d + minD);
}

if (0 < d && d < D - 1)
{
// do subpixel quadratic interpolation:
//   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
//   then find minimum of the parabola.
int denom2 = std::max(Sp[d - 1] + Sp[d + 1] - 2 * Sp[d], 1);
d = d*DISP_SCALE + ((Sp[d - 1] - Sp[d + 1])*DISP_SCALE + denom2) / (denom2 * 2);
}
else
d *= DISP_SCALE;
disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
}

for (x = minX1; x < maxX1; x++)
{
// we round the computed disparity both towards -inf and +inf and check
// if either of the corresponding disparities in disp2 is consistent.
// This is to give the computed disparity a chance to look valid if it is.
int d1 = disp1ptr[x];
if (d1 == INVALID_DISP_SCALED)
continue;
int _d = d1 >> DISP_SHIFT;
int d_ = (d1 + DISP_SCALE - 1) >> DISP_SHIFT;
int _x = x - _d, x_ = x - d_;
if (0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff)
disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
}
}

// now shift the cyclic buffers
std::swap(Lr[0], Lr[1]);
std::swap(minLr[0], minLr[1]);
}
}

}

void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr,int mindis,int numdis)

{
Mat left = leftarr.getMat(), right = rightarr.getMat();
CV_Assert(left.size() == right.size() && left.type() == right.type() &&
left.depth() == CV_8U);

disparr.create(left.size(), CV_16S);
Mat disp = disparr.getMat();
StereoSGBMParams params(mindis, numdis, 5, 200, 800, 10, 4, 1, 150, 2, 0);
Mat buffer;
computeDisparitySGBM(left, right, disp, params, buffer);
medianBlur(disp, disp, 3);

if (params.speckleWindowSize > 0)
filterSpeckles(disp, (params.minDisparity - 1) * 16, params.speckleWindowSize,
16 * params.speckleRange, buffer);

}

struct Data{
Mat img1,img2,disp;
int mindis,numdis;

};

void*do_compute(void*arg)

{ Data* data=(Data*)arg;
compute(data->img1,data->img2,data->disp,data->mindis,data->numdis);
pthread_exit(NULL);

}

void ComputeDisparity(Mat &image1,Mat &image2,Mat &disparity,int mindis,int numdis,int numthread=1)

{
if(numthread>1)
{
int height = image1.rows;
int width = image1.cols;
disparity.create(image1.size(),CV_16S);
int thread_height=height/numthread;
int d = height*0.07;
Data data[numthread];
Rect forseg[numthread];
for(int i=0;i<numthread-1;i++)
{
forseg[i]={0,i*thread_height,width,thread_height+d};
}
int final_height=(numthread-1)*thread_height;
forseg[numthread-1]={0,final_height,width,height-final_height};
for(int i=0;i<numthread;i++)
{
image1(forseg[i]).copyTo(data[i].img1);
image2(forseg[i]).copyTo(data[i].img2);
data[i].mindis=mindis;
data[i].numdis=numdis;
}
pthread_t p_thread[numthread-1];
for (int i=0;i<numthread-1;i++)
{
int err;
err=pthread_create(&p_thread[i],NULL,do_compute,&data[i+1]);
if(err!=0)
cerr<<"Create pthread failed!\n";
}
compute(data[0].img1,data[0].img2,data[0].disp,data[0].mindis,data[0].numdis);
for (int i=0;i<numthread-1;i++)
{
pthread_join(p_thread[i],NULL);
}
Rect formerge[numthread+1];
for(int i=0;i<numthread-2;i++)
{
formerge[i]={0,(i+1)*thread_height+d,width,thread_height};
}
int final_merge_height=(numthread-1)*thread_height+d;
formerge[numthread-2]={0,final_merge_height,width,height-final_merge_height};
formerge[numthread-1]={0,d,width,thread_height};
formerge[numthread]={0,d,width,height-final_merge_height};
data[0].disp.copyTo(disparity(forseg[0]));
for(int i=1;i<numthread-1;i++)
{
data[i].disp(formerge[numthread-1]).copyTo(disparity(formerge[i-1]));
}
data[numthread-1].disp(formerge[numthread]).copyTo(disparity(formerge[numthread-2]));
}
else
compute(image1,image2,disparity,mindis,numdis);

}

archermind@flm:~/work/jni$ cat test.cpp

#include "opencv2/core/core.hpp"

#include "opencv2/highgui/highgui.hpp"

#include "opencv2/imgproc/imgproc.hpp"

#include <iostream>

using namespace std;

using namespace cv;

void ComputeDisparity(Mat &image1,Mat &image2,Mat &disparity,int mindis,int numdis,int numthread=1);

int main(){
Mat img1 = imread("tsukuba1color.png", 0);
Mat img2 = imread("tsukuba2color.png", 0);
Mat disparity,s;
for(int i=1;i<2;i++){
int64 t1=getTickCount();
ComputeDisparity(img1,img2,disparity,-16,16);
t1=getTickCount()-t1;
cout<<"1 threads:"<<t1/getTickFrequency()<<endl;
int64 t2=getTickCount();
ComputeDisparity(img1,img2,disparity,-16,16,2);
t2=getTickCount()-t2;
cout<<"2 threads:"<<t2/getTickFrequency()<<endl;
int64 t3=getTickCount();
ComputeDisparity(img1,img2,disparity,-16,16,3);
t3=getTickCount()-t3;
cout<<"3 threads:"<<t3/getTickFrequency()<<endl;
int64 t4=getTickCount();
ComputeDisparity(img1,img2,disparity,-16,16,4);
t4=getTickCount()-t4;
cout<<"4 threads:"<<t4/getTickFrequency()<<endl;
int64 t5=getTickCount();
ComputeDisparity(img1,img2,disparity,-16,16,5);
t5=getTickCount()-t5;
cout<<"5 threads:"<<t5/getTickFrequency()<<endl;
}
normalize(disparity, s, 0, 255, CV_MINMAX, CV_8U);
return 0;

}

archermind@flm:~/work/jni$ 

archermind@flm:~/work/jni$ cat x_cost.s 
.arm
.text
.global x_cost

x_cost:
push {r4-r6}
ldr r12,[sp,#12]@Sp
ldr r4,[sp,#16]@d8
ldr r5,[sp,#20] @D
ldr r6,[sp,#24] @minvalue
lsr r5,r5,#3
vdup.16       q0,r0       @delta0
mov           r0,#200     
vdup.16       q1,r0       @p1
mov         r0,#255
lsl    r0,r0,#7
add         r0,r0,#127
vdup.16
q9,r0    @minL0
vdup.16
q10,r0  @mins
mov r0,#-1
vdup.16
q11,r0  @bestdisp
vld1.16
{q12},[r4] @d8
mov r0,#8
vdup.16
q13,r0 @_8

.loop:
vld1.16       {q2},[r1]!    @Cp[d]
vld1.16       {q3},[r2]    @Lr_p0[d]
vmin.s16
  q8,q3,q0 @min(delta0,Lr_p0[d])
sub r2,r2,#2
vld1.16
{q3},[r2] @Lr_p0[d-1]
vqadd.s16
q3,q3,q1 @Lr_p0[d-1]+P1
vmin.s16
q8,q8,q3 @min(delta0,Lr_p0[d],Lr_p0[d-1]+P1)
add r2,r2,#4
vld1.16
{q3},[r2] @Lr_p0[d+1]
vqadd.s16
q3,q3,q1 @Lr_p0[d+1]+P1
vmin.s16
q8,q8,q3 @min(delta0,Lr_p0[d],Lr_p0[d-1]+P1,Lr_p0[d+1]+P1)
vqsub.s16
q8,q8,q0 @minus delta0
vqadd.s16
q8,q8,q2 @L0
vst1.16
{q8},[r3]! @Lr_p[d]
vmin.s16
q9,q9,q8 @min(minL0,L0)
vld1.16
{q2},[r12] @Sp[d]
vqadd.s16
q2,q2,q8 @Sp[d]=Sp[d]+L0
vst1.16
{q2},[r12]! 
vcgt.s16
q3,q10,q2 @mask
vmin.s16
q10,q10,q2 @min(mins,sp[d])
veor q8,q11,q12@xor(bestdisp,d8)
vand q8,q8,q3
veor q11,q11,q8@bestdisp
vqadd.s16
q12,q12,q13 @d8+8
add r2,r2, #14
subs r5,r5,#1
bne         .loop

vmin.s16
d0,d18,d19
vmin.s16
d1,d20,d21
vtrn.16
d0,d1
vmin.s16
d0,d0,d1
vshr.s64
d2,d0,#32
vmin.s16
d0,d2,d0
vst1.16
{d0},[r6]!

vdup.16
q3,d0[1]
vceq.s16
q3,q10,q3 @qs
vand q2,q11,q3
vst1.16
{q2},[r6]!
pop           {r4-r6}
bx            lr
.end

archermind@flm:~/work/jni$ ndk-build

Android NDK: OpenCV: You should ignore warning about 'non-system libraries in linker flags' and 'opencv_java' library.    

Android NDK:         'OPENCV_INSTALL_MODULES:=on' can be used to build APK with included OpenCV binaries    

Android NDK: WARNING:/home/archermind/work/jni/Android.mk:test: non-system libraries in linker flags: -lopencv_java3    

Android NDK:     This is likely to result in incorrect builds. Try using LOCAL_STATIC_LIBRARIES    

Android NDK:     or LOCAL_SHARED_LIBRARIES instead to list the library dependencies of the    

Android NDK:     current module    

[armeabi-v7a ] Compile arm    : test <= cost_cal.s

[armeabi-v7a ] Compile arm    : test <= cost_final.s

[armeabi-v7a ] Compile arm    : test <= cost_init.s

[armeabi-v7a ] Compile arm    : test <= l_cost.s

[armeabi-v7a ] Compile arm    : test <= x_cost.s

[armeabi-v7a ] Compile++ arm  : test <= test.cpp

[armeabi-v7a ] Compile++ arm  : test <= sgbm.cpp

[armeabi-v7a ] Executable     : test

[armeabi-v7a ] Install        : test => jni/test

archermind@flm:~/work/jni$ 

在开发板上执行的结果

amt6757_wifi_n:/data # ./test

1 threads:0.104967

2 threads:0.0807542

3 threads:0.0708366

4 threads:0.0683645

5 threads:0.0744135

amt6757_wifi_n:/data #

我这里的左右图的大小都是 384X288
https://github.com/southmountain/SgbmwithNeon
文件提取链接
http://pan.baidu.com/s/1o8KF8tO
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  ndk Android.mk .s neon opencv