使用 NEON 内嵌汇编实现 YUV 转 RGB 功能时出现 stopped 错误的原因
2017-11-27 11:00
1261 查看
我在 C 语言代码中调用内嵌的 NEON 汇编时,算法运行两次后发生了 stopped 错误。最后发现原因是内嵌汇编的参数(操作数)属性设置错误,于是我调换了参数的位置,并修改了其中某个参数的读写属性。
#include <arm_neon.h>

#include <chrono>
#include <cstdio>
#include <vector>

#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
using namespace cv;
using namespace std;
// Start addresses of the three planes of one planar YUV frame.
// The struct only stores the pointers; it never allocates or releases the
// memory they refer to. All members default to nullptr so that a
// default-constructed instance is never left with indeterminate pointers.
struct YUVBufferAddr{
    unsigned char* Yaddr = nullptr;  // first byte of the Y plane
    unsigned char* Uaddr = nullptr;  // first byte of the U plane
    unsigned char* Vaddr = nullptr;  // first byte of the V plane
};
// Minimal stopwatch built on std::chrono::steady_clock.
// Call start() to set the reference instant; stop() and the *_display()
// helpers measure against that instant without resetting it.
class q_timer {
    typedef std::chrono::steady_clock clock_type;

public:
    // Remembers "now" as the reference instant for later measurements.
    void start()
    {
        m_start = clock_type::now();
    }

    // Returns the number of seconds elapsed since the reference instant.
    double stop()
    {
        const std::chrono::duration<double> elapsed = clock_type::now() - m_start;
        return elapsed.count();
    }

    // Prints the elapsed time divided by nr_frame, labelled with disp.
    void time_display(const char *disp = "", int nr_frame = 1)
    {
        const double seconds_per_frame = stop() / nr_frame;
        printf("Running time (%s) is: %5.5f Seconds.\n", disp, seconds_per_frame);
    }

    // Prints nr_frame divided by the elapsed time, labelled with disp.
    void fps_display(const char *disp = "", int nr_frame = 1)
    {
        const double frames_per_second = static_cast<double>(nr_frame) / stop();
        printf("Running time (%s) is: %5.5f frame per second.\n", disp, frames_per_second);
    }

private:
    clock_type::time_point m_start;  // reference instant set by start()
};
void ImgYUV2RGB24_neon(unsigned char *pu8RgbBuffer,
unsigned char *pu8SrcYUV,
int l32Width,
int l32Height,
unsigned char* u ,
unsigned char* v)
{
asm volatile (
"add r4, %2, %2 , lsl #1 \n"
"mul r5, r4, %3 \n"
"sub r5, r5, r4 \n"
"mul r5, %2, %3 \n"
"add %4, %1, r5 \n"
"add %5, %4, r5, lsr #2 \n"
"mov r8, %2, lsr #3 \n"
"mov r11, %3, lsr #1 \n"
"add %3, %1, %2 \n"
"mov r5, %0 \n"
"add %0, r5, r4 \n"
"mov r9, #16 \n"
"vdup.8 d8, r9 \n"
"mov r10, #128 \n"
"vdup.8 d9, r10 \n"
"mov r9, #75 \n"
"vdup.16 q5, r9 \n"
"mov r10, #102 \n"
"vdup.16 q6, r10 \n"
"mov r9, #25 \n"
"vdup.16 q7, r9 \n"
"mov r10, #52 \n"
"vdup.16 q8, r10 \n"
"mov r9, #129 \n"
"vdup.16 q9, r9 \n"
"2: \n"
"1: \n"
"subs r8, r8, #1 \n"
"vld1.u8 d0, [%1]! \n"
"vld1.u8 d2, [%3]! \n"
"vld1.32 {d4[0]}, [%4]! \n"
"vld1.32 {d4[1]}, [%5]! \n"
"vsubl.u8 q0, d0, d8 \n"
"vsubl.u8 q1, d2, d8 \n"
"vsubl.u8 q2, d4, d9 \n"
"vmov q3, q2 \n"
"vzip.s16 q2, q3 \n"
"vmul.s16 q10, q3, q8 \n"
"vmla.s16 q10, q2, q7 \n"
"vmul.s16 q11, q2, q9 \n"
"vmul.s16 q12, q3, q6 \n"
"vmul.s16 q0, q0, q5 \n"
"vmul.s16 q1, q1, q5 \n"
"vqsub.s16 q13, q0, q10 \n"
"vqsub.s16 q14, q1, q10 \n"
"vqrshrun.s16 d27, q13, #6 \n"
"vqrshrun.s16 d30, q14, #6 \n"
"vqadd.s16 q10, q0, q11 \n"
"vqadd.s16 q11, q1, q11 \n"
"vqrshrun.s16 d26, q10, #6 \n"
"vqrshrun.s16 d29, q11, #6 \n"
"vqadd.s16 q11, q0, q12 \n"
"vqadd.s16 q12, q1, q12 \n"
"vqrshrun.s16 d28, q11, #6 \n"
"vqrshrun.s16 d31, q12, #6 \n"
"vst3.8 {d26, d27, d28}, [%0]! \n"
"vst3.8 {d29, d30, d31}, [r5]! \n"
"bgt 1b \n"
"subs r11, r11, #1 \n"
"mov r5,%0 \n"
"add %0,r5,r4 \n"
"add %1, %1, %2 \n"
"add %3, %3, %2 \n"
"mov r8, %2, lsr #3 \n"
"bgt 2b \n"
: "+r"(pu8RgbBuffer), // %0 output readwrite
"+r"(pu8SrcYUV), // %1
"+r"(l32Width),// %2 readonly
"+r"(l32Height), // %3
"+r"(u), // %4
"+r"(v) // %5
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15","r4","r5","r8","r9","r10","r11"
);
}
void ImgYUV2RGB24_neoncopy(unsigned char *pu8RgbBuffer,
unsigned char *pu8SrcYUV,
unsigned char* u ,
unsigned char* v,
int l32Height,
int l32Width)
{
asm volatile (
"add r4, %5, %5 , lsl #1 \n"
"mul r5, r4, %4 \n"
"sub r5, r5, r4 \n"
"mul r5, %5, %4 \n"
"add %2, %1, r5 \n"
"add %3, %2, r5, lsr #2 \n"
"mov r8, %5, lsr #3 \n"
"mov r6, %4, lsr #1 \n"
"add %4, %1, %5 \n"
"mov r5, %0 \n"
"add %0, r5, r4 \n"
"mov r9, #16 \n"
"vdup.8 d8, r9 \n"
"mov r10, #128 \n"
"vdup.8 d9, r10 \n"
"mov r9, #75 \n"
"vdup.16 q5, r9 \n"
"mov r10, #102 \n"
"vdup.16 q6, r10 \n"
"mov r9, #25 \n"
"vdup.16 q7, r9 \n"
"mov r10, #52 \n"
"vdup.16 q8, r10 \n"
"mov r9, #129 \n"
"vdup.16 q9, r9 \n"
"2:
\n"
"1: \n"
"subs r8, r8, #1 \n"
"vld1.u8 d0, [%1]! \n"
"vld1.u8 d2, [%4]! \n"
"vld1.32 {d4[0]}, [%2]! \n"
"vld1.32 {d4[1]}, [%3]! \n"
"vsubl.u8 q0, d0, d8 \n"
"vsubl.u8 q1, d2, d8 \n"
"vsubl.u8 q2, d4, d9 \n"
"vmov q3, q2 \n"
"vzip.s16 q2, q3 \n"
"vmul.s16 q10, q3, q8 \n"
"vmla.s16 q10, q2, q7 \n"
"vmul.s16 q11, q2, q9 \n"
"vmul.s16 q12, q3, q6 \n"
"vmul.s16 q0, q0, q5 \n"
"vmul.s16 q1, q1, q5 \n"
"vqsub.s16 q13, q0, q10 \n"
"vqsub.s16 q14, q1, q10 \n"
"vqrshrun.s16 d27, q13, #6 \n"
"vqrshrun.s16 d30, q14, #6 \n"
"vqadd.s16 q10, q0, q11 \n"
"vqadd.s16 q11, q1, q11 \n"
"vqrshrun.s16 d26, q10, #6 \n"
"vqrshrun.s16 d29, q11, #6 \n"
"vqadd.s16 q11, q0, q12 \n"
"vqadd.s16 q12, q1, q12 \n"
"vqrshrun.s16 d28, q11, #6 \n"
"vqrshrun.s16 d31, q12, #6 \n"
"vst3.8 {d26, d27, d28}, [%0]! \n"
"vst3.8 {d29, d30, d31}, [r5]! \n"
"bgt 1b \n"
"subs r6, r6, #1 \n"
"mov r5,%0 \n"
"add %0,r5,r4 \n"
"add %1, %1, %5 \n"
"add %4, %4, %5 \n"
"mov r8, %5, lsr #3 \n"
"bgt 2b \n"
: "+r"(pu8RgbBuffer), // %0 output readwrite
"+r"(pu8SrcYUV), // %1
"+r"(u), // %2
"+r"(v), // %3
"+r"(l32Height) // %4
: "r"(l32Width) // %5 readonly
: "cc", "memory","r4","r5","r8","r9","r10","r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
int main()
{
int width;
int height;
unsigned char *main_camera = NULL;
unsigned char *dest = NULL;
std::FILE *f_left = NULL;
f_left = std::fopen("/data/MV_F_Cap1_3000.yuv", "rb");
if (NULL == f_left )
return -1;
width = 4208;
height = 3120;
main_camera = new unsigned char[width*height*3/2];
std::fread(&main_camera[0], sizeof(char), height*width*3/2, f_left);
fclose(f_left); f_left = NULL;
YUVBufferAddr main_addr;
main_addr.Yaddr = main_camera;
main_addr.Uaddr = &main_camera[height*width];
main_addr.Vaddr = &main_camera[height*width + height*width / 4];
dest = new unsigned char[4208*3120*3];
q_timer time;
cv::Mat dst = cv::Mat(height,width, CV_8UC3, dest);
time.start();
ImgYUV2RGB24_neon(dest, main_addr.Yaddr, width , height , main_addr.Uaddr, main_addr.Vaddr);
//ImgYUV2RGB24_neoncopy(dest, main_addr.Yaddr, main_addr.Uaddr, main_addr.Vaddr, height ,width );
time.time_display("yuv2rgb_asm");
time.start();
ImgYUV2RGB24_neon(dest, main_addr.Yaddr, width , height , main_addr.Uaddr, main_addr.Vaddr);
//ImgYUV2RGB24_neoncopy(dest, main_addr.Yaddr, main_addr.Uaddr, main_addr.Vaddr, height ,width );
time.time_display("yuv2rgb_asm");
cv::imwrite("rgb23.png", dst);
return 0;
}
代码下载链接:yuv2rgb_stopped.tar.gz
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <chrono>
#include <arm_neon.h>
#include "opencv2/imgproc/imgproc.hpp"
using namespace cv;
using namespace std;
// Start addresses of the three planes of one planar YUV frame.
// The struct only stores the pointers; it never allocates or releases the
// memory they refer to. All members default to nullptr so that a
// default-constructed instance is never left with indeterminate pointers.
struct YUVBufferAddr{
    unsigned char* Yaddr = nullptr;  // first byte of the Y plane
    unsigned char* Uaddr = nullptr;  // first byte of the U plane
    unsigned char* Vaddr = nullptr;  // first byte of the V plane
};
// Minimal stopwatch built on std::chrono::steady_clock.
// Call start() to set the reference instant; stop() and the *_display()
// helpers measure against that instant without resetting it.
class q_timer {
    typedef std::chrono::steady_clock clock_type;

public:
    // Remembers "now" as the reference instant for later measurements.
    void start()
    {
        m_start = clock_type::now();
    }

    // Returns the number of seconds elapsed since the reference instant.
    double stop()
    {
        const std::chrono::duration<double> elapsed = clock_type::now() - m_start;
        return elapsed.count();
    }

    // Prints the elapsed time divided by nr_frame, labelled with disp.
    void time_display(const char *disp = "", int nr_frame = 1)
    {
        const double seconds_per_frame = stop() / nr_frame;
        printf("Running time (%s) is: %5.5f Seconds.\n", disp, seconds_per_frame);
    }

    // Prints nr_frame divided by the elapsed time, labelled with disp.
    void fps_display(const char *disp = "", int nr_frame = 1)
    {
        const double frames_per_second = static_cast<double>(nr_frame) / stop();
        printf("Running time (%s) is: %5.5f frame per second.\n", disp, frames_per_second);
    }

private:
    clock_type::time_point m_start;  // reference instant set by start()
};
void ImgYUV2RGB24_neon(unsigned char *pu8RgbBuffer,
unsigned char *pu8SrcYUV,
int l32Width,
int l32Height,
unsigned char* u ,
unsigned char* v)
{
asm volatile (
"add r4, %2, %2 , lsl #1 \n"
"mul r5, r4, %3 \n"
"sub r5, r5, r4 \n"
"mul r5, %2, %3 \n"
"add %4, %1, r5 \n"
"add %5, %4, r5, lsr #2 \n"
"mov r8, %2, lsr #3 \n"
"mov r11, %3, lsr #1 \n"
"add %3, %1, %2 \n"
"mov r5, %0 \n"
"add %0, r5, r4 \n"
"mov r9, #16 \n"
"vdup.8 d8, r9 \n"
"mov r10, #128 \n"
"vdup.8 d9, r10 \n"
"mov r9, #75 \n"
"vdup.16 q5, r9 \n"
"mov r10, #102 \n"
"vdup.16 q6, r10 \n"
"mov r9, #25 \n"
"vdup.16 q7, r9 \n"
"mov r10, #52 \n"
"vdup.16 q8, r10 \n"
"mov r9, #129 \n"
"vdup.16 q9, r9 \n"
"2: \n"
"1: \n"
"subs r8, r8, #1 \n"
"vld1.u8 d0, [%1]! \n"
"vld1.u8 d2, [%3]! \n"
"vld1.32 {d4[0]}, [%4]! \n"
"vld1.32 {d4[1]}, [%5]! \n"
"vsubl.u8 q0, d0, d8 \n"
"vsubl.u8 q1, d2, d8 \n"
"vsubl.u8 q2, d4, d9 \n"
"vmov q3, q2 \n"
"vzip.s16 q2, q3 \n"
"vmul.s16 q10, q3, q8 \n"
"vmla.s16 q10, q2, q7 \n"
"vmul.s16 q11, q2, q9 \n"
"vmul.s16 q12, q3, q6 \n"
"vmul.s16 q0, q0, q5 \n"
"vmul.s16 q1, q1, q5 \n"
"vqsub.s16 q13, q0, q10 \n"
"vqsub.s16 q14, q1, q10 \n"
"vqrshrun.s16 d27, q13, #6 \n"
"vqrshrun.s16 d30, q14, #6 \n"
"vqadd.s16 q10, q0, q11 \n"
"vqadd.s16 q11, q1, q11 \n"
"vqrshrun.s16 d26, q10, #6 \n"
"vqrshrun.s16 d29, q11, #6 \n"
"vqadd.s16 q11, q0, q12 \n"
"vqadd.s16 q12, q1, q12 \n"
"vqrshrun.s16 d28, q11, #6 \n"
"vqrshrun.s16 d31, q12, #6 \n"
"vst3.8 {d26, d27, d28}, [%0]! \n"
"vst3.8 {d29, d30, d31}, [r5]! \n"
"bgt 1b \n"
"subs r11, r11, #1 \n"
"mov r5,%0 \n"
"add %0,r5,r4 \n"
"add %1, %1, %2 \n"
"add %3, %3, %2 \n"
"mov r8, %2, lsr #3 \n"
"bgt 2b \n"
: "+r"(pu8RgbBuffer), // %0 output readwrite
"+r"(pu8SrcYUV), // %1
"+r"(l32Width),// %2 readonly
"+r"(l32Height), // %3
"+r"(u), // %4
"+r"(v) // %5
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15","r4","r5","r8","r9","r10","r11"
);
}
void ImgYUV2RGB24_neoncopy(unsigned char *pu8RgbBuffer,
unsigned char *pu8SrcYUV,
unsigned char* u ,
unsigned char* v,
int l32Height,
int l32Width)
{
asm volatile (
"add r4, %5, %5 , lsl #1 \n"
"mul r5, r4, %4 \n"
"sub r5, r5, r4 \n"
"mul r5, %5, %4 \n"
"add %2, %1, r5 \n"
"add %3, %2, r5, lsr #2 \n"
"mov r8, %5, lsr #3 \n"
"mov r6, %4, lsr #1 \n"
"add %4, %1, %5 \n"
"mov r5, %0 \n"
"add %0, r5, r4 \n"
"mov r9, #16 \n"
"vdup.8 d8, r9 \n"
"mov r10, #128 \n"
"vdup.8 d9, r10 \n"
"mov r9, #75 \n"
"vdup.16 q5, r9 \n"
"mov r10, #102 \n"
"vdup.16 q6, r10 \n"
"mov r9, #25 \n"
"vdup.16 q7, r9 \n"
"mov r10, #52 \n"
"vdup.16 q8, r10 \n"
"mov r9, #129 \n"
"vdup.16 q9, r9 \n"
"2:
\n"
"1: \n"
"subs r8, r8, #1 \n"
"vld1.u8 d0, [%1]! \n"
"vld1.u8 d2, [%4]! \n"
"vld1.32 {d4[0]}, [%2]! \n"
"vld1.32 {d4[1]}, [%3]! \n"
"vsubl.u8 q0, d0, d8 \n"
"vsubl.u8 q1, d2, d8 \n"
"vsubl.u8 q2, d4, d9 \n"
"vmov q3, q2 \n"
"vzip.s16 q2, q3 \n"
"vmul.s16 q10, q3, q8 \n"
"vmla.s16 q10, q2, q7 \n"
"vmul.s16 q11, q2, q9 \n"
"vmul.s16 q12, q3, q6 \n"
"vmul.s16 q0, q0, q5 \n"
"vmul.s16 q1, q1, q5 \n"
"vqsub.s16 q13, q0, q10 \n"
"vqsub.s16 q14, q1, q10 \n"
"vqrshrun.s16 d27, q13, #6 \n"
"vqrshrun.s16 d30, q14, #6 \n"
"vqadd.s16 q10, q0, q11 \n"
"vqadd.s16 q11, q1, q11 \n"
"vqrshrun.s16 d26, q10, #6 \n"
"vqrshrun.s16 d29, q11, #6 \n"
"vqadd.s16 q11, q0, q12 \n"
"vqadd.s16 q12, q1, q12 \n"
"vqrshrun.s16 d28, q11, #6 \n"
"vqrshrun.s16 d31, q12, #6 \n"
"vst3.8 {d26, d27, d28}, [%0]! \n"
"vst3.8 {d29, d30, d31}, [r5]! \n"
"bgt 1b \n"
"subs r6, r6, #1 \n"
"mov r5,%0 \n"
"add %0,r5,r4 \n"
"add %1, %1, %5 \n"
"add %4, %4, %5 \n"
"mov r8, %5, lsr #3 \n"
"bgt 2b \n"
: "+r"(pu8RgbBuffer), // %0 output readwrite
"+r"(pu8SrcYUV), // %1
"+r"(u), // %2
"+r"(v), // %3
"+r"(l32Height) // %4
: "r"(l32Width) // %5 readonly
: "cc", "memory","r4","r5","r8","r9","r10","r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
int main()
{
int width;
int height;
unsigned char *main_camera = NULL;
unsigned char *dest = NULL;
std::FILE *f_left = NULL;
f_left = std::fopen("/data/MV_F_Cap1_3000.yuv", "rb");
if (NULL == f_left )
return -1;
width = 4208;
height = 3120;
main_camera = new unsigned char[width*height*3/2];
std::fread(&main_camera[0], sizeof(char), height*width*3/2, f_left);
fclose(f_left); f_left = NULL;
YUVBufferAddr main_addr;
main_addr.Yaddr = main_camera;
main_addr.Uaddr = &main_camera[height*width];
main_addr.Vaddr = &main_camera[height*width + height*width / 4];
dest = new unsigned char[4208*3120*3];
q_timer time;
cv::Mat dst = cv::Mat(height,width, CV_8UC3, dest);
time.start();
ImgYUV2RGB24_neon(dest, main_addr.Yaddr, width , height , main_addr.Uaddr, main_addr.Vaddr);
//ImgYUV2RGB24_neoncopy(dest, main_addr.Yaddr, main_addr.Uaddr, main_addr.Vaddr, height ,width );
time.time_display("yuv2rgb_asm");
time.start();
ImgYUV2RGB24_neon(dest, main_addr.Yaddr, width , height , main_addr.Uaddr, main_addr.Vaddr);
//ImgYUV2RGB24_neoncopy(dest, main_addr.Yaddr, main_addr.Uaddr, main_addr.Vaddr, height ,width );
time.time_display("yuv2rgb_asm");
cv::imwrite("rgb23.png", dst);
return 0;
}
代码下载链接:yuv2rgb_stopped.tar.gz
相关文章推荐
- yuv转rgb的汇编实现
- sw_scale中实现yuv420转rgb888——neon汇编优化
- 一个获取rgb的小程序,有兴趣的可以交流交流,好多地方还不够完美,不过功能可以实现(C# winForm)
- 【OpenGL】用OpenGL shader实现将YUV(YUV420,YV12)转RGB-(直接调用GPU实现,纯硬件方式,效率高) 这段时间一直在搞视频格式的转换问题,终于最近将一个图片的YUV
- 采用6个参数的C语言的汇编实现yuv转rgb
- JAVA编写的浏览器,在别人的基础上做了些许修改,实现了前进,后退,刷新功能,添加了一个搜索框,具体情况在运行结果中的文件-->注意中
- 【难】【队列】实现一个支持插入、删除和查找最大值三种功能的队列
- SDK编程:任务栏TaskBar与对话框Dialog实现一个简单功能
- 用两个栈实现一个队列的功能 && 两个队列实现一个栈的功能 && 代码实例
- C/C++之用两个栈实现一个队列的功能
- j2me 实现翻页功能的一个小例子
- 关于一个JS功能实现的思维方式
- 【Android游戏开发十六】Android Gesture之【触摸屏手势识别】操作!利用触摸屏手势实现一个简单切换图片的功能!
- 请编写一个Java程序,接收一个大于等于2的整数,实现下面功能。
- 基于PHP实现一个简单的在线聊天功能
- 完成一个学生管理程序,使用学号作为键添加5个学生对象,并可以将全部信息保存在文件中,可以实现对学生信息的学号查找,输出全部学生信息的功能。
- 用两个栈实现一个队列的功能
- 用C语言模拟实现一个通讯录,要求实现其添加、删除、修改、查找、显示和排序联系人信息的功能
- 一个mfc实现的简单计算功能
- 模仿qsort的功能实现一个通用的冒泡排序。