在安卓机上测试内联汇编和直接写成.s文件以及C语言的速率比较
2017-11-14 12:15
417 查看
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <chrono>
#include <arm_neon.h>
#include "opencv2/imgproc/imgproc.hpp"
#define ARRAY_SIZE (10240)
extern "C" void add_mine(int* dst, int* src1, int* src2, int count);
void add_float_neon2(int* dst, int* src1, int* src2, int count);
// Small stopwatch built on std::chrono::steady_clock.
// start() stores a reference instant; stop() reports the seconds elapsed
// since that instant without modifying it, so stop() may be called repeatedly.
class q_timer {
    using clock_type = std::chrono::steady_clock;
    using seconds_type = std::chrono::duration<double>;

    // Reference instant written by start().
    clock_type::time_point m_start;

public:
    // Stores the current steady-clock time as the reference instant.
    void start()
    {
        m_start = clock_type::now();
    }

    // Returns the time elapsed since the reference instant, in seconds.
    double stop()
    {
        const clock_type::time_point finish = clock_type::now();
        const seconds_type elapsed = std::chrono::duration_cast<seconds_type>(finish - m_start);
        return elapsed.count();
    }

    // Prints the elapsed seconds divided by nr_frame, labelled with disp.
    void time_display(const char *disp = "", int nr_frame = 1)
    {
        const double seconds_per_frame = stop() / nr_frame;
        printf("Running time (%s) is: %5.5f Seconds.\n", disp, seconds_per_frame);
    }

    // Prints nr_frame divided by the elapsed seconds, labelled with disp.
    void fps_display(const char *disp = "", int nr_frame = 1)
    {
        const double frames_per_second = (double)nr_frame / stop();
        printf("Running time (%s) is: %5.5f frame per second.\n", disp, frames_per_second);
    }
};
// Scalar baseline: writes src1[k] + src2[k] into dst[k] for k = 0 .. count-1,
// in ascending order. Nothing is written when count is zero or negative.
void add_int_c(int* dst, int* src1, int* src2, int count)
{
    for (int remaining = count; remaining > 0; --remaining)
        *dst++ = *src1++ + *src2++;
}
using namespace cv;
// Benchmark driver: runs each of the three adders 10000 times over
// ARRAY_SIZE-element int arrays, printing the elapsed time and a few sample
// results after each run.
int main()
{
q_timer time;
// Zero-initialised working buffers: a, b and d are inputs; c and e receive results.
int a[ARRAY_SIZE]={0};
int b[ARRAY_SIZE]={0};
int c[ARRAY_SIZE]={0};
int d[ARRAY_SIZE]={0};
int e[ARRAY_SIZE]={0};
// Fill the inputs so that a[i] + b[i] == 2*i and a[i] + d[i] == 2*i + 1.
for(int i=0;i<ARRAY_SIZE;i++)
{
a[i]=i;b[i]=i;d[i]=i+1;
}
// Run 1: plain C++ loop (add_int_c), c = a + b.
time.start();
for(int i=0;i<10000;i++)
add_int_c(c, a, b,ARRAY_SIZE);
// time_display is called with its default nr_frame of 1, so the printed
// figure is the total for all 10000 passes, not a per-pass time.
time.time_display(" c ");
printf("c[1]=%d\n",c[1]);
printf("c[300]=%d\n",c[300]);
printf("c[600]=%d\n",c[600]);
printf("c[1023]=%d\n",c[1023]);
// Run 2: add_mine, declared extern "C" and defined outside this file, e = a + d.
time.start();
for(int j=0;j<10000;j++)
add_mine(e, a, d,ARRAY_SIZE);
time.time_display(" asm ");
printf("e[1023]=%d\n",e[1023]);
// Run 3: inline-assembly routine add_float_neon2, e = a + d.
time.start();
for(int j=0;j<10000;j++){
// e[0] and e[1] are overwritten before every call.
// NOTE(review): presumably this keeps the compiler from treating the
// repeated calls as redundant - confirm.
e[0]=e[1]=j;
add_float_neon2(e, a, d,ARRAY_SIZE);
}
time.time_display("inline asm ");
printf("e[1]=%d\n",e[1]);
printf("e[300]=%d\n",e[300]);
printf("e[600]=%d\n",e[600]);
printf("e[1023]=%d\n",e[1023]);
return 0;
}
// Element-wise 32-bit integer addition: dst[k] = src1[k] + src2[k] for
// k = 0 .. count-1. (Despite the name, the data are ints and the NEON
// instruction used is vadd.s32.)
//
// dst   - output array with room for at least count ints
// src1  - first input array, at least count ints
// src2  - second input array, at least count ints
// count - number of elements; nothing is written when count <= 0
//
// On 32-bit ARM with NEON the bulk is processed four lanes at a time by the
// inline-assembly loop; any remaining 1-3 elements, and the whole array on
// other targets, are handled by the scalar loop at the end.
void add_float_neon2(int* dst, int* src1, int* src2, int count)
{
    // The assembly loop is do-while shaped and would touch four elements even
    // for an empty request, so reject that up front.
    if (count <= 0)
        return;
#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    // Largest multiple of four not exceeding count: the part the q-register
    // loop can process without running past the end of the arrays.
    int vec_count = count & ~3;
    if (vec_count > 0) {
        asm volatile (
            ".align 4\t\n"
            "1:\t\n"
            "vld1.32 {q0}, [%[src1]]!\t\n"
            "vld1.32 {q1}, [%[src2]]!\t\n"
            "vadd.s32 q0, q0, q1\t\n"
            "subs %[count], %[count],#4\t\n"
            "vst1.32 {q0}, [%[dst]]!\t\n"
            "bgt 1b\t\n"
            // Every register operand is modified (post-increment writeback on
            // the pointers, subs on the counter), so all of them must be
            // read-write outputs rather than input-only operands.
            : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2), [count] "+r" (vec_count)
            :
            // subs updates the condition flags, hence the "cc" clobber.
            : "memory", "cc", "q0", "q1"
        );
    }
    // The pointers now sit just past the vectorised part; only the tail is left.
    count &= 3;
#endif
    for (int i = 0; i < count; i++)
        dst[i] = src1[i] + src2[i];
}
Android.mk
# ndk-build module description: compiles add.s and speed.cpp into an
# executable module named "t".
LOCAL_PATH:= $(call my-dir)
include $(CLEAR_VARS)
# NOTE(review): presumably makes ndk-build place the final binary next to this
# file - confirm against the NDK build-system documentation.
NDK_APP_DST_DIR := $(LOCAL_PATH)
# Pulls in the OpenCV Android SDK build settings via an absolute,
# machine-specific path.
include /home/archermind/OpenCV-android-sdk/sdk/native/jni/OpenCV.mk
LOCAL_SRC_FILES := add.s speed.cpp
# Alternative flag set without optimisation (-O0), left disabled.
#LOCAL_CFLAGS := -D__cpusplus -Wall -O0 -g -mfloat-abi=softfp -mfpu=neon -march=armv7-a -mtune=cortex-a7
# Active flag set: -O3 with NEON enabled.
# NOTE(review): "__cpusplus" looks like a misspelling of "__cplusplus"; nothing
# in the listed sources appears to reference it - confirm whether it is needed.
LOCAL_CFLAGS := -D__cpusplus -O3 -g -mfloat-abi=softfp -mfpu=neon -march=armv7-a -mtune=cortex-a53
# NOTE(review): TARGET_ARCH_ABI is normally provided by ndk-build itself (the
# ABI is usually selected with APP_ABI) - confirm this assignment has an effect.
TARGET_ARCH_ABI :=armeabi-v7a
LOCAL_ARM_MODE := arm
LOCAL_ARM_NEON := true
LOCAL_MODULE := t
include $(BUILD_EXECUTABLE)
amt6757_wifi_n:/data # ./t
Running time ( c ) is: 1.58710 Seconds.
c[1]=2
c[300]=600
c[600]=1200
c[1023]=2046
Running time ( asm ) is: 0.27039 Seconds.
e[1023]=2047
Running time (inline asm ) is: 0.26590 Seconds.
两种汇编的速率差不多，它们都大约是C语言的5.9倍
在mtk6582上测试为6.2倍
代码链接:http://pan.baidu.com/s/1mieMKrQ
#include "opencv2/highgui/highgui.hpp"
#include <chrono>
#include <arm_neon.h>
#include "opencv2/imgproc/imgproc.hpp"
#define ARRAY_SIZE (10240)
extern "C" void add_mine(int* dst, int* src1, int* src2, int count);
void add_float_neon2(int* dst, int* src1, int* src2, int count);
// Small stopwatch built on std::chrono::steady_clock.
// start() stores a reference instant; stop() reports the seconds elapsed
// since that instant without modifying it, so stop() may be called repeatedly.
class q_timer {
    using clock_type = std::chrono::steady_clock;
    using seconds_type = std::chrono::duration<double>;

    // Reference instant written by start().
    clock_type::time_point m_start;

public:
    // Stores the current steady-clock time as the reference instant.
    void start()
    {
        m_start = clock_type::now();
    }

    // Returns the time elapsed since the reference instant, in seconds.
    double stop()
    {
        const clock_type::time_point finish = clock_type::now();
        const seconds_type elapsed = std::chrono::duration_cast<seconds_type>(finish - m_start);
        return elapsed.count();
    }

    // Prints the elapsed seconds divided by nr_frame, labelled with disp.
    void time_display(const char *disp = "", int nr_frame = 1)
    {
        const double seconds_per_frame = stop() / nr_frame;
        printf("Running time (%s) is: %5.5f Seconds.\n", disp, seconds_per_frame);
    }

    // Prints nr_frame divided by the elapsed seconds, labelled with disp.
    void fps_display(const char *disp = "", int nr_frame = 1)
    {
        const double frames_per_second = (double)nr_frame / stop();
        printf("Running time (%s) is: %5.5f frame per second.\n", disp, frames_per_second);
    }
};
// Scalar baseline: writes src1[k] + src2[k] into dst[k] for k = 0 .. count-1,
// in ascending order. Nothing is written when count is zero or negative.
void add_int_c(int* dst, int* src1, int* src2, int count)
{
    for (int remaining = count; remaining > 0; --remaining)
        *dst++ = *src1++ + *src2++;
}
using namespace cv;
// Benchmark driver: runs each of the three adders 10000 times over
// ARRAY_SIZE-element int arrays, printing the elapsed time and a few sample
// results after each run.
int main()
{
q_timer time;
// Zero-initialised working buffers: a, b and d are inputs; c and e receive results.
int a[ARRAY_SIZE]={0};
int b[ARRAY_SIZE]={0};
int c[ARRAY_SIZE]={0};
int d[ARRAY_SIZE]={0};
int e[ARRAY_SIZE]={0};
// Fill the inputs so that a[i] + b[i] == 2*i and a[i] + d[i] == 2*i + 1.
for(int i=0;i<ARRAY_SIZE;i++)
{
a[i]=i;b[i]=i;d[i]=i+1;
}
// Run 1: plain C++ loop (add_int_c), c = a + b.
time.start();
for(int i=0;i<10000;i++)
add_int_c(c, a, b,ARRAY_SIZE);
// time_display is called with its default nr_frame of 1, so the printed
// figure is the total for all 10000 passes, not a per-pass time.
time.time_display(" c ");
printf("c[1]=%d\n",c[1]);
printf("c[300]=%d\n",c[300]);
printf("c[600]=%d\n",c[600]);
printf("c[1023]=%d\n",c[1023]);
// Run 2: add_mine, declared extern "C" and defined outside this file, e = a + d.
time.start();
for(int j=0;j<10000;j++)
add_mine(e, a, d,ARRAY_SIZE);
time.time_display(" asm ");
printf("e[1023]=%d\n",e[1023]);
// Run 3: inline-assembly routine add_float_neon2, e = a + d.
time.start();
for(int j=0;j<10000;j++){
// e[0] and e[1] are overwritten before every call.
// NOTE(review): presumably this keeps the compiler from treating the
// repeated calls as redundant - confirm.
e[0]=e[1]=j;
add_float_neon2(e, a, d,ARRAY_SIZE);
}
time.time_display("inline asm ");
printf("e[1]=%d\n",e[1]);
printf("e[300]=%d\n",e[300]);
printf("e[600]=%d\n",e[600]);
printf("e[1023]=%d\n",e[1023]);
return 0;
}
// Element-wise 32-bit integer addition: dst[k] = src1[k] + src2[k] for
// k = 0 .. count-1. (Despite the name, the data are ints and the NEON
// instruction used is vadd.s32.)
//
// dst   - output array with room for at least count ints
// src1  - first input array, at least count ints
// src2  - second input array, at least count ints
// count - number of elements; nothing is written when count <= 0
//
// On 32-bit ARM with NEON the bulk is processed four lanes at a time by the
// inline-assembly loop; any remaining 1-3 elements, and the whole array on
// other targets, are handled by the scalar loop at the end.
void add_float_neon2(int* dst, int* src1, int* src2, int count)
{
    // The assembly loop is do-while shaped and would touch four elements even
    // for an empty request, so reject that up front.
    if (count <= 0)
        return;
#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    // Largest multiple of four not exceeding count: the part the q-register
    // loop can process without running past the end of the arrays.
    int vec_count = count & ~3;
    if (vec_count > 0) {
        asm volatile (
            ".align 4\t\n"
            "1:\t\n"
            "vld1.32 {q0}, [%[src1]]!\t\n"
            "vld1.32 {q1}, [%[src2]]!\t\n"
            "vadd.s32 q0, q0, q1\t\n"
            "subs %[count], %[count],#4\t\n"
            "vst1.32 {q0}, [%[dst]]!\t\n"
            "bgt 1b\t\n"
            // Every register operand is modified (post-increment writeback on
            // the pointers, subs on the counter), so all of them must be
            // read-write outputs rather than input-only operands.
            : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2), [count] "+r" (vec_count)
            :
            // subs updates the condition flags, hence the "cc" clobber.
            : "memory", "cc", "q0", "q1"
        );
    }
    // The pointers now sit just past the vectorised part; only the tail is left.
    count &= 3;
#endif
    for (int i = 0; i < count; i++)
        dst[i] = src1[i] + src2[i];
}
Android.mk
# ndk-build module description: compiles add.s and speed.cpp into an
# executable module named "t".
LOCAL_PATH:= $(call my-dir)
include $(CLEAR_VARS)
# NOTE(review): presumably makes ndk-build place the final binary next to this
# file - confirm against the NDK build-system documentation.
NDK_APP_DST_DIR := $(LOCAL_PATH)
# Pulls in the OpenCV Android SDK build settings via an absolute,
# machine-specific path.
include /home/archermind/OpenCV-android-sdk/sdk/native/jni/OpenCV.mk
LOCAL_SRC_FILES := add.s speed.cpp
# Alternative flag set without optimisation (-O0), left disabled.
#LOCAL_CFLAGS := -D__cpusplus -Wall -O0 -g -mfloat-abi=softfp -mfpu=neon -march=armv7-a -mtune=cortex-a7
# Active flag set: -O3 with NEON enabled.
# NOTE(review): "__cpusplus" looks like a misspelling of "__cplusplus"; nothing
# in the listed sources appears to reference it - confirm whether it is needed.
LOCAL_CFLAGS := -D__cpusplus -O3 -g -mfloat-abi=softfp -mfpu=neon -march=armv7-a -mtune=cortex-a53
# NOTE(review): TARGET_ARCH_ABI is normally provided by ndk-build itself (the
# ABI is usually selected with APP_ABI) - confirm this assignment has an effect.
TARGET_ARCH_ABI :=armeabi-v7a
LOCAL_ARM_MODE := arm
LOCAL_ARM_NEON := true
LOCAL_MODULE := t
include $(BUILD_EXECUTABLE)
amt6757_wifi_n:/data # ./t
Running time ( c ) is: 1.58710 Seconds.
c[1]=2
c[300]=600
c[600]=1200
c[1023]=2046
Running time ( asm ) is: 0.27039 Seconds.
e[1023]=2047
Running time (inline asm ) is: 0.26590 Seconds.
两种汇编的速率差不多，它们都大约是C语言的5.9倍
在mtk6582上测试为6.2倍
代码链接:http://pan.baidu.com/s/1mieMKrQ
相关文章推荐
- 文件拷贝过程中使用文件流、缓冲流、转换流以及速率比较
- 详解keil采用C语言模块化编程时全局变量、结构体的定义、声明以及头文件包含的处理方法!
- open/read/write/close等文件系统调用接口以及fd与FILE的比较
- C语言对文件的读写操作以及处理CSV文件的方法
- 关于c语言不同文件之间直接函数接口引用的简介
- java计算文件大小三种方法以及优缺点比较
- 关于cout,wcout输出的测试,以及printf,wprintf 输出中文,内存中直接输出图像给网页问题
- C语言之include以及多文件团队开发
- C语言一个文件中的函数能直接调用另外一个文件中的静态函数吗? (某公司校园招聘面试试题)
- PAIP.测试硬盘的成色以及速率
- C语言读取配置文件以及128字节对齐.bin配置文件
- 参考:c语言中float为什么不能直接用==与0比较
- C语言把文件读入字符串以及将字符串写入文件
- Java通用的Excel文件生成工具类,支持生成文件和浏览器直接下载(未测试)
- 转:几种基于HTTP协议的RPC性能比较,以及ICE接口性能测试比较
- C语言笔试题精选1---求两个数之间较大的数,不使用if、while、switch、for、?:/以及任何比较语句
- eval执行效率测试 ——直接调用方法传递参数和通过eval方法调用并传递参数 耗时比较
- Remoting、Reflection以及本地调用的性能测试比较
- ID 比较练习 C语言 文件操作 位置指针
- shell编程:引用变量、内部变量、条件测试、字符串比较、整数比较、文件比较、逻辑操作、引号、通配符、调试执行