您的位置:首页 > 编程语言 > C语言/C++

在安卓机上测试内联汇编和直接写成.s文件以及C语言的速率比较

2017-11-14 12:15 417 查看
#include "opencv2/core/core.hpp"

#include "opencv2/highgui/highgui.hpp"

#include <chrono>

#include <arm_neon.h>

#include "opencv2/imgproc/imgproc.hpp"

// Number of int elements in each benchmark array declared in main(); the same
// value is passed as the element count to every add routine.
#define ARRAY_SIZE (10240)

// External routine with C linkage; it is not defined in this file and is called
// from main() as add_mine(dst, src1, src2, count).
// NOTE(review): presumably the hand-written assembly in add.s (listed in
// Android.mk's LOCAL_SRC_FILES) -- confirm.
extern "C" void add_mine(int* dst, int* src1, int* src2, int count);

// Inline-assembly NEON add of two int arrays; defined after main().
void add_float_neon2(int* dst, int* src1, int* src2, int count);

// Minimal stopwatch built on std::chrono::steady_clock.
//
// Call start() to record a reference point; stop() and the *_display helpers
// report the time elapsed since the most recent start(). The timer is not
// reset by stop(), so several readings may be taken from one start().
class q_timer {
  // Monotonic clock used for every measurement.
  using clock_type = std::chrono::steady_clock;

public:
  // Records the current instant as the beginning of the measured interval.
  void start()
  {
    m_start = clock_type::now();
  }

  // Returns the seconds elapsed since the last start() as a double.
  double stop()
  {
    const clock_type::time_point finish = clock_type::now();
    const std::chrono::duration<double> elapsed = finish - m_start;
    return elapsed.count();
  }

  // Prints the elapsed seconds divided by nr_frame, labelled with disp.
  void time_display(const char *disp = "", int nr_frame = 1)
  {
    const double seconds_per_frame = stop() / nr_frame;
    printf("Running time (%s) is: %5.5f Seconds.\n", disp, seconds_per_frame);
  }

  // Prints nr_frame divided by the elapsed seconds, labelled with disp.
  void fps_display(const char *disp = "", int nr_frame = 1)
  {
    const double frames_per_second = static_cast<double>(nr_frame) / stop();
    printf("Running time (%s) is: %5.5f frame per second.\n", disp, frames_per_second);
  }

private:
  // Instant captured by the most recent start().
  clock_type::time_point m_start;
};

// Plain C++ reference implementation: writes src1[i] + src2[i] into dst[i]
// for i in [0, count), in ascending order. A count of zero or less does nothing.
void add_int_c(int* dst, int* src1, int* src2, int count)
{
  int* out = dst;
  const int* lhs = src1;
  const int* rhs = src2;

  // Walk the three arrays in lock-step, one element per iteration.
  for (int remaining = count; remaining > 0; --remaining)
    *out++ = *lhs++ + *rhs++;
}

using namespace cv;

int main()

{
q_timer time;

int a[ARRAY_SIZE]={0};
int b[ARRAY_SIZE]={0};
int c[ARRAY_SIZE]={0};
int d[ARRAY_SIZE]={0};
int e[ARRAY_SIZE]={0};

for(int i=0;i<ARRAY_SIZE;i++)
{
 a[i]=i;b[i]=i;d[i]=i+1;
}

time.start();
for(int i=0;i<10000;i++)
add_int_c(c, a, b,ARRAY_SIZE);
time.time_display(" c ");    
printf("c[1]=%d\n",c[1]);
printf("c[300]=%d\n",c[300]);
printf("c[600]=%d\n",c[600]);
printf("c[1023]=%d\n",c[1023]);

time.start();
for(int j=0;j<10000;j++)
add_mine(e, a, d,ARRAY_SIZE);
time.time_display(" asm ");
printf("e[1023]=%d\n",e[1023]);

time.start();
for(int j=0;j<10000;j++){
e[0]=e[1]=j;
add_float_neon2(e, a, d,ARRAY_SIZE);
}

time.time_display("inline asm ");

printf("e[1]=%d\n",e[1]);
printf("e[300]=%d\n",e[300]);
printf("e[600]=%d\n",e[600]);
printf("e[1023]=%d\n",e[1023]);
return 0;

}

// Element-wise 32-bit integer add: dst[i] = src1[i] + src2[i] for i in [0, count).
//
// On 32-bit ARM builds with NEON enabled, whole groups of four elements are
// processed by an inline-assembly loop (one q-register add per iteration) and
// any remaining 1-3 elements are handled by a scalar loop. On every other
// target the scalar loop handles the whole range, so the function stays
// portable. A count of zero or less leaves dst untouched.
//
// @param dst    output array, at least count elements
// @param src1   first input array, at least count elements
// @param src2   second input array, at least count elements
// @param count  number of elements to add; need not be a multiple of 4
void add_float_neon2(int* dst, int* src1, int* src2, int count)
{
  if (count <= 0)
    return;

  int remaining = count;

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
  // Only whole 4-element groups go through the vector loop, so it can never
  // read or write beyond the caller's count.
  int vec_count = count & ~3;
  remaining = count - vec_count;

  if (vec_count > 0) {
    asm volatile (
      ".align 4\t\n"
      "1:\t\n"
      "vld1.32         {q0}, [%[src1]]!\t\n"
      "vld1.32         {q1}, [%[src2]]!\t\n"
      "vadd.s32        q0, q0, q1\t\n"
      "subs            %[count],  %[count],#4\t\n"
      "vst1.32         {q0}, [%[dst]]!\t\n"
      "bgt             1b\t\n"
      // Every operand is modified by the loop (post-increment / subs), so all
      // of them must be read-write; input-only operands may not be changed.
      : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
        [count] "+r" (vec_count)
      :
      // "cc" because subs updates the condition flags.
      : "memory", "cc", "q0", "q1"
    );
  }
  // dst/src1/src2 now point just past the vectorised part.
#endif

  // Scalar tail (or the whole range on non-NEON targets). Unsigned arithmetic
  // gives the same wrap-around result as vadd.s32 without signed-overflow UB.
  for (int i = 0; i < remaining; i++)
    dst[i] = static_cast<int>(static_cast<unsigned int>(src1[i]) +
                              static_cast<unsigned int>(src2[i]));
}

Android.mk

# ndk-build makefile that builds the benchmark as the executable module "t".
LOCAL_PATH:= $(call my-dir)  

include $(CLEAR_VARS)  

NDK_APP_DST_DIR := $(LOCAL_PATH)

# Includes OpenCV's build settings from an absolute, machine-specific path;
# adjust it to the local OpenCV Android SDK location.
include /home/archermind/OpenCV-android-sdk/sdk/native/jni/OpenCV.mk 

# add.s is compiled together with speed.cpp.
# NOTE(review): add.s presumably provides add_mine() -- confirm.
LOCAL_SRC_FILES := add.s  speed.cpp

# Alternative unoptimised (-O0) flag set, left disabled.
#LOCAL_CFLAGS := -D__cpusplus -Wall -O0 -g -mfloat-abi=softfp -mfpu=neon -march=armv7-a -mtune=cortex-a7

# Optimised (-O3) ARMv7-A build with NEON enabled.
# NOTE(review): "-D__cpusplus" looks like a misspelling of __cplusplus (which the
# C++ compiler defines itself); as written it defines an unrelated macro -- confirm
# whether it can be dropped.
LOCAL_CFLAGS := -D__cpusplus -O3 -g -mfloat-abi=softfp -mfpu=neon  -march=armv7-a -mtune=cortex-a53

# NOTE(review): the target ABI is usually selected through APP_ABI; confirm that
# assigning TARGET_ARCH_ABI here has the intended effect.
TARGET_ARCH_ABI :=armeabi-v7a 

LOCAL_ARM_MODE := arm

LOCAL_ARM_NEON := true 

LOCAL_MODULE := t

include $(BUILD_EXECUTABLE)

amt6757_wifi_n:/data # ./t

Running time ( c ) is: 1.58710 Seconds.

c[1]=2

c[300]=600

c[600]=1200

c[1023]=2046

Running time ( asm ) is: 0.27039 Seconds.

e[1023]=2047

Running time (inline asm ) is: 0.26590 Seconds.

两种汇编实现(独立的 .s 文件与内联汇编)的速度相差不大,它们都大约是 C 语言实现的 5.9 倍

在 MTK6582 上测试,加速比约为 6.2 倍

代码链接:http://pan.baidu.com/s/1mieMKrQ
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
相关文章推荐