ARM Neon 例子 - Vector Add
2015-04-02 14:01
309 查看
/////////////////////////////////////////////////////////////////// // ARM Neon /* * FIR 滤波器 * y(n)=h(0)x(n) + h(1)x(n-1) + h(2)x(n-2) + ... h(N-1)x(n-N-1) * */ void TaskManger::fir_filter_c(short * y, const short *x, const short *h, int n_out, int n_coefs) { int n; for (n = 0; n < n_out; n++) { int k, sum = 0; for(k = 0; k < n_coefs; k++) { sum += h[k] * x[n - n_coefs + 1 + k]; } y = ((sum>>15) + 1) >> 1; } } void fir_filter_neon(short * y, const short *x, const short *h, int n_out, int n_coefs) { int n, k; int sum; int16x4_t h_vec; int16x4_t x_vec; int32x4_t result_vec; for (n = 0; n < n_out; n++) { /* Clear the scalar and vector sums */ sum = 0; result_vec = vdupq_n_s32(0); /* vdup -> duplicates a scalar into every element of the destination vector */ for(k = 0; k < n_coefs / 4; k++) { /* Four vector multiply-accumulate operations in parallel */ h_vec = vld1_s16(&h[k*4]); x_vec = vld1_s16(&x[n - n_coefs + 1 + k*4]); result_vec = vmlal_s16(result_vec, h_vec, x_vec); } /* Reduction operation - add each vector lane result to the sum */ sum += vgetq_lane_s32(result_vec, 0); sum += vgetq_lane_s32(result_vec, 1); sum += vgetq_lane_s32(result_vec, 2); sum += vgetq_lane_s32(result_vec, 3); /* consume the last few data using scalar operations */ if(n_coefs % 4) { for(k = n_coefs - (n_coefs % 4); k < n_coefs; k++) sum += h[k] * x[n - n_coefs + 1 + k]; } /* Store the adjusted result */ y = ((sum>>15) + 1) >> 1; } } /* * RGB转灰度图像 */ void reference_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n) { int i; for (i=0; i<n; i++) { int r = *src++; // load red int g = *src++; // load green int b = *src++; // load blue // build weighted average: int y = (r*77)+(g*151)+(b*28); // undo the scale by 256 and write to memory: *dest++ = (y>>8); } } void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n) { int i; uint8x8_t rfac = vdup_n_u8 (77); uint8x8_t gfac = vdup_n_u8 (151); uint8x8_t bfac = vdup_n_u8 (28); n /= 8; for (i=0; i<n; i++) { 
uint16x8_t temp; uint8x8x3_t rgb = vld3_u8 (src); // vld3 ->loads 3 vectors from memory. 就是 uint8x8_t val[3] uint8x8_t result; temp = vmull_u8 (rgb.val[0], rfac); // 长指令相乘 temp = vmlal_u8 (temp,rgb.val[1], gfac); temp = vmlal_u8 (temp,rgb.val[2], bfac); result = vshrn_n_u16 (temp, 8); // vshrn -> ri = ai >> b; 移位指针 vst1_u8 (dest, result); // vst1 -> stores a vector into memory src += 8*3; dest += 8; } } /* * Vector add */ uint32_t TaskManger::vector_add_c(uint32_t *s, uint32_t n) {// assert n % 4 == 0 uint32_t sum = 0; for(uint32_t i = 0; i < n; i++) sum += *s++; return sum; } uint32_t TaskManger::vector_add_neon(uint32_t *s, uint32_t n) {// assert n % 4 == 0 uint32_t sum, *i; uint32x2_t vec64a, vec64b; uint32x4_t vec128 = vdupq_n_u32(0); for (i = s; i < (s + n); i += 4) { uint32x4_t temp128 = vld1q_u32(i); // 从内存加载一个向量vector, vld1 -> loads a vector from memory vec128 = vaddq_u32(vec128, temp128); // 向量相加uint32x4_t相加,一次加4个uint32_t, vector add, vadd -> ri = ai + bi } vec64a = vget_low_u32(vec128); // returns the lower half of the 128-bit input vector. vec64b = vget_high_u32(vec128); // returns the higher half of the 128-bit input vector. vec64a = vadd_u32 (vec64a, vec64b); // 再将 uint32x2_t 向量相加 sum = vget_lane_u32(vec64a, 0); // returns the value from the specified lane of a vector. 
sum += vget_lane_u32(vec64a, 1); return sum; } void TaskManger::neon_demo() { //1, demo 1 #define ELE 65536 uint32_t arry[ELE]; struct timeval tpstart,tpend; float timeuse; printf("run neon demo 1, vector add\n"); srand((unsigned int) time(NULL)); for(int i = 0; i < ELE; i++) { arry[i] = rand() % 500; } // c/c++ gettimeofday(&tpstart,NULL); uint32_t result2 = vector_add_c(arry, ARRAY_SIZE(arry)); gettimeofday(&tpend,NULL); timeuse = 1000000*(tpend.tv_sec-tpstart.tv_sec)+ tpend.tv_usec-tpstart.tv_usec; printf("generator loop c add result:%d, used time:%f ms\n\n\n", result2, timeuse); // neon gettimeofday(&tpstart,NULL); uint32_t result1 = vector_add_neon(arry, ARRAY_SIZE(arry)); gettimeofday(&tpend,NULL); timeuse = 1000000*(tpend.tv_sec-tpstart.tv_sec)+ tpend.tv_usec-tpstart.tv_usec; printf("neon vector add result:%d, used time:%f ms\n\n\n", result1, timeuse); }
相关文章推荐
- ARM NEON 指令
- ARM NEON 优化
- -03-实时Prewitt边缘检测,第一步:硬件平台搭建【ARM NEON加速】
- ARM和NEON指令 very nice
- 利用ndk交叉编译x264到arm平台(带neon版本)
- 【翻译】(8)CPU ARM Neon
- 手撕Arm NEON 汇编 (引入)
- NEON arm 社区
- Android NDK编译带ARM neon优化的库
- -04-实时Prewitt边缘检测,第二步:Prewitt的NEON加速实现【ARM NEON加速】
- ARM和NEON指令
- 【翻译】(8)CPU ARM Neon
- ARM-NEON
- ARM NEON 编程系列7——NEON gcc编译器intrinsics函数对应的汇编指令
- 【优化】ARM-NEON优化介绍
- [ARM]【编译】【实践】 - 浮点编译选项NEON引发的Skia的库Illegal instruction运行错误和解决办法
- NEON----ARM通用 SIMD 引擎
- arm neon RGB转Gray的例子
- ARM NEON 编程系列8——ARM NEON 优化
- FFmpeg H.265解码器亮度四分之一像素插值ARM NEON代码阅读