您的位置:首页 > 编程语言 > C语言/C++

ICL Auto Vectorization

2015-11-22 12:31 441 查看

简介

此文简单介绍如何使用intel c++编译器实现向量化加速。

全文如下安排:

base : 待优化的源代码。

vectorization : 第一个向量化版本。

aligned : 内存对齐对向量化的影响。

base

base版本代码:

// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;

int64_t cpu_freq;
int64_t cpu_counter(){
int64_t clock;
QueryPerformanceCounter((LARGE_INTEGER*)&clock);
return clock;
}

// Wall-clock timing helpers.
//   TB__ records the start tick.
//   TE__ records the end tick and prints "<line> : <elapsed> seconds", where
//   <line> is the source line of the TE__ use and <elapsed> is the tick
//   difference divided by cpu_freq.
// Change `#if 1` to `#if 0` to compile both macros away to nothing.
#if 1
int64_t gloabel_timer_begin;
int64_t gloabel_timer_end;
#define TB__ gloabel_timer_begin=cpu_counter()
// Wrapped in do { } while (0) so that `TE__;` is one statement and stays
// correct when used under an unbraced if/else or loop.
#define TE__ do { \
    gloabel_timer_end = cpu_counter(); \
    cout << __LINE__ << " : " << double(gloabel_timer_end-gloabel_timer_begin)/double(cpu_freq) << " seconds" << endl; \
} while (0)
#else
#define TB__
#define TE__
#endif

// Number of times main() repeats the multiply() call inside the timed region.
#define REPEATTIMES 100000

// Fills the logical rows x cols region of a row-major buffer with
// pseudo-random values float(rand())/float(RAND_MAX).
//   data      : destination buffer; consecutive rows start true_cols floats apart
//   rows/cols : extent of the region that is written
//   true_cols : row pitch in floats; elements past `cols` in a row are not touched
void init(float *data, int rows, int cols, int true_cols){
    for (int r = 0; r < rows; ++r){
        float *row = data + r*true_cols;
        for (int c = 0; c < cols; ++c){
            row[c] = float(rand())/float(RAND_MAX);
        }
    }
}

// Matrix-vector product C = A * B, where A is a rows x cols row-major matrix
// whose rows are true_cols floats apart. The definition is in multiply.cpp.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);

// Prints, on its own line of standard output, the sum of the logical
// rows x cols region of a row-major buffer whose rows are true_cols floats
// apart. Elements are added in row-major order into a float accumulator.
void print_sum(float *data, int rows, int cols, int true_cols){
    float sum = 0;
    for (int r = 0; r < rows; ++r){
        const float *row = data + r*true_cols;
        for (int c = 0; c < cols; ++c){
            sum += row[c];
        }
    }
    cout << sum << endl;
}

int main(){
QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);

int rows = 100;
int cols = 101;

int true_cols = cols;
float *A = (float*)malloc(rows*true_cols*sizeof(float));
float *B = (float*)malloc(rows*sizeof(float));
float *C = (float*)malloc(rows*sizeof(float));

init(A, rows, cols, true_cols);
init(B, rows, 1, 1);

// computing
TB__;
for (int k = 0; k < REPEATTIMES; k++){
multiply(C, A, B, rows, cols, true_cols);
}
TE__;

// print result.
print_sum(C, rows, 1, 1);

free(A); A = NULL;
free(B); B = NULL;
free(C); C = NULL;

return 0;
}


// filename : multiply.cpp
// Matrix-vector product C = A * B.
//   C         : output, rows elements; C[i] is overwritten with the dot
//               product of row i of A and B
//   A         : rows x cols matrix, row-major, consecutive rows true_cols
//               floats apart
//   B         : input vector, cols elements
//   true_cols : row pitch of A in floats
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols){
    for (int i = 0; i < rows; i++){
        const float *row = A + i*true_cols;
        // Accumulate in a local and store once per row, so the inner loop is
        // a plain reduction with no store to C[i] on every iteration.
        float dot = 0;
        for (int j = 0; j < cols; j++){
            dot += row[j]*B[j];
        }
        C[i] = dot;
    }
}


编译:

user@machine> icl /O1 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp


执行:

user@machine> main.exe
73 : 0.877882 seconds
2483.53


vectorization

源代码保持不变

编译:

user@machine> icl /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp


执行:

user@machine> main.exe
73 : 0.205989 seconds
2483.53


执行速度提升了 4倍左右。

aligned

源代码修改。(注意:下面的代码有问题,结果可能有错误,原因可能是内存的问题。)

// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;

int64_t cpu_freq;
int64_t cpu_counter(){
int64_t clock;
QueryPerformanceCounter((LARGE_INTEGER*)&clock);
return clock;
}

// Wall-clock timing helpers.
//   TB__ records the start tick.
//   TE__ records the end tick and prints "<line> : <elapsed> seconds", where
//   <line> is the source line of the TE__ use and <elapsed> is the tick
//   difference divided by cpu_freq.
// Change `#if 1` to `#if 0` to compile both macros away to nothing.
#if 1
int64_t gloabel_timer_begin;
int64_t gloabel_timer_end;
#define TB__ gloabel_timer_begin=cpu_counter()
// Wrapped in do { } while (0) so that `TE__;` is one statement and stays
// correct when used under an unbraced if/else or loop.
#define TE__ do { \
    gloabel_timer_end = cpu_counter(); \
    cout << __LINE__ << " : " << double(gloabel_timer_end-gloabel_timer_begin)/double(cpu_freq) << " seconds" << endl; \
} while (0)
#else
#define TB__
#define TE__
#endif

// Number of times main() repeats the multiply() call inside the timed region.
#define REPEATTIMES 100000

// Fills the logical rows x cols region of a row-major buffer with
// pseudo-random values float(rand())/float(RAND_MAX).
//   data      : destination buffer; consecutive rows start true_cols floats apart
//   rows/cols : extent of the region that is written
//   true_cols : row pitch in floats; elements past `cols` in a row are not touched
void init(float *data, int rows, int cols, int true_cols){
    for (int r = 0; r < rows; ++r){
        float *row = data + r*true_cols;
        for (int c = 0; c < cols; ++c){
            row[c] = float(rand())/float(RAND_MAX);
        }
    }
}

// Matrix-vector product C = A * B, where A is a rows x cols row-major matrix
// whose rows are true_cols floats apart. The definition is in multiply.cpp.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);

// Prints, on its own line of standard output, the sum of the logical
// rows x cols region of a row-major buffer whose rows are true_cols floats
// apart. Elements are added in row-major order into a float accumulator.
void print_sum(float *data, int rows, int cols, int true_cols){
    float sum = 0;
    for (int r = 0; r < rows; ++r){
        const float *row = data + r*true_cols;
        for (int c = 0; c < cols; ++c){
            sum += row[c];
        }
    }
    cout << sum << endl;
}

int main(){
QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);

int rows = 100;
int cols = 101;

#ifdef ALIGNED
#define ALLIGNED_LEN 32
int true_cols = ((((cols*sizeof(float))+ALLIGNED_LEN-1)/ALLIGNED_LEN)*ALLIGNED_LEN)/sizeof(float);
//cout << true_cols << endl;
float *A = (float*)_aligned_malloc(rows*true_cols*sizeof(float), ALLIGNED_LEN);
float *B = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);
float *C = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);
#else
int true_cols = cols;
float *A = (float*)malloc(rows*true_cols*sizeof(float));
float *B = (float*)malloc(rows*sizeof(float));
float *C = (float*)malloc(rows*sizeof(float));
#endif

init(A, rows, cols, true_cols);
init(B, rows, 1, 1);

// computing
TB__;
for (int k = 0; k < REPEATTIMES; k++){
multiply(C, A, B, rows, cols, true_cols);
}
TE__;

// print result.
print_sum(C, rows, 1, 1);

#ifdef ALIGNED
_aligned_free(A); A = NULL;
_aligned_free(B); B = NULL;
_aligned_free(C); C = NULL;
#else
free(A); A = NULL;
free(B); B = NULL;
free(C); C = NULL;
#endif

return 0;
}


// filename : multiply.cpp
// Matrix-vector product C = A * B.
//   C         : output, rows elements; C[i] is overwritten with the dot
//               product of row i of A and B
//   A         : rows x cols matrix, row-major, consecutive rows true_cols
//               floats apart
//   B         : input vector, cols elements
//   true_cols : row pitch of A in floats
// When ALIGNED is defined the inner loop carries `#pragma vector aligned`, so
// the caller must pass buffers and a row pitch that satisfy that alignment
// promise.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols){
    for (int i = 0; i < rows; i++){
        const float *row = A + i*true_cols;
        // Accumulate in a local and store once per row, so the inner loop is
        // a plain reduction with no store to C[i] on every iteration.
        float dot = 0;
#ifdef ALIGNED
#pragma vector aligned
#endif
        for (int j = 0; j < cols; j++){
            dot += row[j]*B[j];
        }
        C[i] = dot;
    }
}


编译:

user@machine> icl /DALIGNED /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp


执行:

82 : 0.17747 seconds
2483.53


相对第一个优化的版本又提升了一点速度。

结论

vectorization版本:不需要改变源代码,通过修改编译器选项直接实现向量化。

aligned版本:需要修改代码,使得内存对齐,可以进一步提升性能。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息