我在 ARM 平台上使用 NEON intrinsics 函数来优化矩阵所有元素的求和,参考资料是《DEN0018A_neon_programmers_guide》(ARM NEON Programmer's Guide)。
/**
 * @brief Sum every element of a 32-bit float matrix using NEON intrinsics.
 *
 * Each row is reached through cv::Mat::ptr, so the row stride is honoured and
 * matrices that are not stored contiguously (ROIs / padded rows) are summed
 * correctly. Within a row, elements are accumulated four at a time into a
 * 128-bit NEON register; the 0-3 elements left over when the width is not a
 * multiple of four are added with scalar code so nothing outside the row is
 * ever read.
 *
 * @param A Matrix to sum. NOTE(review): the data is read as single-channel
 *          32-bit float (CV_32FC1); the element type is not checked here, so
 *          callers must guarantee it.
 * @return Sum of all elements as a float; 0 when the matrix has no rows or
 *         no columns.
 */
float matSum(cv::Mat A)
{
    const int w = A.cols;
    const int h = A.rows;
    // Number of leading elements per row that can be consumed by full
    // 4-lane loads; the remaining (w - wVec) elements form the scalar tail.
    const int wVec = w - (w % 4);

    float32x4_t vec = vdupq_n_f32(0);  // four running partial sums
    float32_t tail = 0;                // sum of the per-row leftovers

    for (int j = 0; j < h; j++)
    {
        // Row pointer from OpenCV accounts for the row stride (step).
        const float32_t* row = A.ptr<float>(j);

        for (int i = 0; i < wVec; i += 4)
        {
            vec = vaddq_f32(vec, vld1q_f32(row + i));
        }
        for (int i = wVec; i < w; i++)
        {
            tail += row[i];
        }
    }

    // Horizontal reduction, done once after all rows have been accumulated:
    // fold the 4 lanes to 2, then add the remaining 2 lanes.
    const float32x2_t half = vadd_f32(vget_low_f32(vec), vget_high_f32(vec));
    float32_t result = vget_lane_f32(half, 0);
    result += vget_lane_f32(half, 1);

    return result + tail;
}
因篇幅问题不能全部显示,请点此查看更多更全内容