Added NEON implementation of shift_addfast_cc.

This commit is contained in:
ha7ilm 2015-11-29 19:05:28 +00:00
parent 4697928c38
commit d24164db03
4 changed files with 93 additions and 13 deletions

2
csdr.c
View file

@ -1274,8 +1274,6 @@ int main(int argc, char *argv[])
}
#endif
#define TIME_TAKEN(start,end) ((end.tv_sec-start.tv_sec)+(end.tv_nsec-start.tv_nsec)/1e9)
if(!strcmp(argv[1],"fft_benchmark"))
{
if(argc<=3) return badsyntax("need required parameters (fft_size, fft_cycles)");

View file

@ -268,30 +268,108 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
//Precomputes the rotation constants consumed by shift_addfast_cc.
//  rate: the per-sample phase step is 2*rate*PI (stored in phase_increment).
//  returns: a shift_addfast_data_t where, for i=0..3,
//           dsin[i]=sin(phase_increment*(i+1)) and dcos[i]=cos(phase_increment*(i+1)).
shift_addfast_data_t shift_addfast_init(float rate)
{
	shift_addfast_data_t output;
	//The phase step lives in the struct only, so that shift_addfast_cc can use it
	//to compute the phase it returns; no separate local copy is needed.
	output.phase_increment=2*rate*PI;
	for(int i=0;i<4;i++)
	{
		//Each table entry is computed exactly once.
		output.dsin[i]=sin(output.phase_increment*(i+1));
		output.dcos[i]=cos(output.phase_increment*(i+1));
	}
	return output;
}
#ifdef NEON_OPTS
#pragma message "Manual NEON optimizations are ON: we have a faster shift_addfast_cc now."
//NEON version of shift_addfast_cc: multiplies every complex input sample by a
//rotating unit phasor and writes the result to output, four samples per loop pass.
//  input, output: buffers of input_size complex samples (interleaved I,Q floats).
//  input_size: number of complex samples; only whole groups of four are processed
//              (input_size/4 groups, the same amount as the C fallback processes).
//  d: table prepared by shift_addfast_init (dsin/dcos/phase_increment).
//  starting_phase: phase in radians of the phasor just before the first sample.
//  returns: starting_phase+input_size*d->phase_increment, to be passed to the next call.
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{
	//input_size should be multiple of 4
	float phase=starting_phase;
	//The assembly loop tests its condition at the bottom, so it must not be entered
	//when there is not even one complete group of four samples.
	if(input_size<4) return phase+input_size*d->phase_increment;

	float cos_start[4], sin_start[4];
	float cos_phase=cos(starting_phase);
	float sin_phase=sin(starting_phase);
	for(int i=0;i<4;i++) //the same start value goes into all four lanes
	{
		cos_start[i] = cos_phase;
		sin_start[i] = sin_phase;
	}

	float* pdcos = d->dcos;
	float* pdsin = d->dsin;
	register float* pinput = (float*)input;
	//input_size counts complex samples (two floats each), so the end pointer has to be
	//computed in complexf units; it is rounded down to whole groups of four samples.
	register float* pinput_end = (float*)(input+4*(input_size/4));
	register float* poutput = (float*)output;

#define RDCOS "q0" //dcos, dsin
#define RDSIN "q1"
#define RCOSST "q2" //cos_start, sin_start
#define RSINST "q3"
#define RCOSV "q4" //cos_vals, sin_vals
#define RSINV "q5"
#define ROUTI "q6" //output_i, output_q
#define ROUTQ "q7"
#define RINPI "q8" //input_i, input_q
#define RINPQ "q9"
#define R3(x,y,z) x ", " y ", " z "\n\t"

	asm volatile( //(the quad register range is q0-q15)
		"		vld1.32	{" RDCOS "}, [%[pdcos]]\n\t"
		"		vld1.32	{" RDSIN "}, [%[pdsin]]\n\t"
		"		vld1.32	{" RCOSST "}, [%[cos_start]]\n\t"
		"		vld1.32	{" RSINST "}, [%[sin_start]]\n\t"
		//A numeric local label is used so that the symbol cannot clash with a label of
		//another asm statement, or with a second copy of this one.
		"1:		vld2.32	{" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //de-interleaving load of 4 complex samples: I parts go to RINPI, Q parts to RINPQ; "!" post-increments pinput

		//C version:
		//cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
		//sin_vals[j] = sin_start * d->dcos[j] + cos_start * d->dsin[j];
		"		vmul.f32 " R3(RCOSV, RCOSST, RDCOS) //cos_vals[j] = cos_start * d->dcos[j]
		"		vmls.f32 " R3(RCOSV, RSINST, RDSIN) //cos_vals[j] -= sin_start * d->dsin[j]
		"		vmul.f32 " R3(RSINV, RSINST, RDCOS) //sin_vals[j] = sin_start * d->dcos[j]
		"		vmla.f32 " R3(RSINV, RCOSST, RDSIN) //sin_vals[j] += cos_start * d->dsin[j]

		//C version:
		//iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
		//qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);
		"		vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output_i = cos_vals * input_i
		"		vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output_i -= sin_vals * input_q
		"		vmul.f32 " R3(ROUTQ, RSINV, RINPI) //output_q = sin_vals * input_i
		"		vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //output_q += cos_vals * input_q

		"		vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //interleaving store of the 4 results, post-incrementing poutput

		//q4 (RCOSV) is d8:d9 and q5 (RSINV) is d10:d11, so lane 3 of each vector is
		//d9[1] and d11[1] respectively.
		"		vdup.32 " RCOSST ", d9[1]\n\t" // cos_start[0-3] = cos_vals[3]
		"		vdup.32 " RSINST ", d11[1]\n\t" // sin_start[0-3] = sin_vals[3]

		"		cmp %[pinput], %[pinput_end]\n\t" //if(pinput < pinput_end) (unsigned compare)
		"		bcc 1b\n\t"	// then process the next 4 samples
	:
		[pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM
	:
		[pinput_end]"r"(pinput_end), [pdcos]"r"(pdcos), [pdsin]"r"(pdsin), [sin_start]"r"(sin_start), [cos_start]"r"(cos_start) //input operand list
	:
		"memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list: every register written above, including q3 (RSINST)
	);
	return phase+input_size*d->phase_increment;
}
#else
//Portable C version of shift_addfast_cc, compiled when NEON_OPTS is not defined.
//It rotates the input samples by a phasor whose phase advances by
//d->phase_increment per sample, handling four samples per outer iteration.
//  input, output: sample buffers accessed through the iof()/qof() helpers
//                 (presumably the I and Q components of sample k - confirm against their definition).
//  input_size: number of samples; input_size/4 groups of four are processed.
//  d: table filled by shift_addfast_init.
//  starting_phase: phase of the phasor before the first sample.
//  returns: starting_phase+input_size*d->phase_increment (see the final return below).
//NOTE(review): this listing looks like a rendered diff without +/- markers, so some
//statements appear twice (previous variant first, updated variant second).
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{
//input_size should be multiple of 4
//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
float phase=starting_phase;
float cos_start=cos(starting_phase);
float sin_start=sin(starting_phase);
float cos_vals[4], sin_vals[4];
for(int i=0;i<input_size/4; i++) //@shift_addfast_cc
{
//NOTE(review): the next two loop headers are identical apart from the trailing tag;
//the first one looks like the pre-change line.
for(int j=0;j<4;j++)
for(int j=0;j<4;j++) //@shift_addfast_cc
{
//NOTE(review): the two statements indexed with i (the outer block counter) look like
//the pre-change variant; i is not limited to 0..3, while the arrays have 4 elements.
cos_vals[i] = cos_start * d->dcos[i] - sin_start * d->dsin[i];
sin_vals[i] = sin_start * d->dcos[i] + cos_start * d->dsin[i];
//Angle-addition formulas: as dcos[j]/dsin[j] hold cos/sin of (j+1) phase steps,
//cos_vals[j]/sin_vals[j] become the phasor advanced by (j+1) steps from the start value.
cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
sin_vals[j] = sin_start * d->dcos[j] + cos_start * d->dsin[j];
}
//NOTE(review): duplicated loop header again, see the note above.
for(int j=0;j<4;j++)
for(int j=0;j<4;j++) //@shift_addfast_cc
{
//Complex multiplication of sample 4*i+j by (cos_vals[j] + i*sin_vals[j]).
iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);
//NOTE(review): the following line is a diff hunk marker, not C code; the brace that
//closes the loop above is not visible in this listing.
@ -299,11 +377,13 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
//The last phasor of this group is the start value of the next group.
cos_start = cos_vals[3];
sin_start = sin_vals[3];
}
//NOTE(review): the first return looks like the pre-change line (phase unchanged);
//the second one adds the phase accumulated over input_size samples.
return phase;
return phase+input_size*d->phase_increment;
}
#endif
#ifdef NEON_OPTS
#pragma message "We have a faster fir_decimate_cc now."
#pragma message "Manual NEON optimizations are ON: we have a faster fir_decimate_cc now."
//max help: http://community.arm.com/groups/android-community/blog/2015/03/27/arm-neon-programming-quick-reference

View file

@ -63,6 +63,8 @@ typedef struct complexf_s { float i; float q; } complexf;
//M_PI is not part of ISO C (it is a POSIX/XSI extension that strict -std=c99 hides), so we define our own:
#define PI ((float)3.14159265358979323846)
//Elapsed time in seconds (as a double) between two values that have tv_sec and
//tv_nsec members, e.g. the struct timespec filled in by clock_gettime().
//The arguments are parenthesized so that expressions such as *ptr can be passed;
//each argument is still evaluated twice, so avoid arguments with side effects.
#define TIME_TAKEN(start,end) (((end).tv_sec-(start).tv_sec)+((end).tv_nsec-(start).tv_nsec)/1e9)
//window
typedef enum window_s
{
@ -160,7 +162,7 @@ typedef struct shift_addfast_data_s
{
//Table used by shift_addfast_cc; all members are filled in by shift_addfast_init.
float dsin[4]; //dsin[i] = sin(phase_increment*(i+1)), i=0..3
float dcos[4]; //dcos[i] = cos(phase_increment*(i+1)), i=0..3
float phase_increment; //2*rate*PI; shift_addfast_cc adds input_size times this to the phase it returns
} shift_addfast_data_t;
//Builds the sine/cosine table and the phase step (2*rate*PI) for shift_addfast_cc.
shift_addfast_data_t shift_addfast_init(float rate);
//Rotates input into output using the table in d, starting from starting_phase;
//returns starting_phase+input_size*d->phase_increment, which the caller can pass
//as starting_phase of the next call. input_size should be a multiple of 4.
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);

View file

@ -78,7 +78,7 @@ int main()
clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
for(int i=0;i<T_N;i++) starting_phase = shift_addfast_cc(buf_c, outbuf_c, T_BUFSIZE, &data_addfast, starting_phase);
clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
fprintf(stderr,"shift_addition_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
fprintf(stderr,"shift_addfast_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
}