From d24164db03072a8c7d1eee0a56b325123fd17d9d Mon Sep 17 00:00:00 2001 From: ha7ilm Date: Sun, 29 Nov 2015 19:05:28 +0000 Subject: [PATCH] Added NEON implementation of shift_addfast_cc. --- csdr.c | 2 -- libcsdr.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++----- libcsdr.h | 4 ++- test200.c | 2 +- 4 files changed, 93 insertions(+), 13 deletions(-) diff --git a/csdr.c b/csdr.c index bd74cf2..a1330e3 100644 --- a/csdr.c +++ b/csdr.c @@ -1274,8 +1274,6 @@ int main(int argc, char *argv[]) } #endif -#define TIME_TAKEN(start,end) ((end.tv_sec-start.tv_sec)+(end.tv_nsec-start.tv_nsec)/1e9) - if(!strcmp(argv[1],"fft_benchmark")) { if(argc<=3) return badsyntax("need required parameters (fft_size, fft_cycles)"); diff --git a/libcsdr.c b/libcsdr.c index 09fe664..e3c8d33 100644 --- a/libcsdr.c +++ b/libcsdr.c @@ -268,30 +268,108 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra shift_addfast_data_t shift_addfast_init(float rate) { shift_addfast_data_t output; - float phase_increment=2*rate*PI; + output.phase_increment=2*rate*PI; for(int i=0;i<4;i++) { - output.dsin[i]=sin(phase_increment*(i+1)); - output.dcos[i]=cos(phase_increment*(i+1)); + output.dsin[i]=sin(output.phase_increment*(i+1)); + output.dcos[i]=cos(output.phase_increment*(i+1)); } return output; } +#ifdef NEON_OPTS +#pragma message "Manual NEON optimizations are ON: we have a faster shift_addfast_cc now." 
+ /* NEON shift_addfast_cc: multiplies every complex input sample by (cos+j*sin) of a phase that advances by d->phase_increment per sample, 4 samples per loop pass, and returns the phase at which the next call should start. */ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase) { //input_size should be multiple of 4 float phase=starting_phase; + float cos_start[4], sin_start[4]; + float cos_vals[4], sin_vals[4]; + for(int i=0;i<4;i++) + { + cos_start[i] = cos(starting_phase); + sin_start[i] = sin(starting_phase); + } + + float* pdcos = d->dcos; + float* pdsin = d->dsin; + register float* pinput = (float*)input; + register float* pinput_end = (float*)(input+input_size); //input_size counts complex samples (2 floats each), so the end pointer is computed in complexf units + register float* poutput = (float*)output; + + #define RDCOS "q0" //dcos, dsin + #define RDSIN "q1" + #define RCOSST "q2" //cos_start, sin_start + #define RSINST "q3" + #define RCOSV "q4" //cos_vals, sin_vals + #define RSINV "q5" + #define ROUTI "q6" //output_i, output_q + #define ROUTQ "q7" + #define RINPI "q8" //input_i, input_q + #define RINPQ "q9" + #define R3(x,y,z) x ", " y ", " z "\n\t" + + asm volatile( //(the range of q registers is q0-q15) + " vld1.32 {" RDCOS "}, [%[pdcos]]\n\t" + " vld1.32 {" RDSIN "}, [%[pdsin]]\n\t" + " vld1.32 {" RCOSST "}, [%[cos_start]]\n\t" + " vld1.32 {" RSINST "}, [%[sin_start]]\n\t" + "for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load RINPI and RINPQ (q8 and q9) directly from the memory address stored in pinput, with interleaving (so that we get the I samples in rinpi and the Q samples in rinpq), also increment the memory address in pinput (hence the "!"
mark) + + //C version: + //cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j]; + //sin_vals[j] = sin_start * d->dcos[j] + cos_start * d->dsin[j]; + + " vmul.f32 " R3(RCOSV, RCOSST, RDCOS) //cos_vals[i] = cos_start * d->dcos[i] + " vmls.f32 " R3(RCOSV, RSINST, RDSIN) //cos_vals[i] -= sin_start * d->dsin[i] + " vmul.f32 " R3(RSINV, RSINST, RDCOS) //sin_vals[i] = sin_start * d->dcos[i] + " vmla.f32 " R3(RSINV, RCOSST, RDSIN) //sin_vals[i] += cos_start * d->dsin[i] + + //C version: + //iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j); + //qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j); + " vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output_i = cos_vals * input_i + " vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output_i -= sin_vals * input_q + " vmul.f32 " R3(ROUTQ, RSINV, RINPI) //output_q = sin_vals * input_i + " vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //output_q += cos_vals * input_q + + " vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory + + " vdup.32 " RCOSST ", d9[1]\n\t" // cos_start[0-3] = cos_vals[3] (RCOSV is q4 = d8:d9, so lane 3 is d9[1]; keep in sync with RCOSV) + " vdup.32 " RSINST ", d11[1]\n\t" // sin_start[0-3] = sin_vals[3] (RSINV is q5 = d10:d11, so lane 3 is d11[1]; keep in sync with RSINV) + + " cmp %[pinput], %[pinput_end]\n\t" //if(pinput < pinput_end) + " bcc for_addfast\n\t" // then goto for_addfast + : + [pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM + : + [pinput_end]"r"(pinput_end), [pdcos]"r"(pdcos), [pdsin]"r"(pdsin), [sin_start]"r"(sin_start), [cos_start]"r"(cos_start) //input operand list + : + "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list + ); + + return phase+input_size*d->phase_increment; +} + +#else + +float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase) +{ + //input_size should be multiple of 4 + //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); + float phase=starting_phase; float
cos_start=cos(starting_phase); float sin_start=sin(starting_phase); float cos_vals[4], sin_vals[4]; for(int i=0;idcos[i] - sin_start * d->dsin[i]; - sin_vals[i] = sin_start * d->dcos[i] + cos_start * d->dsin[i]; + cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j]; + sin_vals[j] = sin_start * d->dcos[j] + cos_start * d->dsin[j]; } - for(int j=0;j<4;j++) + for(int j=0;j<4;j++) //@shift_addfast_cc { iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j); qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j); @@ -299,11 +377,13 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_ cos_start = cos_vals[3]; sin_start = sin_vals[3]; } - return phase; + return phase+input_size*d->phase_increment; } +#endif + #ifdef NEON_OPTS -#pragma message "We have a faster fir_decimate_cc now." +#pragma message "Manual NEON optimizations are ON: we have a faster fir_decimate_cc now." //max help: http://community.arm.com/groups/android-community/blog/2015/03/27/arm-neon-programming-quick-reference diff --git a/libcsdr.h b/libcsdr.h index f3e154e..5ccb370 100644 --- a/libcsdr.h +++ b/libcsdr.h @@ -63,6 +63,8 @@ typedef struct complexf_s { float i; float q; } complexf; //they dropped M_PI in C99, so we define it: #define PI ((float)3.14159265358979323846) +#define TIME_TAKEN(start,end) ((end.tv_sec-start.tv_sec)+(end.tv_nsec-start.tv_nsec)/1e9) + //window typedef enum window_s { @@ -160,7 +162,7 @@ typedef struct shift_addfast_data_s { float dsin[4]; float dcos[4]; - + float phase_increment; } shift_addfast_data_t; shift_addfast_data_t shift_addfast_init(float rate); float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase); diff --git a/test200.c b/test200.c index c2166c9..f233d1a 100644 --- a/test200.c +++ b/test200.c @@ -78,7 +78,7 @@ int main() clock_gettime(CLOCK_MONOTONIC_RAW, &start_time); for(int i=0;i