Fixed shift_addfast_cc/NEON, now it works! Added shift_unroll_cc.

This commit is contained in:
ha7ilm 2015-11-29 22:46:06 +00:00
parent 965ea631fc
commit 95ebc0e790
6 changed files with 139 additions and 19 deletions

50
csdr.c
View file

@ -535,6 +535,56 @@ int main(int argc, char *argv[])
return 0; return 0;
} }
if(!strcmp(argv[1],"shift_unroll_cc"))
{
	// Command: shift_unroll_cc <rate>
	// Frequency-shifts the complex stream by rotating every sample with
	// shift_unroll_cc(), working through each buffer in table-sized pieces.
	// NOTE(review): init_fifo() / read_fifo_ctl() are defined elsewhere; this
	// branch assumes a nonzero fd means "rate is delivered through a control
	// FIFO" and that read_fifo_ctl() returns nonzero once a new rate was parsed.

	// One constant for both the lookup-table length and the per-call chunk
	// length: shift_unroll_cc() must never be asked for more samples than the
	// table built by shift_unroll_init() contains.
	const int table_size=1024;

	bigbufs=1;
	float starting_phase=0;
	float rate;
	int fd;
	if((fd=init_fifo(argc,argv)))
	{
		// Wait until the first rate value arrives before processing anything.
		while(!read_fifo_ctl(fd,"%g\n",&rate)) usleep(10000);
	}
	else
	{
		if(argc<=2) return badsyntax("need required parameter (rate)");
		// Reject unparsable input instead of running with an uninitialized rate.
		if(sscanf(argv[2],"%g",&rate)!=1) return badsyntax("rate should be a number");
	}
	if(!sendbufsize(initialize_buffers())) return -2;
	for(;;)
	{
		// (Re)build the sin/cos tables for the current rate. The phase is kept
		// across reinitializations so the output stays continuous.
		shift_unroll_data_t data=shift_unroll_init(rate, table_size);
		fprintf(stderr,"shift_unroll_cc: reinitialized to %g\n",rate);
		int remain, current_size;
		float* ibufptr;
		float* obufptr;
		for(;;)
		{
			FEOF_CHECK;
			if(!FREAD_C) break;
			remain=the_bufsize;
			ibufptr=input_buffer;
			obufptr=output_buffer;
			while(remain)
			{
				current_size=(remain>table_size)?table_size:remain;
				starting_phase=shift_unroll_cc((complexf*)ibufptr, (complexf*)obufptr, current_size, &data, starting_phase);
				// Buffers are interleaved I/Q floats: two floats per complex sample.
				ibufptr+=current_size*2;
				obufptr+=current_size*2;
				remain-=current_size;
			}
			FWRITE_C;
			// A new rate on the control FIFO forces a table rebuild.
			if(read_fifo_ctl(fd,"%g\n",&rate)) break;
			TRY_YIELD;
		}
		// Release the tables of this rate before the next shift_unroll_init();
		// otherwise every rate change leaks two table_size-float arrays.
		free(data.dsin);
		free(data.dcos);
	}
	return 0;
}
#ifdef LIBCSDR_GPL #ifdef LIBCSDR_GPL
if(!strcmp(argv[1],"decimating_shift_addition_cc")) if(!strcmp(argv[1],"decimating_shift_addition_cc"))
{ {

View file

@ -355,7 +355,7 @@
</param> </param>
<param> <param>
<key>commandline</key> <key>commandline</key>
<value>ncat -vv raspberrypi.local 5321</value> <value>ncat -v raspberrypi.local 5321</value>
</param> </param>
<param> <param>
<key>comment</key> <key>comment</key>

View file

@ -2,7 +2,7 @@
# Run this script on a Raspberry Pi 2, while running test_shift_remote.grc on your PC. # Run this script on a Raspberry Pi 2, while running test_shift_remote.grc on your PC.
# It allows you to debug the NEON-accelerated version of specific DSP algorithms on the target hardware. # It allows you to debug the NEON-accelerated version of specific DSP algorithms on the target hardware.
TEMPSCRIPT="/tmp/test_shift_remote_exec.sh" TEMPSCRIPT="/tmp/test_shift_remote_exec.sh"
echo '#!/bin/sh\ncsdr shift_addfast_cc -0' > $TEMPSCRIPT echo '#!/bin/sh\ncsdr shift_addfast_cc -0.1' > $TEMPSCRIPT
cat $TEMPSCRIPT cat $TEMPSCRIPT
chmod +x $TEMPSCRIPT chmod +x $TEMPSCRIPT
ncat -vvl 5321 -e $TEMPSCRIPT ncat -vvl 5321 -e $TEMPSCRIPT

View file

@ -264,6 +264,44 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
} }
/*
 * Build the sin/cos lookup tables used by shift_unroll_cc().
 *
 * rate: frequency shift per sample; the per-sample phase step is 2*PI*rate.
 * size: number of table entries, i.e. the largest block shift_unroll_cc()
 *       can rotate from a single starting phase.
 *
 * Entry i holds sin/cos of (i+1) phase steps, with the running phase wrapped
 * into [-PI, PI] after each step.
 *
 * Returns the filled descriptor. The caller owns dsin and dcos and must
 * free() both. If size is not positive or an allocation fails, the result
 * has dsin == dcos == NULL and size == 0 (phase_increment is still set).
 */
shift_unroll_data_t shift_unroll_init(float rate, int size)
{
	shift_unroll_data_t output;
	output.phase_increment=2*rate*PI;
	output.size=0;
	output.dsin=NULL;
	output.dcos=NULL;
	if(size<=0) return output; // nothing to allocate; avoids a negative size turning into a huge request

	output.dsin=malloc(sizeof *output.dsin * (size_t)size);
	output.dcos=malloc(sizeof *output.dcos * (size_t)size);
	if(!output.dsin || !output.dcos)
	{
		// Do not hand back a half-built table; free(NULL) is a no-op.
		free(output.dsin);
		free(output.dcos);
		output.dsin=NULL;
		output.dcos=NULL;
		return output;
	}
	output.size=size;

	float myphase = 0;
	for(int i=0;i<size;i++)
	{
		// Advance first, so entry 0 already corresponds to one phase step.
		myphase += output.phase_increment;
		while(myphase>PI) myphase-=2*PI;
		while(myphase<-PI) myphase+=2*PI;
		output.dsin[i]=sin(myphase);
		output.dcos[i]=cos(myphase);
	}
	return output;
}
/*
 * Frequency-shift a block of complex samples using the precomputed tables
 * from shift_unroll_init().
 *
 * input, output: arrays of input_size complex samples.
 * input_size:    number of samples to process; values <= 0 process nothing.
 * d:             table descriptor (dsin, dcos, size, phase_increment).
 * starting_phase: oscillator phase in radians at the start of the block.
 *
 * Sample i of a chunk is multiplied by exp(j*(chunk_phase + (i+1)*phase_increment)),
 * obtained from the angle-sum identities so that sin()/cos() are evaluated
 * only once per chunk.
 *
 * Blocks longer than d->size are handled as consecutive chunks of at most
 * d->size samples, so the tables are never read out of bounds. For
 * input_size <= d->size the result is the same as a single pass.
 * If the tables are missing (NULL pointers or size <= 0) the output is left
 * untouched.
 *
 * Returns the phase to pass as starting_phase for the next block, wrapped
 * into [-PI, PI].
 */
float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
{
	int table_size=(d->dsin && d->dcos)?d->size:0;
	int remaining=(table_size>0)?input_size:0;
	// iof()/qof() are project macros; plain identifiers and a plain index are
	// passed to them so the code does not depend on how they parenthesize.
	complexf *in_chunk=input;
	complexf *out_chunk=output;
	while(remaining>0)
	{
		int chunk=(remaining>table_size)?table_size:remaining;
		float cos_start=cos(starting_phase);
		float sin_start=sin(starting_phase);
		for(int i=0;i<chunk; i++) //@shift_unroll_cc
		{
			// cos/sin of (starting_phase + table phase) via angle addition
			float cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
			float sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
			// complex multiply: (I + jQ) * (cos_val + j*sin_val)
			iof(out_chunk,i)=cos_val*iof(in_chunk,i)-sin_val*qof(in_chunk,i);
			qof(out_chunk,i)=sin_val*iof(in_chunk,i)+cos_val*qof(in_chunk,i);
		}
		starting_phase+=chunk*d->phase_increment;
		while(starting_phase>PI) starting_phase-=2*PI;
		while(starting_phase<-PI) starting_phase+=2*PI;
		in_chunk+=chunk;
		out_chunk+=chunk;
		remaining-=chunk;
	}
	// Also normalizes the phase when no chunk was processed.
	while(starting_phase>PI) starting_phase-=2*PI;
	while(starting_phase<-PI) starting_phase+=2*PI;
	return starting_phase;
}
shift_addfast_data_t shift_addfast_init(float rate) shift_addfast_data_t shift_addfast_init(float rate)
{ {
@ -283,7 +321,6 @@ shift_addfast_data_t shift_addfast_init(float rate)
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase) float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{ {
//input_size should be multiple of 4 //input_size should be multiple of 4
float phase=starting_phase;
float cos_start[4], sin_start[4]; float cos_start[4], sin_start[4];
float cos_vals[4], sin_vals[4]; float cos_vals[4], sin_vals[4];
for(int i=0;i<4;i++) for(int i=0;i<4;i++)
@ -316,7 +353,7 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
" vld1.32 {" RDSIN "}, [%[pdsin]]\n\t" " vld1.32 {" RDSIN "}, [%[pdsin]]\n\t"
" vld1.32 {" RCOSST "}, [%[cos_start]]\n\t" " vld1.32 {" RCOSST "}, [%[cos_start]]\n\t"
" vld1.32 {" RSINST "}, [%[sin_start]]\n\t" " vld1.32 {" RSINST "}, [%[sin_start]]\n\t"
"for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in rinpi and the Q samples in rinpq), also increment the memory address in pinput (hence the "!" mark) "for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in RINPI and the Q samples in RINPQ), also increment the memory address in pinput (hence the "!" mark)
//C version: //C version:
//cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j]; //cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
@ -330,18 +367,18 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
//C version: //C version:
//iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j); //iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
//qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j); //qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);
" vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output = cos_vals * input " vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output_i = cos_vals * input_i
" vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output -= sin_vals * input " vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output_i -= sin_vals * input_q
" vmul.f32 " R3(ROUTQ, RSINV, RINPI) //sin_vals[i] = sin_start * d->dcos[i] " vmul.f32 " R3(ROUTQ, RSINV, RINPI) //output_q = sin_vals * input_i
" vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //sin_vals[i] += cos_start * d->dsin[i] " vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //output_i += cos_vals * input_q
" vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]\n\t" //store the outputs in memory " vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory
" add %[poutput],%[poutput],#32\n\t" //" add %[poutput],%[poutput],#32\n\t"
" vdup.32 " RCOSST ", d5[1]\n\t" // cos_start[0-3] = cos_vals[3] " vdup.32 " RCOSST ", d9[1]\n\t" // cos_start[0-3] = cos_vals[3]
" vdup.32 " RSINST ", d7[1]\n\t" // sin_start[0-3] = sin_vals[3] " vdup.32 " RSINST ", d11[1]\n\t" // sin_start[0-3] = sin_vals[3]
" cmp %[pinput], %[pinput_end]\n\t" //if(pinput == pinput_end) " cmp %[pinput], %[pinput_end]\n\t" //if(pinput != pinput_end)
" bcc for_addfast\n\t" // then goto for_fdcasm " bcc for_addfast\n\t" // then goto for_addfast
: :
[pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM [pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM
: :
@ -349,7 +386,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
: :
"memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list
); );
return phase+input_size*d->phase_increment; starting_phase+=input_size*d->phase_increment;
while(starting_phase>PI) starting_phase-=2*PI;
while(starting_phase<-PI) starting_phase+=2*PI;
return starting_phase;
} }
#else #else
@ -358,7 +398,6 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
{ {
//input_size should be multiple of 4 //input_size should be multiple of 4
//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
float phase=starting_phase;
float cos_start=cos(starting_phase); float cos_start=cos(starting_phase);
float sin_start=sin(starting_phase); float sin_start=sin(starting_phase);
float cos_vals[4], sin_vals[4]; float cos_vals[4], sin_vals[4];
@ -377,7 +416,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
cos_start = cos_vals[3]; cos_start = cos_vals[3];
sin_start = sin_vals[3]; sin_start = sin_vals[3];
} }
return phase+input_size*d->phase_increment; starting_phase+=input_size*d->phase_increment;
while(starting_phase>PI) starting_phase-=2*PI;
while(starting_phase<-PI) starting_phase+=2*PI;
return starting_phase;
} }
#endif #endif
@ -422,7 +464,7 @@ q4, q5: accumulator for I branch and Q branch (will be the output)
" vld1.32 {q2}, [%[ptaps]]!\n\t" " vld1.32 {q2}, [%[ptaps]]!\n\t"
" vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html " vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
" vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1 " vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
" cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps == ptaps_end) " cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps != ptaps_end)
" bcc for_fdccasm\n\t" // then goto for_fdcasm " bcc for_fdccasm\n\t" // then goto for_fdcasm
" vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory " vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
" vst1.32 {q5}, [%[quad_accq]]\n\t" " vst1.32 {q5}, [%[quad_accq]]\n\t"

View file

@ -165,8 +165,18 @@ typedef struct shift_addfast_data_s
float phase_increment; float phase_increment;
} shift_addfast_data_t; } shift_addfast_data_t;
shift_addfast_data_t shift_addfast_init(float rate); shift_addfast_data_t shift_addfast_init(float rate);
shift_addfast_data_t shift_addfast_init(float rate);
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase); float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
// Precomputed rotation tables for shift_unroll_cc(), built by shift_unroll_init().
typedef struct shift_unroll_data_s
{
float* dsin; // dsin[i] = sine of the phase after (i+1) increments; allocated with malloc() by shift_unroll_init()
float* dcos; // dcos[i] = cosine of the phase after (i+1) increments; allocated with malloc() by shift_unroll_init()
float phase_increment; // phase step per sample in radians (2*PI*rate)
int size; // number of entries in dsin and dcos
} shift_unroll_data_t;
// Rotates input_size complex samples from input into output, starting at
// starting_phase, using the tables in d; returns the phase for the next call.
// The tables are indexed from 0 to input_size-1, so input_size must not exceed d->size.
float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase);
// Allocates and fills tables of `size` entries for the given shift rate.
shift_unroll_data_t shift_unroll_init(float rate, int size);
int log2n(int x); int log2n(int x);
int next_pow2(int x); int next_pow2(int x);

View file

@ -62,9 +62,18 @@ int main()
fprintf(stderr,"Starting tests of processing %d samples...\n", T_BUFSIZE*T_N); fprintf(stderr,"Starting tests of processing %d samples...\n", T_BUFSIZE*T_N);
//shift_math_cc
float starting_phase = 0;
clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
for(int i=0;i<T_N;i++) starting_phase = shift_math_cc(buf_c, outbuf_c, T_BUFSIZE, 0.1, starting_phase);
clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
fprintf(stderr,"shift_math_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
//shift_addition_cc //shift_addition_cc
shift_addition_data_t data_addition = shift_addition_init(0.1); shift_addition_data_t data_addition = shift_addition_init(0.1);
float starting_phase = 0; starting_phase = 0;
clock_gettime(CLOCK_MONOTONIC_RAW, &start_time); clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
for(int i=0;i<T_N;i++) starting_phase = shift_addition_cc(buf_c, outbuf_c, T_BUFSIZE, data_addition, starting_phase); for(int i=0;i<T_N;i++) starting_phase = shift_addition_cc(buf_c, outbuf_c, T_BUFSIZE, data_addition, starting_phase);
@ -80,5 +89,14 @@ int main()
clock_gettime(CLOCK_MONOTONIC_RAW, &end_time); clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
fprintf(stderr,"shift_addfast_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time)); fprintf(stderr,"shift_addfast_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
//shift_unroll_cc
shift_unroll_data_t data_unroll = shift_unroll_init(0.1, T_BUFSIZE);
starting_phase = 0;
clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
for(int i=0;i<T_N;i++) starting_phase = shift_unroll_cc(buf_c, outbuf_c, T_BUFSIZE, &data_unroll, starting_phase);
clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
fprintf(stderr,"shift_unroll_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
} }