Fixed shift_addfast_cc/NEON, now it works! Added shift_unroll_cc.

2015-11-29 22:46:06 +00:00 · 2015-11-29 22:46:06 +00:00 · 95ebc0e790
commit 95ebc0e790
parent 965ea631fc
6 changed files with 139 additions and 19 deletions
--- a/csdr.c
+++ b/csdr.c
@ -535,6 +535,56 @@ int main(int argc, char *argv[])
 		return 0;
 	}

+
+	if(!strcmp(argv[1],"shift_unroll_cc"))
+	{
+		bigbufs=1;
+
+		float starting_phase=0;
+		float rate;
+
+		int fd;
+		if(fd=init_fifo(argc,argv))
+		{
+			while(!read_fifo_ctl(fd,"%g\n",&rate)) usleep(10000);
+		}
+		else
+		{
+			if(argc<=2) return badsyntax("need required parameter (rate)"); 
+			sscanf(argv[2],"%g",&rate);
+		}
+
+		if(!sendbufsize(initialize_buffers())) return -2;
+		for(;;)
+		{
+			shift_unroll_data_t data=shift_unroll_init(rate, 1024);
+			fprintf(stderr,"shift_unroll_cc: reinitialized to %g\n",rate);
+			int remain, current_size;
+			float* ibufptr;
+			float* obufptr;
+			for(;;)
+			{
+				FEOF_CHECK;
+				if(!FREAD_C) break;
+				remain=the_bufsize;
+				ibufptr=input_buffer;
+				obufptr=output_buffer;
+				while(remain)
+				{
+					current_size=(remain>1024)?1024:remain;
+					starting_phase=shift_unroll_cc((complexf*)ibufptr, (complexf*)obufptr, current_size, &data, starting_phase);
+					ibufptr+=current_size*2;
+					obufptr+=current_size*2;
+					remain-=current_size;
+				}
+				FWRITE_C;
+				if(read_fifo_ctl(fd,"%g\n",&rate)) break;
+				TRY_YIELD;
+			}
+		}
+		return 0;
+	}
+
 #ifdef LIBCSDR_GPL
 	if(!strcmp(argv[1],"decimating_shift_addition_cc"))
 	{
--- a/grc_tests/test_shift_remote.grc
+++ b/grc_tests/test_shift_remote.grc
@ -355,7 +355,7 @@
    </param>
    <param>
      <key>commandline</key>
-      <value>ncat -vv raspberrypi.local 5321</value>
+      <value>ncat -v raspberrypi.local 5321</value>
    </param>
    <param>
      <key>comment</key>
--- a/grc_tests/test_shift_remote.sh
+++ b/grc_tests/test_shift_remote.sh
@ -2,7 +2,7 @@
 # Run this script on a Raspberry Pi 2, while running test_shift_remote.grc on your PC. 
 # It allows you to debug the NEON-accelerated version of specific DSP algorithms on the target hardware.
 TEMPSCRIPT="/tmp/test_shift_remote_exec.sh"
-echo '#!/bin/sh\ncsdr shift_addfast_cc -0' > $TEMPSCRIPT
+echo '#!/bin/sh\ncsdr shift_addfast_cc -0.1' > $TEMPSCRIPT
 cat $TEMPSCRIPT
 chmod +x $TEMPSCRIPT
 ncat -vvl 5321 -e $TEMPSCRIPT
--- a/libcsdr.c
+++ b/libcsdr.c
@ -264,6 +264,44 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
 }


+shift_unroll_data_t shift_unroll_init(float rate, int size)
+{
+	shift_unroll_data_t output;
+	output.phase_increment=2*rate*PI;
+	output.size = size;
+	output.dsin=(float*)malloc(sizeof(float)*size);
+	output.dcos=(float*)malloc(sizeof(float)*size);
+	float myphase = 0;
+	for(int i=0;i<size;i++)
+	{
+		myphase += output.phase_increment;
+		while(myphase>PI) myphase-=2*PI;
+		while(myphase<-PI) myphase+=2*PI;		
+		output.dsin[i]=sin(myphase);
+		output.dcos[i]=cos(myphase);
+	}
+	return output;	
+}
+
+float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
+{
+	//input_size should be multiple of 4
+	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+	float cos_start=cos(starting_phase);
+	float sin_start=sin(starting_phase);
+	register float cos_val, sin_val;
+	for(int i=0;i<input_size; i++) //@shift_unroll_cc
+	{
+		cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
+		sin_val  = sin_start * d->dcos[i] + cos_start * d->dsin[i];
+		iof(output,i)=cos_val*iof(input,i)-sin_val*qof(input,i);
+		qof(output,i)=sin_val*iof(input,i)+cos_val*qof(input,i);
+	}
+	starting_phase+=input_size*d->phase_increment;
+	while(starting_phase>PI) starting_phase-=2*PI;
+	while(starting_phase<-PI) starting_phase+=2*PI;
+	return starting_phase;
+}

 shift_addfast_data_t shift_addfast_init(float rate)
 {
@ -283,7 +321,6 @@ shift_addfast_data_t shift_addfast_init(float rate)
 float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
 {
 	//input_size should be multiple of 4
-	float phase=starting_phase;
 	float cos_start[4], sin_start[4];
 	float cos_vals[4], sin_vals[4];
 	for(int i=0;i<4;i++) 
@ -316,7 +353,7 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 		"		vld1.32	{" RDSIN "}, [%[pdsin]]\n\t"
 		"		vld1.32	{" RCOSST "}, [%[cos_start]]\n\t"
 		"		vld1.32	{" RSINST "}, [%[sin_start]]\n\t"
-		"for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in rinpi and the Q samples in rinpq), also increment the memory address in pinput (hence the "!" mark) 
+		"for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in RINPI and the Q samples in RINPQ), also increment the memory address in pinput (hence the "!" mark) 

 		//C version:
 		//cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
@ -330,18 +367,18 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 		//C version:
 		//iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
 		//qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);	
-		"		vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output = cos_vals * input
-		"		vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output -= sin_vals * input
-		"		vmul.f32 " R3(ROUTQ, RSINV, RINPI) //sin_vals[i] = sin_start * d->dcos[i]
-		"		vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //sin_vals[i] += cos_start * d->dsin[i]
+		"		vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output_i =  cos_vals * input_i
+		"		vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output_i -= sin_vals * input_q
+		"		vmul.f32 " R3(ROUTQ, RSINV, RINPI) //output_q =  sin_vals * input_i
+		"		vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //output_i += cos_vals * input_q

-		"		vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]\n\t" //store the outputs in memory
-		"		add %[poutput],%[poutput],#32\n\t"
-		"		vdup.32 " RCOSST ", d5[1]\n\t" // cos_start[0-3] = cos_vals[3]
-		"		vdup.32 " RSINST ", d7[1]\n\t" // sin_start[0-3] = sin_vals[3]
+		"		vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory
+		//"		add %[poutput],%[poutput],#32\n\t"
+		"		vdup.32 " RCOSST ", d9[1]\n\t" // cos_start[0-3] = cos_vals[3]
+		"		vdup.32 " RSINST ", d11[1]\n\t" // sin_start[0-3] = sin_vals[3]

-		"		cmp %[pinput], %[pinput_end]\n\t" //if(pinput == pinput_end)
-		"		bcc for_addfast\n\t"			  //	then goto for_fdcasm
+		"		cmp %[pinput], %[pinput_end]\n\t" //if(pinput != pinput_end)
+		"		bcc for_addfast\n\t"			  //	then goto for_addfast
 	:
 		[pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM
 	:
@ -349,7 +386,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 	: 
 		"memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list
 	);
-	return phase+input_size*d->phase_increment;
+	starting_phase+=input_size*d->phase_increment;
+	while(starting_phase>PI) starting_phase-=2*PI;
+	while(starting_phase<-PI) starting_phase+=2*PI;
+	return starting_phase;
 }

 #else
@ -358,7 +398,6 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 {
 	//input_size should be multiple of 4
 	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
-	float phase=starting_phase;
 	float cos_start=cos(starting_phase);
 	float sin_start=sin(starting_phase);
 	float cos_vals[4], sin_vals[4];
@ -377,7 +416,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 		cos_start = cos_vals[3];
 		sin_start = sin_vals[3];
 	}
-	return phase+input_size*d->phase_increment;
+	starting_phase+=input_size*d->phase_increment;
+	while(starting_phase>PI) starting_phase-=2*PI;
+	while(starting_phase<-PI) starting_phase+=2*PI;
+	return starting_phase;
 }

 #endif
@ -422,7 +464,7 @@ q4, q5: accumulator for I branch and Q branch (will be the output)
 			"		vld1.32	{q2}, [%[ptaps]]!\n\t" 
 			"		vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
 			"		vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
-			"		cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps == ptaps_end)
+			"		cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps != ptaps_end)
 			"		bcc for_fdccasm\n\t"			//	then goto for_fdcasm
 			"		vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
 			"		vst1.32 {q5}, [%[quad_accq]]\n\t"
--- a/libcsdr.h
+++ b/libcsdr.h
@ -165,8 +165,18 @@ typedef struct shift_addfast_data_s
 	float phase_increment;
 } shift_addfast_data_t;
 shift_addfast_data_t shift_addfast_init(float rate);
+shift_addfast_data_t shift_addfast_init(float rate);
 float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);

+typedef struct shift_unroll_data_s
+{
+	float* dsin;
+	float* dcos;
+	float phase_increment;
+	int size;
+} shift_unroll_data_t;
+float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase);
+shift_unroll_data_t shift_unroll_init(float rate, int size);

 int log2n(int x);
 int next_pow2(int x);
--- a/test200.c
+++ b/test200.c
@ -62,9 +62,18 @@ int main()

 	fprintf(stderr,"Starting tests of processing %d samples...\n", T_BUFSIZE*T_N);

+	//shift_math_cc
+	float starting_phase = 0;
+
+	clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
+	for(int i=0;i<T_N;i++) starting_phase = shift_math_cc(buf_c, outbuf_c, T_BUFSIZE, 0.1, starting_phase);
+	clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
+	fprintf(stderr,"shift_math_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
+
+
 	//shift_addition_cc	
 	shift_addition_data_t data_addition = shift_addition_init(0.1);
-	float starting_phase = 0;
+	starting_phase = 0;

 	clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
 	for(int i=0;i<T_N;i++) starting_phase = shift_addition_cc(buf_c, outbuf_c, T_BUFSIZE, data_addition, starting_phase);
@ -80,5 +89,14 @@ int main()
 	clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
 	fprintf(stderr,"shift_addfast_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));

+	//shift_unroll_cc
+	shift_unroll_data_t data_unroll = shift_unroll_init(0.1, T_BUFSIZE);
+	starting_phase = 0;
+
+	clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
+	for(int i=0;i<T_N;i++) starting_phase = shift_unroll_cc(buf_c, outbuf_c, T_BUFSIZE, &data_unroll, starting_phase);
+	clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
+	fprintf(stderr,"shift_unroll_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
+

 }