diff --git a/grc_tests/test_shift_remote.grc b/grc_tests/test_shift_remote.grc
index 516635f..9b4f589 100644
--- a/grc_tests/test_shift_remote.grc
+++ b/grc_tests/test_shift_remote.grc
@@ -69,6 +69,65 @@
       <value></value>
     </param>
   </block>
+  <block>
+    <key>variable_slider</key>
+    <param>
+      <key>comment</key>
+      <value></value>
+    </param>
+    <param>
+      <key>converver</key>
+      <value>float_converter</value>
+    </param>
+    <param>
+      <key>value</key>
+      <value>0</value>
+    </param>
+    <param>
+      <key>_enabled</key>
+      <value>True</value>
+    </param>
+    <param>
+      <key>_coordinate</key>
+      <value>(24, 331)</value>
+    </param>
+    <param>
+      <key>_rotation</key>
+      <value>0</value>
+    </param>
+    <param>
+      <key>grid_pos</key>
+      <value></value>
+    </param>
+    <param>
+      <key>id</key>
+      <value>gen_freq</value>
+    </param>
+    <param>
+      <key>label</key>
+      <value>Frequency:</value>
+    </param>
+    <param>
+      <key>max</key>
+      <value>samp_rate/2</value>
+    </param>
+    <param>
+      <key>min</key>
+      <value>-samp_rate/2</value>
+    </param>
+    <param>
+      <key>notebook</key>
+      <value></value>
+    </param>
+    <param>
+      <key>num_steps</key>
+      <value>100</value>
+    </param>
+    <param>
+      <key>style</key>
+      <value>wx.SL_HORIZONTAL</value>
+    </param>
+  </block>
   <block>
     <key>variable</key>
     <param>
@@ -147,7 +206,7 @@
     </param>
     <param>
       <key>freq</key>
-      <value>20000</value>
+      <value>gen_freq</value>
     </param>
     <param>
       <key>_coordinate</key>
@@ -183,7 +242,7 @@
     </param>
     <param>
       <key>waveform</key>
-      <value>analog.GR_CONST_WAVE</value>
+      <value>analog.GR_SIN_WAVE</value>
     </param>
   </block>
   <block>
diff --git a/grc_tests/test_shift_remote.sh b/grc_tests/test_shift_remote.sh
index 65c7192..14f061d 100755
--- a/grc_tests/test_shift_remote.sh
+++ b/grc_tests/test_shift_remote.sh
@@ -2,7 +2,7 @@
 # Run this script on a Raspberry Pi 2, while running test_shift_remote.grc on your PC. 
 # It allows you to debug the NEON-accelerated version of specific DSP algorithms on the target hardware.
 TEMPSCRIPT="/tmp/test_shift_remote_exec.sh"
-echo '#!/bin/sh\ncsdr shift_addfast_cc -0.1' > $TEMPSCRIPT
+echo '#!/bin/sh\ncsdr shift_addfast_cc -0' > $TEMPSCRIPT
 cat $TEMPSCRIPT
 chmod +x $TEMPSCRIPT
 ncat -vvl 5321 -e $TEMPSCRIPT
diff --git a/libcsdr.c b/libcsdr.c
index 64099bb..d0c3b2d 100644
--- a/libcsdr.c
+++ b/libcsdr.c
@@ -295,9 +295,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 	float* pdcos = d->dcos;
 	float* pdsin = d->dsin;
 	register float* pinput = (float*)input;
-	register float* pinput_end = ((float*)input)+input_size;
+	register float* pinput_end = (float*)(input+input_size);
 	register float* poutput = (float*)output;
 
+	//Register map:
 	#define RDCOS "q0" //dcos, dsin
 	#define RDSIN "q1"
 	#define RCOSST "q2" //cos_start, sin_start
@@ -324,7 +325,7 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 		"		vmul.f32 " R3(RCOSV, RCOSST, RDCOS)  //cos_vals[i] = cos_start * d->dcos[i]
 		"		vmls.f32 " R3(RCOSV, RSINST, RDSIN)  //cos_vals[i] -= sin_start * d->dsin[i]
 		"		vmul.f32 " R3(RSINV, RSINST, RDCOS)  //sin_vals[i] = sin_start * d->dcos[i]
-		"		vmla.f32 " R3(RCOSV, RSINST, RDSIN)  //sin_vals[i] += cos_start * d->dsin[i]
+		"		vmla.f32 " R3(RSINV, RCOSST, RDSIN)  //sin_vals[i] += cos_start * d->dsin[i]
 
 		//C version:
 		//iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
@@ -334,8 +335,8 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 		"		vmul.f32 " R3(ROUTQ, RSINV, RINPI) //sin_vals[i] = sin_start * d->dcos[i]
 		"		vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //sin_vals[i] += cos_start * d->dsin[i]
 
-		"		vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory
-
+		"		vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]\n\t" //store the outputs in memory
+		"		add %[poutput],%[poutput],#32\n\t"
 		"		vdup.32 " RCOSST ", d5[1]\n\t" // cos_start[0-3] = cos_vals[3]
 		"		vdup.32 " RSINST ", d7[1]\n\t" // sin_start[0-3] = sin_vals[3]
 
@@ -348,7 +349,6 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 	: 
 		"memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list
 	);
-
 	return phase+input_size*d->phase_increment;
 }