diff --git a/grc_tests/test_shift_remote.grc b/grc_tests/test_shift_remote.grc
index 516635f..9b4f589 100644
--- a/grc_tests/test_shift_remote.grc
+++ b/grc_tests/test_shift_remote.grc
@@ -69,6 +69,65 @@
+
+ variable_slider
+
+ comment
+
+
+
+ converver
+ float_converter
+
+
+ value
+ 0
+
+
+ _enabled
+ True
+
+
+ _coordinate
+ (24, 331)
+
+
+ _rotation
+ 0
+
+
+ grid_pos
+
+
+
+ id
+ gen_freq
+
+
+ label
+ Frequency:
+
+
+ max
+ samp_rate/2
+
+
+ min
+ -samp_rate/2
+
+
+ notebook
+
+
+
+ num_steps
+ 100
+
+
+ style
+ wx.SL_HORIZONTAL
+
+
variable
@@ -147,7 +206,7 @@
freq
- 20000
+ gen_freq
_coordinate
@@ -183,7 +242,7 @@
waveform
- analog.GR_CONST_WAVE
+ analog.GR_SIN_WAVE
diff --git a/grc_tests/test_shift_remote.sh b/grc_tests/test_shift_remote.sh
index 65c7192..14f061d 100755
--- a/grc_tests/test_shift_remote.sh
+++ b/grc_tests/test_shift_remote.sh
@@ -2,7 +2,7 @@
# Run this script on a Raspberry Pi 2, while running test_shift_remote.grc on your PC.
# It allows you to debug the NEON-accelerated version of specific DSP algorithms on the target hardware.
TEMPSCRIPT="/tmp/test_shift_remote_exec.sh"
-echo '#!/bin/sh\ncsdr shift_addfast_cc -0.1' > $TEMPSCRIPT
+echo '#!/bin/sh\ncsdr shift_addfast_cc -0' > $TEMPSCRIPT
cat $TEMPSCRIPT
chmod +x $TEMPSCRIPT
ncat -vvl 5321 -e $TEMPSCRIPT
diff --git a/libcsdr.c b/libcsdr.c
index 64099bb..d0c3b2d 100644
--- a/libcsdr.c
+++ b/libcsdr.c
@@ -295,9 +295,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
float* pdcos = d->dcos;
float* pdsin = d->dsin;
register float* pinput = (float*)input;
- register float* pinput_end = ((float*)input)+input_size;
+ register float* pinput_end = (float*)(input+input_size);
register float* poutput = (float*)output;
+ //Register map:
#define RDCOS "q0" //dcos, dsin
#define RDSIN "q1"
#define RCOSST "q2" //cos_start, sin_start
@@ -324,7 +325,7 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
" vmul.f32 " R3(RCOSV, RCOSST, RDCOS) //cos_vals[i] = cos_start * d->dcos[i]
" vmls.f32 " R3(RCOSV, RSINST, RDSIN) //cos_vals[i] -= sin_start * d->dsin[i]
" vmul.f32 " R3(RSINV, RSINST, RDCOS) //sin_vals[i] = sin_start * d->dcos[i]
- " vmla.f32 " R3(RCOSV, RSINST, RDSIN) //sin_vals[i] += cos_start * d->dsin[i]
+ " vmla.f32 " R3(RSINV, RCOSST, RDSIN) //sin_vals[i] += cos_start * d->dsin[i]
//C version:
//iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
@@ -334,8 +335,8 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
" vmul.f32 " R3(ROUTQ, RSINV, RINPI) //sin_vals[i] = sin_start * d->dcos[i]
" vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //sin_vals[i] += cos_start * d->dsin[i]
- " vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory
-
+ " vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]\n\t" //store the outputs in memory
+ " add %[poutput],%[poutput],#32\n\t"
" vdup.32 " RCOSST ", d5[1]\n\t" // cos_start[0-3] = cos_vals[3]
" vdup.32 " RSINST ", d7[1]\n\t" // sin_start[0-3] = sin_vals[3]
@@ -348,7 +349,6 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
:
"memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list
);
-
return phase+input_size*d->phase_increment;
}