Added NEON optimization for DDC.

Buffer size can now automatically adjust to sampling rate changes between csdr processes.
This commit is contained in:
ha7ilm 2015-09-30 13:52:43 +00:00
parent 6ee80ea7ce
commit 9cec3782e2
5 changed files with 582 additions and 109 deletions

View file

@ -32,7 +32,7 @@ LIBSOURCES = fft_fftw.c libcsdr_wrapper.c
#SOURCES = csdr.c $(LIBSOURCES) #SOURCES = csdr.c $(LIBSOURCES)
cpufeature = $(if $(findstring $(1),$(shell cat /proc/cpuinfo)),$(2)) cpufeature = $(if $(findstring $(1),$(shell cat /proc/cpuinfo)),$(2))
PARAMS_SSE = $(call cpufeature,sse,-msse) $(call cpufeature,sse2,-msse2) $(call cpufeature,sse3,-msse3) $(call cpufeature,sse4,-msse4) $(call cpufeature,sse4_1,-msse4.1) $(call cpufeature,sse4_2,-msse4.2) -mfpmath=sse PARAMS_SSE = $(call cpufeature,sse,-msse) $(call cpufeature,sse2,-msse2) $(call cpufeature,sse3,-msse3) $(call cpufeature,sse4,-msse4) $(call cpufeature,sse4_1,-msse4.1) $(call cpufeature,sse4_2,-msse4.2) -mfpmath=sse
PARAMS_NEON = -mfloat-abi=hard -march=armv7-a -mtune=cortex-a8 -mfpu=neon -mvectorize-with-neon-quad -funsafe-math-optimizations -Wformat=0 PARAMS_NEON = -mfloat-abi=hard -march=armv7-a -mtune=cortex-a8 -mfpu=neon -mvectorize-with-neon-quad -funsafe-math-optimizations -Wformat=0 -DNEON_OPTS
#tnx Jan Szumiec for the Raspberry Pi support #tnx Jan Szumiec for the Raspberry Pi support
PARAMS_RASPI = -mfloat-abi=hard -mcpu=arm1176jzf-s -mfpu=vfp -funsafe-math-optimizations -Wformat=0 PARAMS_RASPI = -mfloat-abi=hard -mcpu=arm1176jzf-s -mfpu=vfp -funsafe-math-optimizations -Wformat=0
PARAMS_ARM = $(if $(call cpufeature,BCM2708,dummy-text),$(PARAMS_RASPI),$(PARAMS_NEON)) PARAMS_ARM = $(if $(call cpufeature,BCM2708,dummy-text),$(PARAMS_RASPI),$(PARAMS_NEON))
@ -47,12 +47,12 @@ all: clean-vect
@echo NOTE: you may have to manually edit Makefile to optimize for your CPU \(especially if you compile on ARM, please edit PARAMS_NEON\). @echo NOTE: you may have to manually edit Makefile to optimize for your CPU \(especially if you compile on ARM, please edit PARAMS_NEON\).
@echo Auto-detected optimization parameters: $(PARAMS_SIMD) @echo Auto-detected optimization parameters: $(PARAMS_SIMD)
@echo @echo
c99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) $(LIBSOURCES) $(PARAMS_LIBS) $(PARAMS_MISC) -fpic -shared -o libcsdr.so gcc -std=gnu99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) $(LIBSOURCES) $(PARAMS_LIBS) $(PARAMS_MISC) -fpic -shared -o libcsdr.so
-./parsevect dumpvect*.vect -./parsevect dumpvect*.vect
c99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) csdr.c $(PARAMS_LIBS) -L. -lcsdr $(PARAMS_MISC) -o csdr gcc -std=gnu99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) csdr.c $(PARAMS_LIBS) -L. -lcsdr $(PARAMS_MISC) -o csdr
arm-cross: clean-vect arm-cross: clean-vect
#note: this doesn't work since having added FFTW #note: this doesn't work since having added FFTW
arm-linux-gnueabihf-gcc -std=c99 -O3 -fshort-double -ffast-math -dumpbase dumpvect-arm -fdump-tree-vect-details -mfloat-abi=softfp -march=armv7-a -mtune=cortex-a9 -mfpu=neon -mvectorize-with-neon-quad -Wno-unused-result -Wformat=0 $(SOURCES) -lm -o ./csdr arm-linux-gnueabihf-gcc -std=gnu99 -O3 -fshort-double -ffast-math -dumpbase dumpvect-arm -fdump-tree-vect-details -mfloat-abi=softfp -march=armv7-a -mtune=cortex-a9 -mfpu=neon -mvectorize-with-neon-quad -Wno-unused-result -Wformat=0 $(SOURCES) -lm -o ./csdr
clean-vect: clean-vect:
rm -f dumpvect*.vect rm -f dumpvect*.vect
clean: clean-vect clean: clean-vect
@ -65,6 +65,8 @@ install:
uninstall: uninstall:
rm /usr/lib/libcsdr.so /usr/bin/csdr /usr/bin/csdr-fm rm /usr/lib/libcsdr.so /usr/bin/csdr /usr/bin/csdr-fm
ldconfig ldconfig
disasm:
objdump -S libcsdr.so > libcsdr.disasm
emcc-clean: emcc-clean:
-rm sdr.js/sdr.js -rm sdr.js/sdr.js
-rm sdr.js/sdrjs-compiled.js -rm sdr.js/sdrjs-compiled.js

530
csdr.c

File diff suppressed because it is too large Load diff

102
libcsdr.c
View file

@ -263,6 +263,75 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
return phase; return phase;
} }
#ifdef NEON_OPTS
#pragma message "We have a faster fir_decimate_cc now."
//max help: http://community.arm.com/groups/android-community/blog/2015/03/27/arm-neon-programming-quick-reference
int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
{
//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
//It uses real taps. It returns the number of output samples actually written.
//It needs overlapping input based on its returned value:
//number of processed input samples = returned value * decimation factor
//The output buffer should be at least input_length / 3.
// i: input index | ti: tap index | oi: output index
int oi=0;
for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
{
if(i+taps_length>input_size) break;
register float acci=0;
register float accq=0;
register int ti=0;
register float* pinput=(float*)&(input[i+ti]);
register float* ptaps=taps;
register float* ptaps_end=taps+taps_length;
float quad_acciq [8];
/*
q0, q1: input signal I sample and Q sample
q2: taps
q4, q5: accumulator for I branch and Q branch (will be the output)
*/
//fprintf(stderr, "macska\n");
asm volatile(
//" vorr.f32 q4, #0\n\t" //null the accumulators
//" vorr.f32 q5, #0\n\t"
" vmov.f32 q4, #0.0\n\t" //another way to null the accumulators
" vmov.f32 q5, #0.0\n\t"
"for_fdccasm: vld2.32 {q0-q1}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in q0 and the Q samples in q1), also increment the memory address in pinput (hence the "!" mark) //http://community.arm.com/groups/processors/blog/2010/03/17/coding-for-neon--part-1-load-and-stores
" vld1.32 {q2}, [%[ptaps]]!\n\t"
" vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
" vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
" cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps == ptaps_end)
" bcc for_fdccasm\n\t" // then goto for_fdcasm
" vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
" vst1.32 {q5}, [%[quad_accq]]\n\t"
:
[pinput]"+r"(pinput), [ptaps]"+r"(ptaps) //output operand list
:
[ptaps_end]"r"(ptaps_end), [quad_acci]"r"(quad_acciq), [quad_accq]"r"(quad_acciq+4) //input operand list
:
"memory", "q0", "q1", "q2", "q4", "q5", "cc" //clobber list
);
//original for loops for reference:
//for(int ti=0; ti<taps_length; ti++) acci += (iof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: i loop
//for(int ti=0; ti<taps_length; ti++) accq += (qof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: q loop
//for(int n=0;n<8;n++) fprintf(stderr, "\n>> [%d] %g \n", n, quad_acciq[n]);
iof(output,oi)=quad_acciq[0]+quad_acciq[1]+quad_acciq[2]+quad_acciq[3]; //we're still not ready, as we have to add up the contents of a quad accumulator register to get a single accumulated value
qof(output,oi)=quad_acciq[4]+quad_acciq[5]+quad_acciq[6]+quad_acciq[7];
oi++;
}
return oi;
}
#else
int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length) int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
{ {
//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation //Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
@ -286,6 +355,34 @@ int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decim
return oi; return oi;
} }
#endif
/*
int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
{
//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
//It uses real taps. It returns the number of output samples actually written.
//It needs overlapping input based on its returned value:
//number of processed input samples = returned value * decimation factor
//The output buffer should be at least input_length / 3.
// i: input index | ti: tap index | oi: output index
int oi=0;
for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
{
if(i+taps_length>input_size) break;
float acci=0;
int taps_halflength = taps_length/2;
for(int ti=0; ti<taps_halflength; ti++) acci += (iof(input,i+ti)+iof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: i loop
float accq=0;
for(int ti=0; ti<taps_halflength; ti++) accq += (qof(input,i+ti)+qof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: q loop
iof(output,oi)=acci+iof(input,i+taps_halflength)*taps[taps_halflength];
qof(output,oi)=accq+qof(input,i+taps_halflength)*taps[taps_halflength];
oi++;
}
return oi;
}
*/
rational_resampler_ff_t rational_resampler_ff(float *input, float *output, int input_size, int interpolation, int decimation, float *taps, int taps_length, int last_taps_delay) rational_resampler_ff_t rational_resampler_ff(float *input, float *output, int input_size, int interpolation, int decimation, float *taps, int taps_length, int last_taps_delay)
{ {
@ -524,7 +621,8 @@ float fastdcblock_ff(float* input, float* output, int input_size, float last_dc_
return avg; return avg;
} }
#define FASTAGC_MAX_GAIN (65e3) //#define FASTAGC_MAX_GAIN (65e3)
#define FASTAGC_MAX_GAIN 50
void fastagc_ff(fastagc_ff_t* input, float* output) void fastagc_ff(fastagc_ff_t* input, float* output)
{ {
@ -553,6 +651,7 @@ void fastagc_ff(fastagc_ff_t* input, float* output)
//we change the gain linearly on the apply_block from the last_gain to target_gain. //we change the gain linearly on the apply_block from the last_gain to target_gain.
float target_gain=input->reference/target_peak; float target_gain=input->reference/target_peak;
if(target_gain>FASTAGC_MAX_GAIN) target_gain=FASTAGC_MAX_GAIN; if(target_gain>FASTAGC_MAX_GAIN) target_gain=FASTAGC_MAX_GAIN;
//fprintf(stderr, "target_gain: %g\n",target_gain);
for(int i=0;i<input->input_size;i++) //@fastagc_ff: apply gain for(int i=0;i<input->input_size;i++) //@fastagc_ff: apply gain
{ {
@ -572,7 +671,6 @@ void fastagc_ff(fastagc_ff_t* input, float* output)
//fprintf(stderr,"target_gain=%g\n", target_gain); //fprintf(stderr,"target_gain=%g\n", target_gain);
} }
/* /*
______ __ __ _ _ _ _ ______ __ __ _ _ _ _
| ____| \/ | | | | | | | | | | ____| \/ | | | | | | | | |

View file

@ -189,12 +189,14 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
{ {
if(last_peak<input_abs) if(last_peak<input_abs)
{ {
attack_wait_counter=attack_wait_time; attack_wait_counter=attack_wait_time;
last_peak=input_abs; last_peak=input_abs;
} }
if(attack_wait_counter>0) if(attack_wait_counter>0)
{ {
attack_wait_counter--; attack_wait_counter--;
//fprintf(stderr,"A");
dgain=0; dgain=0;
} }
else else
@ -203,6 +205,7 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
dgain=error*attack_rate; dgain=error*attack_rate;
//Before starting to increase the gain next time, we will be waiting until hang_time for sure. //Before starting to increase the gain next time, we will be waiting until hang_time for sure.
hang_counter=hang_time; hang_counter=hang_time;
} }
} }
else //DECREASE IN SIGNAL LEVEL else //DECREASE IN SIGNAL LEVEL
@ -222,7 +225,7 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
} }
//output[i]=gain*input[i]; //Here we do the actual scaling of the samples. //output[i]=gain*input[i]; //Here we do the actual scaling of the samples.
//Here we do the actual scaling of the samples, but we run an IIR filter on the gain values: //Here we do the actual scaling of the samples, but we run an IIR filter on the gain values:
output[i]=(gain+last_gain-gain_filter_alpha*last_gain)*input[i]; //dc-pass-filter: freqz([1 -1],[1 -0.99]) y[i]=x[i]+y[i-1]-alpha*x[i-1] output[i]=(gain=gain+last_gain-gain_filter_alpha*last_gain)*input[i]; //dc-pass-filter: freqz([1 -1],[1 -0.99]) y[i]=x[i]+y[i-1]-alpha*x[i-1]
//output[i]=input[i]*(last_gain+gain_filter_alpha*(gain-last_gain)); //LPF //output[i]=input[i]*(last_gain+gain_filter_alpha*(gain-last_gain)); //LPF
last_gain=gain; last_gain=gain;

38
sdr.js/sdrjs-test.html Normal file
View file

@ -0,0 +1,38 @@
<!doctype html>
<!--
This software is part of libcsdr, a set of simple DSP routines for
Software Defined Radio.
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
<html lang="en-us">
<head>
<script type="text/javascript" src="sdr.js"></script>
</head>
<body style="font-family: monospace">
<script type="text/javascript">test_firdes_lowpass_f_new();</script>
</body>
</html>