Added NEON optimization for DDC.
Buffer size can now automatically adjust to sampling rate changes between csdr processes.
This commit is contained in:
parent
6ee80ea7ce
commit
9cec3782e2
5 changed files with 582 additions and 109 deletions
10
Makefile
10
Makefile
|
@ -32,7 +32,7 @@ LIBSOURCES = fft_fftw.c libcsdr_wrapper.c
|
|||
#SOURCES = csdr.c $(LIBSOURCES)
|
||||
cpufeature = $(if $(findstring $(1),$(shell cat /proc/cpuinfo)),$(2))
|
||||
PARAMS_SSE = $(call cpufeature,sse,-msse) $(call cpufeature,sse2,-msse2) $(call cpufeature,sse3,-msse3) $(call cpufeature,sse4,-msse4) $(call cpufeature,sse4_1,-msse4.1) $(call cpufeature,sse4_2,-msse4.2) -mfpmath=sse
|
||||
PARAMS_NEON = -mfloat-abi=hard -march=armv7-a -mtune=cortex-a8 -mfpu=neon -mvectorize-with-neon-quad -funsafe-math-optimizations -Wformat=0
|
||||
PARAMS_NEON = -mfloat-abi=hard -march=armv7-a -mtune=cortex-a8 -mfpu=neon -mvectorize-with-neon-quad -funsafe-math-optimizations -Wformat=0 -DNEON_OPTS
|
||||
#tnx Jan Szumiec for the Raspberry Pi support
|
||||
PARAMS_RASPI = -mfloat-abi=hard -mcpu=arm1176jzf-s -mfpu=vfp -funsafe-math-optimizations -Wformat=0
|
||||
PARAMS_ARM = $(if $(call cpufeature,BCM2708,dummy-text),$(PARAMS_RASPI),$(PARAMS_NEON))
|
||||
|
@ -47,12 +47,12 @@ all: clean-vect
|
|||
@echo NOTE: you may have to manually edit Makefile to optimize for your CPU \(especially if you compile on ARM, please edit PARAMS_NEON\).
|
||||
@echo Auto-detected optimization parameters: $(PARAMS_SIMD)
|
||||
@echo
|
||||
c99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) $(LIBSOURCES) $(PARAMS_LIBS) $(PARAMS_MISC) -fpic -shared -o libcsdr.so
|
||||
gcc -std=gnu99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) $(LIBSOURCES) $(PARAMS_LIBS) $(PARAMS_MISC) -fpic -shared -o libcsdr.so
|
||||
-./parsevect dumpvect*.vect
|
||||
c99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) csdr.c $(PARAMS_LIBS) -L. -lcsdr $(PARAMS_MISC) -o csdr
|
||||
gcc -std=gnu99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) csdr.c $(PARAMS_LIBS) -L. -lcsdr $(PARAMS_MISC) -o csdr
|
||||
arm-cross: clean-vect
|
||||
#note: this doesn't work since having added FFTW
|
||||
arm-linux-gnueabihf-gcc -std=c99 -O3 -fshort-double -ffast-math -dumpbase dumpvect-arm -fdump-tree-vect-details -mfloat-abi=softfp -march=armv7-a -mtune=cortex-a9 -mfpu=neon -mvectorize-with-neon-quad -Wno-unused-result -Wformat=0 $(SOURCES) -lm -o ./csdr
|
||||
arm-linux-gnueabihf-gcc -std=gnu99 -O3 -fshort-double -ffast-math -dumpbase dumpvect-arm -fdump-tree-vect-details -mfloat-abi=softfp -march=armv7-a -mtune=cortex-a9 -mfpu=neon -mvectorize-with-neon-quad -Wno-unused-result -Wformat=0 $(SOURCES) -lm -o ./csdr
|
||||
clean-vect:
|
||||
rm -f dumpvect*.vect
|
||||
clean: clean-vect
|
||||
|
@ -65,6 +65,8 @@ install:
|
|||
uninstall:
|
||||
rm /usr/lib/libcsdr.so /usr/bin/csdr /usr/bin/csdr-fm
|
||||
ldconfig
|
||||
disasm:
|
||||
objdump -S libcsdr.so > libcsdr.disasm
|
||||
emcc-clean:
|
||||
-rm sdr.js/sdr.js
|
||||
-rm sdr.js/sdrjs-compiled.js
|
||||
|
|
102
libcsdr.c
102
libcsdr.c
|
@ -263,6 +263,75 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
|
|||
return phase;
|
||||
}
|
||||
|
||||
#ifdef NEON_OPTS
|
||||
#pragma message "We have a faster fir_decimate_cc now."
|
||||
|
||||
//max help: http://community.arm.com/groups/android-community/blog/2015/03/27/arm-neon-programming-quick-reference
|
||||
|
||||
int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
|
||||
{
|
||||
//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
|
||||
//It uses real taps. It returns the number of output samples actually written.
|
||||
//It needs overlapping input based on its returned value:
|
||||
//number of processed input samples = returned value * decimation factor
|
||||
//The output buffer should be at least input_length / 3.
|
||||
// i: input index | ti: tap index | oi: output index
|
||||
int oi=0;
|
||||
for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
|
||||
{
|
||||
if(i+taps_length>input_size) break;
|
||||
register float acci=0;
|
||||
register float accq=0;
|
||||
|
||||
register int ti=0;
|
||||
register float* pinput=(float*)&(input[i+ti]);
|
||||
register float* ptaps=taps;
|
||||
register float* ptaps_end=taps+taps_length;
|
||||
float quad_acciq [8];
|
||||
|
||||
|
||||
/*
|
||||
q0, q1: input signal I sample and Q sample
|
||||
q2: taps
|
||||
q4, q5: accumulator for I branch and Q branch (will be the output)
|
||||
*/
|
||||
|
||||
//fprintf(stderr, "macska\n");
|
||||
|
||||
asm volatile(
|
||||
//" vorr.f32 q4, #0\n\t" //null the accumulators
|
||||
//" vorr.f32 q5, #0\n\t"
|
||||
" vmov.f32 q4, #0.0\n\t" //another way to null the accumulators
|
||||
" vmov.f32 q5, #0.0\n\t"
|
||||
"for_fdccasm: vld2.32 {q0-q1}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in q0 and the Q samples in q1), also increment the memory address in pinput (hence the "!" mark) //http://community.arm.com/groups/processors/blog/2010/03/17/coding-for-neon--part-1-load-and-stores
|
||||
" vld1.32 {q2}, [%[ptaps]]!\n\t"
|
||||
" vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
|
||||
" vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
|
||||
" cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps == ptaps_end)
|
||||
" bcc for_fdccasm\n\t" // then goto for_fdcasm
|
||||
" vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
|
||||
" vst1.32 {q5}, [%[quad_accq]]\n\t"
|
||||
:
|
||||
[pinput]"+r"(pinput), [ptaps]"+r"(ptaps) //output operand list
|
||||
:
|
||||
[ptaps_end]"r"(ptaps_end), [quad_acci]"r"(quad_acciq), [quad_accq]"r"(quad_acciq+4) //input operand list
|
||||
:
|
||||
"memory", "q0", "q1", "q2", "q4", "q5", "cc" //clobber list
|
||||
);
|
||||
//original for loops for reference:
|
||||
//for(int ti=0; ti<taps_length; ti++) acci += (iof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: i loop
|
||||
//for(int ti=0; ti<taps_length; ti++) accq += (qof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: q loop
|
||||
|
||||
//for(int n=0;n<8;n++) fprintf(stderr, "\n>> [%d] %g \n", n, quad_acciq[n]);
|
||||
iof(output,oi)=quad_acciq[0]+quad_acciq[1]+quad_acciq[2]+quad_acciq[3]; //we're still not ready, as we have to add up the contents of a quad accumulator register to get a single accumulated value
|
||||
qof(output,oi)=quad_acciq[4]+quad_acciq[5]+quad_acciq[6]+quad_acciq[7];
|
||||
oi++;
|
||||
}
|
||||
return oi;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
|
||||
{
|
||||
//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
|
||||
|
@ -286,6 +355,34 @@ int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decim
|
|||
return oi;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
|
||||
{
|
||||
//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
|
||||
//It uses real taps. It returns the number of output samples actually written.
|
||||
//It needs overlapping input based on its returned value:
|
||||
//number of processed input samples = returned value * decimation factor
|
||||
//The output buffer should be at least input_length / 3.
|
||||
// i: input index | ti: tap index | oi: output index
|
||||
int oi=0;
|
||||
for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
|
||||
{
|
||||
if(i+taps_length>input_size) break;
|
||||
float acci=0;
|
||||
int taps_halflength = taps_length/2;
|
||||
for(int ti=0; ti<taps_halflength; ti++) acci += (iof(input,i+ti)+iof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: i loop
|
||||
float accq=0;
|
||||
for(int ti=0; ti<taps_halflength; ti++) accq += (qof(input,i+ti)+qof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: q loop
|
||||
iof(output,oi)=acci+iof(input,i+taps_halflength)*taps[taps_halflength];
|
||||
qof(output,oi)=accq+qof(input,i+taps_halflength)*taps[taps_halflength];
|
||||
oi++;
|
||||
}
|
||||
return oi;
|
||||
}
|
||||
*/
|
||||
|
||||
rational_resampler_ff_t rational_resampler_ff(float *input, float *output, int input_size, int interpolation, int decimation, float *taps, int taps_length, int last_taps_delay)
|
||||
{
|
||||
|
||||
|
@ -524,7 +621,8 @@ float fastdcblock_ff(float* input, float* output, int input_size, float last_dc_
|
|||
return avg;
|
||||
}
|
||||
|
||||
#define FASTAGC_MAX_GAIN (65e3)
|
||||
//#define FASTAGC_MAX_GAIN (65e3)
|
||||
#define FASTAGC_MAX_GAIN 50
|
||||
|
||||
void fastagc_ff(fastagc_ff_t* input, float* output)
|
||||
{
|
||||
|
@ -553,6 +651,7 @@ void fastagc_ff(fastagc_ff_t* input, float* output)
|
|||
//we change the gain linearly on the apply_block from the last_gain to target_gain.
|
||||
float target_gain=input->reference/target_peak;
|
||||
if(target_gain>FASTAGC_MAX_GAIN) target_gain=FASTAGC_MAX_GAIN;
|
||||
//fprintf(stderr, "target_gain: %g\n",target_gain);
|
||||
|
||||
for(int i=0;i<input->input_size;i++) //@fastagc_ff: apply gain
|
||||
{
|
||||
|
@ -572,7 +671,6 @@ void fastagc_ff(fastagc_ff_t* input, float* output)
|
|||
//fprintf(stderr,"target_gain=%g\n", target_gain);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
______ __ __ _ _ _ _
|
||||
| ____| \/ | | | | | | | | |
|
||||
|
|
|
@ -189,12 +189,14 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
|
|||
{
|
||||
if(last_peak<input_abs)
|
||||
{
|
||||
|
||||
attack_wait_counter=attack_wait_time;
|
||||
last_peak=input_abs;
|
||||
}
|
||||
if(attack_wait_counter>0)
|
||||
{
|
||||
attack_wait_counter--;
|
||||
//fprintf(stderr,"A");
|
||||
dgain=0;
|
||||
}
|
||||
else
|
||||
|
@ -203,6 +205,7 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
|
|||
dgain=error*attack_rate;
|
||||
//Before starting to increase the gain next time, we will be waiting until hang_time for sure.
|
||||
hang_counter=hang_time;
|
||||
|
||||
}
|
||||
}
|
||||
else //DECREASE IN SIGNAL LEVEL
|
||||
|
@ -222,7 +225,7 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
|
|||
}
|
||||
//output[i]=gain*input[i]; //Here we do the actual scaling of the samples.
|
||||
//Here we do the actual scaling of the samples, but we run an IIR filter on the gain values:
|
||||
output[i]=(gain+last_gain-gain_filter_alpha*last_gain)*input[i]; //dc-pass-filter: freqz([1 -1],[1 -0.99]) y[i]=x[i]+y[i-1]-alpha*x[i-1]
|
||||
output[i]=(gain=gain+last_gain-gain_filter_alpha*last_gain)*input[i]; //dc-pass-filter: freqz([1 -1],[1 -0.99]) y[i]=x[i]+y[i-1]-alpha*x[i-1]
|
||||
//output[i]=input[i]*(last_gain+gain_filter_alpha*(gain-last_gain)); //LPF
|
||||
|
||||
last_gain=gain;
|
||||
|
|
38
sdr.js/sdrjs-test.html
Normal file
38
sdr.js/sdrjs-test.html
Normal file
|
@ -0,0 +1,38 @@
|
|||
<!doctype html>
|
||||
<!--
|
||||
This software is part of libcsdr, a set of simple DSP routines for
|
||||
Software Defined Radio.
|
||||
|
||||
Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
-->
|
||||
<html lang="en-us">
|
||||
<head>
|
||||
<script type="text/javascript" src="sdr.js"></script>
|
||||
</head>
|
||||
<body style="font-family: monospace">
|
||||
<script type="text/javascript">test_firdes_lowpass_f_new();</script>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in a new issue