/*
This software is part of libcsdr, a set of simple DSP routines for
Software Defined Radio.

Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include "libcsdr.h"
#include "predefined.h"
#include <assert.h>

/*
           _           _                   __                  _   _
          (_)         | |                 / _|                | | (_)
 __      ___ _ __   __| | _____      __  | |_ _   _ _ __   ___| |_ _  ___  _ __  ___
 \ \ /\ / / | '_ \ / _` |/ _ \ \ /\ / /  |  _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
  \ V  V /| | | | | (_| | (_) \ V  V /   | | | |_| | | | | (__| |_| | (_) | | | \__ \
   \_/\_/ |_|_| |_|\__,_|\___/ \_/\_/    |_|  \__,_|_| |_|\___|\__|_|\___/|_| |_|___/


*/

#define MFIRDES_GWS(NAME) \
	if(!strcmp( #NAME , input )) return WINDOW_ ## NAME;

window_t firdes_get_window_from_string(char* input)
{
	MFIRDES_GWS(BOXCAR);
	MFIRDES_GWS(BLACKMAN);
	MFIRDES_GWS(HAMMING);
	return WINDOW_DEFAULT;
}

#define MFIRDES_GSW(NAME) \
	if(window == WINDOW_ ## NAME) return #NAME;

char* firdes_get_string_from_window(window_t window)
{
	MFIRDES_GSW(BOXCAR);
	MFIRDES_GSW(BLACKMAN);
	MFIRDES_GSW(HAMMING);
	return "INVALID";
}

float firdes_wkernel_blackman(float rate)
{
	//Explanation at Chapter 16 of dspguide.com, page 2
	//Blackman window has better stopband attentuation and passband ripple than Hamming, but it has slower rolloff.
	rate=0.5+rate/2;
	return 0.42-0.5*cos(2*PI*rate)+0.08*cos(4*PI*rate);
}

float firdes_wkernel_hamming(float rate)
{
	//Explanation at Chapter 16 of dspguide.com, page 2
	//Hamming window has worse stopband attentuation and passband ripple than Blackman, but it has faster rolloff.
	rate=0.5+rate/2;
	return 0.54-0.46*cos(2*PI*rate);
}


float firdes_wkernel_boxcar(float rate)
{	//"Dummy" window kernel, do not use; an unwindowed FIR filter may have bad frequency response
	return 1.0;
}

float (*firdes_get_window_kernel(window_t window))(float)
{
	if(window==WINDOW_HAMMING) return firdes_wkernel_hamming;
	else if(window==WINDOW_BLACKMAN) return firdes_wkernel_blackman;
	else if(window==WINDOW_BOXCAR) return firdes_wkernel_boxcar;
	else return firdes_get_window_kernel(WINDOW_DEFAULT);
}

/*
  ______ _____ _____      __ _ _ _                   _           _
 |  ____|_   _|  __ \    / _(_) | |                 | |         (_)
 | |__    | | | |__) |  | |_ _| | |_ ___ _ __     __| | ___  ___ _  __ _ _ __
 |  __|   | | |  _  /   |  _| | | __/ _ \ '__|   / _` |/ _ \/ __| |/ _` | '_ \
 | |     _| |_| | \ \   | | | | | ||  __/ |     | (_| |  __/\__ \ | (_| | | | |
 |_|    |_____|_|  \_\  |_| |_|_|\__\___|_|      \__,_|\___||___/_|\__, |_| |_|
                                                                    __/ |
                                                                   |___/
*/

void firdes_lowpass_f(float *output, int length, float cutoff_rate, window_t window)
{	//Generates symmetric windowed sinc FIR filter real taps
	//	length should be odd
	//	cutoff_rate is (cutoff frequency/sampling frequency)
	//Explanation at Chapter 16 of dspguide.com
	int middle=length/2;
	float temp;
	float (*window_function)(float)  = firdes_get_window_kernel(window);
	output[middle]=2*PI*cutoff_rate*window_function(0);
	for(int i=1; i<=middle; i++) //@@firdes_lowpass_f: calculate taps
	{
		output[middle-i]=output[middle+i]=(sin(2*PI*cutoff_rate*i)/i)*window_function((float)i/middle);
		//printf("%g %d %d %d %d | %g\n",output[middle-i],i,middle,middle+i,middle-i,sin(2*PI*cutoff_rate*i));
	}

	//Normalize filter kernel
	float sum=0;
	for(int i=0;i<length;i++) //@firdes_lowpass_f: normalize pass 1
	{
		sum+=output[i];
	}
	for(int i=0;i<length;i++) //@firdes_lowpass_f: normalize pass 2
	{
		output[i]/=sum;
	}
}

void firdes_bandpass_c(complexf *output, int length, float lowcut, float highcut, window_t window)
{
	//To generate a complex filter:
	//	1. we generate a real lowpass filter with a bandwidth of highcut-lowcut
	//	2. we shift the filter taps spectrally by multiplying with e^(j*w), so we get complex taps
	//(tnx HA5FT)
	float* realtaps = (float*)malloc(sizeof(float)*length);

	firdes_lowpass_f(realtaps, length, (highcut-lowcut)/2, window);
	float filter_center=(highcut+lowcut)/2;

	float phase=0, sinval, cosval;
	for(int i=0; i<length; i++) //@@firdes_bandpass_c
	{
		cosval=cos(phase);
		sinval=sin(phase);
		phase+=2*PI*filter_center;
		while(phase>2*PI) phase-=2*PI; //@@firdes_bandpass_c
		while(phase<0) phase+=2*PI;
		iof(output,i)=cosval*realtaps[i];
		qof(output,i)=sinval*realtaps[i];
		//output[i] := realtaps[i] * e^j*w
	}
}

int firdes_filter_len(float transition_bw)
{
	int result=4.0/transition_bw;
	if (result%2==0) result++; //number of symmetric FIR filter taps should be odd
	return result;
}

/*
  _____   _____ _____      __                  _   _
 |  __ \ / ____|  __ \    / _|                | | (_)
 | |  | | (___ | |__) |  | |_ _   _ _ __   ___| |_ _  ___  _ __  ___
 | |  | |\___ \|  ___/   |  _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
 | |__| |____) | |       | | | |_| | | | | (__| |_| | (_) | | | \__ \
 |_____/|_____/|_|       |_|  \__,_|_| |_|\___|\__|_|\___/|_| |_|___/

*/

float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
{
	rate*=2;
	//Shifts the complex spectrum. Basically a complex mixer. This version uses cmath.
	float phase=starting_phase;
	float phase_increment=rate*PI;
	float cosval, sinval;
	for(int i=0;i<input_size; i++) //@shift_math_cc
	{
		cosval=cos(phase);
		sinval=sin(phase);
		//we multiply two complex numbers.
		//how? enter this to maxima (software) for explanation:
		//   (a+b*%i)*(c+d*%i), rectform;
		iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
		qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
		phase+=phase_increment;
		while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
		while(phase<0) phase+=2*PI;
	}
	return phase;
}


shift_table_data_t shift_table_init(int table_size)
{
	//RTODO
	shift_table_data_t output;
	output.table=(float*)malloc(sizeof(float)*table_size);
	output.table_size=table_size;
	for(int i=0;i<table_size;i++)
	{
		output.table[i]=sin(((float)i/table_size)*(PI/2));
	}
	return output;
}

void shift_table_deinit(shift_table_data_t table_data)
{
	free(table_data.table);
}

float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
{
	//RTODO
	rate*=2;
	//Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
	float phase=starting_phase;
	float phase_increment=rate*PI;
	float cosval, sinval;
	for(int i=0;i<input_size; i++) //@shift_math_cc
	{
		int sin_index, cos_index, temp_index, sin_sign, cos_sign;
		//float vphase=fmodf(phase,PI/2); //between 0 and 90deg
		int quadrant=phase/(PI/2); //between 0 and 3
		float vphase=phase-quadrant*(PI/2);
		sin_index=(vphase/(PI/2))*table_data.table_size;
		cos_index=table_data.table_size-1-sin_index;
		if(quadrant&1) //in quadrant 1 and 3
		{
			temp_index=sin_index;
			sin_index=cos_index;
			cos_index=temp_index;
		}
		sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
		cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
		sinval=sin_sign*table_data.table[sin_index];
		cosval=cos_sign*table_data.table[cos_index];
		//we multiply two complex numbers.
		//how? enter this to maxima (software) for explanation:
		//   (a+b*%i)*(c+d*%i), rectform;
		iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
		qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
		phase+=phase_increment;
		while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
		while(phase<0) phase+=2*PI;
	}
	return phase;
}


shift_unroll_data_t shift_unroll_init(float rate, int size)
{
	shift_unroll_data_t output;
	output.phase_increment=2*rate*PI;
	output.size = size;
	output.dsin=(float*)malloc(sizeof(float)*size);
	output.dcos=(float*)malloc(sizeof(float)*size);
	float myphase = 0;
	for(int i=0;i<size;i++)
	{
		myphase += output.phase_increment;
		while(myphase>PI) myphase-=2*PI;
		while(myphase<-PI) myphase+=2*PI;		
		output.dsin[i]=sin(myphase);
		output.dcos[i]=cos(myphase);
	}
	return output;	
}

float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
{
	//input_size should be multiple of 4
	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
	float cos_start=cos(starting_phase);
	float sin_start=sin(starting_phase);
	register float cos_val, sin_val;
	for(int i=0;i<input_size; i++) //@shift_unroll_cc
	{
		cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
		sin_val  = sin_start * d->dcos[i] + cos_start * d->dsin[i];
		iof(output,i)=cos_val*iof(input,i)-sin_val*qof(input,i);
		qof(output,i)=sin_val*iof(input,i)+cos_val*qof(input,i);
	}
	starting_phase+=input_size*d->phase_increment;
	while(starting_phase>PI) starting_phase-=2*PI;
	while(starting_phase<-PI) starting_phase+=2*PI;
	return starting_phase;
}

shift_addfast_data_t shift_addfast_init(float rate)
{
	shift_addfast_data_t output;
	output.phase_increment=2*rate*PI;
	for(int i=0;i<4;i++)
	{
		output.dsin[i]=sin(output.phase_increment*(i+1));
		output.dcos[i]=cos(output.phase_increment*(i+1));
	}
	return output;
}

#ifdef NEON_OPTS
#pragma message "Manual NEON optimizations are ON: we have a faster shift_addfast_cc now."

float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{
	//input_size should be multiple of 4
	float cos_start[4], sin_start[4];
	float cos_vals[4], sin_vals[4];
	for(int i=0;i<4;i++) 
	{
		cos_start[i] = cos(starting_phase);
		sin_start[i] = sin(starting_phase);
	}

	float* pdcos = d->dcos;
	float* pdsin = d->dsin;
	register float* pinput = (float*)input;
	register float* pinput_end = (float*)(input+input_size);
	register float* poutput = (float*)output;

	//Register map:
	#define RDCOS "q0" //dcos, dsin
	#define RDSIN "q1"
	#define RCOSST "q2" //cos_start, sin_start
	#define RSINST "q3"
	#define RCOSV "q4" //cos_vals, sin_vals
	#define RSINV "q5"
	#define ROUTI "q6" //output_i, output_q
	#define ROUTQ "q7" 
	#define RINPI "q8" //input_i, input_q
	#define RINPQ "q9"
	#define R3(x,y,z) x ", " y ", " z "\n\t"

	asm volatile( //(the range of q is q0-q15)
		"		vld1.32	{" RDCOS "}, [%[pdcos]]\n\t"
		"		vld1.32	{" RDSIN "}, [%[pdsin]]\n\t"
		"		vld1.32	{" RCOSST "}, [%[cos_start]]\n\t"
		"		vld1.32	{" RSINST "}, [%[sin_start]]\n\t"
		"for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in RINPI and the Q samples in RINPQ), also increment the memory address in pinput (hence the "!" mark) 

		//C version:
		//cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
		//sin_vals[j] = sin_start * d->dcos[j] + cos_start * d->dsin[j];

		"		vmul.f32 " R3(RCOSV, RCOSST, RDCOS)  //cos_vals[i] = cos_start * d->dcos[i]
		"		vmls.f32 " R3(RCOSV, RSINST, RDSIN)  //cos_vals[i] -= sin_start * d->dsin[i]
		"		vmul.f32 " R3(RSINV, RSINST, RDCOS)  //sin_vals[i] = sin_start * d->dcos[i]
		"		vmla.f32 " R3(RSINV, RCOSST, RDSIN)  //sin_vals[i] += cos_start * d->dsin[i]

		//C version:
		//iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
		//qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);	
		"		vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output_i =  cos_vals * input_i
		"		vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output_i -= sin_vals * input_q
		"		vmul.f32 " R3(ROUTQ, RSINV, RINPI) //output_q =  sin_vals * input_i
		"		vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //output_i += cos_vals * input_q

		"		vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory
		//"		add %[poutput],%[poutput],#32\n\t"
		"		vdup.32 " RCOSST ", d9[1]\n\t" // cos_start[0-3] = cos_vals[3]
		"		vdup.32 " RSINST ", d11[1]\n\t" // sin_start[0-3] = sin_vals[3]

		"		cmp %[pinput], %[pinput_end]\n\t" //if(pinput != pinput_end)
		"		bcc for_addfast\n\t"			  //	then goto for_addfast
	:
		[pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM
	:
		[pinput_end]"r"(pinput_end), [pdcos]"r"(pdcos), [pdsin]"r"(pdsin), [sin_start]"r"(sin_start), [cos_start]"r"(cos_start) //input operand list
	: 
		"memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list
	);
	starting_phase+=input_size*d->phase_increment;
	while(starting_phase>PI) starting_phase-=2*PI;
	while(starting_phase<-PI) starting_phase+=2*PI;
	return starting_phase;
}

#else


#if 1

#define SADF_L1(j) cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
	sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
#define SADF_L2(j) iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
	qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);

float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{
	//input_size should be multiple of 4
	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
	float cos_start=cos(starting_phase);
	float sin_start=sin(starting_phase);
	float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
		sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, 
		dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
		dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];

	for(int i=0;i<input_size/4; i++) //@shift_addfast_cc
	{
		SADF_L1(0)
		SADF_L1(1)
		SADF_L1(2)
		SADF_L1(3)
		SADF_L2(0)
		SADF_L2(1)
		SADF_L2(2)
		SADF_L2(3)
		cos_start = cos_vals_3;
		sin_start = sin_vals_3;
	}
	starting_phase+=input_size*d->phase_increment;
	while(starting_phase>PI) starting_phase-=2*PI;
	while(starting_phase<-PI) starting_phase+=2*PI;
	return starting_phase;
}
#else
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{
	//input_size should be multiple of 4
	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
	float cos_start=cos(starting_phase);
	float sin_start=sin(starting_phase);
	float cos_vals[4], sin_vals[4];
	for(int i=0;i<input_size/4; i++) //@shift_addfast_cc
	{
		for(int j=0;j<4;j++) //@shift_addfast_cc
		{
			cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
			sin_vals[j] = sin_start * d->dcos[j] + cos_start * d->dsin[j];
		}
		for(int j=0;j<4;j++) //@shift_addfast_cc
		{
			iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
			qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);
		}
		cos_start = cos_vals[3];
		sin_start = sin_vals[3];
	}
	starting_phase+=input_size*d->phase_increment;
	while(starting_phase>PI) starting_phase-=2*PI;
	while(starting_phase<-PI) starting_phase+=2*PI;
	return starting_phase;
}
#endif

#endif

#ifdef NEON_OPTS
#pragma message "Manual NEON optimizations are ON: we have a faster fir_decimate_cc now."

//max help: http://community.arm.com/groups/android-community/blog/2015/03/27/arm-neon-programming-quick-reference

int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
{
	//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
	//It uses real taps. It returns the number of output samples actually written.
	//It needs overlapping input based on its returned value:
	//number of processed input samples = returned value * decimation factor
	//The output buffer should be at least input_length / 3.
	// i: input index | ti: tap index | oi: output index
	int oi=0;
	for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
	{
		if(i+taps_length>input_size) break;
		register float acci=0;
		register float accq=0;

		register int ti=0;
		register float* pinput=(float*)&(input[i+ti]);
		register float* ptaps=taps;
		register float* ptaps_end=taps+taps_length;
		float quad_acciq [8];


/*
q0, q1:	input signal I sample and Q sample
q2:		taps
q4, q5: accumulator for I branch and Q branch (will be the output)
*/

		asm volatile(
			"		vmov.f32 q4, #0.0\n\t" //another way to null the accumulators
			"		vmov.f32 q5, #0.0\n\t"
			"for_fdccasm: vld2.32	{q0-q1}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in q0 and the Q samples in q1), also increment the memory address in pinput (hence the "!" mark) //http://community.arm.com/groups/processors/blog/2010/03/17/coding-for-neon--part-1-load-and-stores
			"		vld1.32	{q2}, [%[ptaps]]!\n\t"
			"		vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
			"		vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
			"		cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps != ptaps_end)
			"		bcc for_fdccasm\n\t"			//	then goto for_fdcasm
			"		vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
			"		vst1.32 {q5}, [%[quad_accq]]\n\t"
		:
			[pinput]"+r"(pinput), [ptaps]"+r"(ptaps) //output operand list
		:
			[ptaps_end]"r"(ptaps_end), [quad_acci]"r"(quad_acciq), [quad_accq]"r"(quad_acciq+4) //input operand list
		:
			"memory", "q0", "q1", "q2", "q4", "q5", "cc" //clobber list
		);
		//original for loops for reference:
		//for(int ti=0; ti<taps_length; ti++) acci += (iof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: i loop
		//for(int ti=0; ti<taps_length; ti++) accq += (qof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: q loop

		//for(int n=0;n<8;n++) fprintf(stderr, "\n>> [%d] %g \n", n, quad_acciq[n]);
		iof(output,oi)=quad_acciq[0]+quad_acciq[1]+quad_acciq[2]+quad_acciq[3]; //we're still not ready, as we have to add up the contents of a quad accumulator register to get a single accumulated value
		qof(output,oi)=quad_acciq[4]+quad_acciq[5]+quad_acciq[6]+quad_acciq[7];
		oi++;
	}
	return oi;
}

#else

int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
{
	//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
	//It uses real taps. It returns the number of output samples actually written.
	//It needs overlapping input based on its returned value:
	//number of processed input samples = returned value * decimation factor
	//The output buffer should be at least input_length / 3.
	// i: input index | ti: tap index | oi: output index
	int oi=0;
	for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
	{
		if(i+taps_length>input_size) break;
		float acci=0;
		for(int ti=0; ti<taps_length; ti++) acci += (iof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: i loop
		float accq=0;
		for(int ti=0; ti<taps_length; ti++) accq += (qof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: q loop
		iof(output,oi)=acci;
		qof(output,oi)=accq;
		oi++;
	}
	return oi;
}

#endif

/*
int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
{
	//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
	//It uses real taps. It returns the number of output samples actually written.
	//It needs overlapping input based on its returned value:
	//number of processed input samples = returned value * decimation factor
	//The output buffer should be at least input_length / 3.
	// i: input index | ti: tap index | oi: output index
	int oi=0;
	for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
	{
		if(i+taps_length>input_size) break;
		float acci=0;
		int taps_halflength = taps_length/2;
		for(int ti=0; ti<taps_halflength; ti++) acci += (iof(input,i+ti)+iof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: i loop
		float accq=0;
		for(int ti=0; ti<taps_halflength; ti++) accq += (qof(input,i+ti)+qof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: q loop
		iof(output,oi)=acci+iof(input,i+taps_halflength)*taps[taps_halflength];
		qof(output,oi)=accq+qof(input,i+taps_halflength)*taps[taps_halflength];
		oi++;
	}
	return oi;
}
*/

rational_resampler_ff_t rational_resampler_ff(float *input, float *output, int input_size, int interpolation, int decimation, float *taps, int taps_length, int last_taps_delay)
{

	//Theory: http://www.dspguru.com/dsp/faqs/multirate/resampling
	//oi: output index, i: tap index
	int output_size=input_size*interpolation/decimation;
	int oi;
	int startingi, delayi;
	//fprintf(stderr,"rational_resampler_ff | interpolation = %d | decimation = %d\ntaps_length = %d | input_size = %d | output_size = %d | last_taps_delay = %d\n",interpolation,decimation,taps_length,input_size,output_size,last_taps_delay);
	for (oi=0; oi<output_size; oi++) //@rational_resampler_ff (outer loop)
	{
		float acc=0;
		startingi=(oi*decimation+interpolation-1-last_taps_delay)/interpolation; //index of first input item to apply FIR on
		//delayi=startingi*interpolation-oi*decimation; //delay on FIR taps
		delayi=(last_taps_delay+startingi*interpolation-oi*decimation)%interpolation; //delay on FIR taps
		if(startingi+taps_length/interpolation+1>input_size) break; //we can't compute the FIR filter to some input samples at the end
		//fprintf(stderr,"outer loop | oi = %d | startingi = %d | taps delay = %d\n",oi,startingi,delayi);
		for(int i=0; i<(taps_length-delayi)/interpolation; i++)	//@rational_resampler_ff (inner loop)
		{
			//fprintf(stderr,"inner loop | input index = %d | tap index = %d | acc = %g\n",startingi+ii,i,acc);
			acc+=input[startingi+i]*taps[delayi+i*interpolation];
		}
		output[oi]=acc*interpolation;
	}
	rational_resampler_ff_t d;
	d.input_processed=startingi;
	d.output_size=oi;
	d.last_taps_delay=delayi;
	return d;
}

/*

The greatest challenge in resampling is figuring out which tap should be applied to which sample.

Typical test patterns for rational_resampler_ff:

interpolation = 3, decimation = 1
values of [oi, startingi, taps delay] in the outer loop should be:
0 0 0
1 1 2
2 1 1
3 1 0
4 2 2
5 2 1

interpolation = 3, decimation = 2
values of [oi, startingi, taps delay] in the outer loop should be:
0 0 0
1 1 1
2 2 2
3 2 0
4 3 1
5 4 2

*/


void rational_resampler_get_lowpass_f(float* output, int output_size, int interpolation, int decimation, window_t window)
{

	//See 4.1.6 at: http://www.dspguru.com/dsp/faqs/multirate/resampling
	float cutoff_for_interpolation=1.0/interpolation;
	float cutoff_for_decimation=1.0/decimation;
	float cutoff = (cutoff_for_interpolation<cutoff_for_decimation)?cutoff_for_interpolation:cutoff_for_decimation; //get the lower
	firdes_lowpass_f(output, output_size, cutoff/2, window);
}

float inline fir_one_pass_ff(float* input, float* taps, int taps_length)
{
	float acc=0;
	for(int i=0;i<taps_length;i++) acc+=taps[i]*input[i]; //@fir_one_pass_ff
	return acc;
}

old_fractional_decimator_ff_t old_fractional_decimator_ff(float* input, float* output, int input_size, float rate, float *taps, int taps_length, old_fractional_decimator_ff_t d)
{
	if(rate<=1.0) return d; //sanity check, can't decimate <=1.0
	//This routine can handle floating point decimation rates.
	//It linearly interpolates between two samples that are taken into consideration from the filtered input.
	int oi=0;
	int index_high;
	float where=d.remain;
	float result_high, result_low;
	if(where==0.0) //in the first iteration index_high may be zero (so using the item index_high-1 would lead to invalid memory access).
	{
		output[oi++]=fir_one_pass_ff(input,taps,taps_length);
		where+=rate;
	}

	int previous_index_high=-1;
	//we optimize to calculate ceilf(where) only once every iteration, so we do it here:
	for(;(index_high=ceilf(where))+taps_length<input_size;where+=rate) //@fractional_decimator_ff
	{
		if(previous_index_high==index_high-1) result_low=result_high; //if we step less than 2.0 then we do already have the result for the FIR filter for that index
		else result_low=fir_one_pass_ff(input+index_high-1,taps,taps_length);
		result_high=fir_one_pass_ff(input+index_high,taps,taps_length);
		float register rate_between_samples=where-index_high+1;
		output[oi++]=result_low*(1-rate_between_samples)+result_high*rate_between_samples;
		previous_index_high=index_high;
	}

	d.input_processed=index_high-1;
	d.remain=where-d.input_processed;
	d.output_size=oi;
	return d;
}

fractional_decimator_ff_t fractional_decimator_ff_init(float rate, int num_poly_points, float* taps, int taps_length)
{
	fractional_decimator_ff_t d;
	d.num_poly_points = num_poly_points&~1; //num_poly_points needs to be even!
	d.poly_precalc_denomiator = (float*)malloc(d.num_poly_points*sizeof(float));
	//x0..x3
	//-1,0,1,2
	//-(4/2)+1
	//x0..x5
	//-2,-1,0,1,2,3
	d.xifirst=-(num_poly_points/2)+1, d.xilast=num_poly_points/2;
	int id = 0; //index in poly_precalc_denomiator
	for(int xi=d.xifirst;xi<=d.xilast;xi++)
	{
		d.poly_precalc_denomiator[id]=1;
		for(int xj=d.xifirst;xj<=d.xilast;xj++)
		{
			if(xi!=xj) d.poly_precalc_denomiator[id] *= (xi-xj); //poly_precalc_denomiator could be integer as well. But that would later add a necessary conversion.
		}
		id++;
	}
	d.where=-d.xifirst;
	d.coeffs_buf=(float*)malloc(d.num_poly_points*sizeof(float)); 
	d.filtered_buf=(float*)malloc(d.num_poly_points*sizeof(float)); 
	//d.last_inputs_circbuf = (float)malloc(d.num_poly_points*sizeof(float));
	//d.last_inputs_startsat = 0; 
	//d.last_inputs_samplewhere = -1;
	//for(int i=0;i<num_poly_points; i++) d.last_inputs_circbuf[i] = 0;
	d.rate = rate;
	d.taps = taps;
	d.taps_length = taps_length;
	return d;
}

#define DEBUG_ASSERT 1
void fractional_decimator_ff(float* input, float* output, int input_size, fractional_decimator_ff_t* d)
{
	//This routine can handle floating point decimation rates.
	//It applies polynomial interpolation to samples that are taken into consideration from a pre-filtered input.
	//The pre-filter can be switched off by applying taps=NULL.
	//fprintf(stderr, "drate=%f\n", d->rate);
	if(DEBUG_ASSERT) assert(d->rate > 1.0); 
	if(DEBUG_ASSERT) assert(d->where >= -d->xifirst);
	int oi=0; //output index
	int index_high; 
#define FD_INDEX_LOW (index_high-1)
	//we optimize to calculate ceilf(where) only once every iteration, so we do it here:
	for(;(index_high=ceilf(d->where))+d->taps_length<input_size;d->where+=d->rate) //@fractional_decimator_ff
	{
		int sxifirst = FD_INDEX_LOW + d->xifirst; 
		int sxilast = FD_INDEX_LOW + d->xilast; 
		if(d->taps) for(int wi=0;wi<d->num_poly_points;wi++) d->filtered_buf[wi] = fir_one_pass_ff(input+FD_INDEX_LOW+wi, d->taps, d->taps_length);
		else for(int wi=0;wi<d->num_poly_points;wi++) d->filtered_buf[wi] = *(input+FD_INDEX_LOW+wi);
		int id=0;
		float xwhere = d->where - FD_INDEX_LOW;
		for(int xi=d->xifirst;xi<=d->xilast;xi++)
		{
			d->coeffs_buf[id]=1;
			for(int xj=d->xifirst;xj<=d->xilast;xj++)
			{
				if(xi!=xj) d->coeffs_buf[id] *= (xwhere-xj);
			}
			id++;		
		}
		float acc = 0;
		for(int i=0;i<d->num_poly_points;i++)
		{
			acc += (d->coeffs_buf[i]/d->poly_precalc_denomiator[i])*d->filtered_buf[i];  //(xnom/xden)*yn
		}
		output[oi++]=acc;
	}
	d->input_processed = FD_INDEX_LOW + d->xifirst;
	d->where -= d->input_processed + d->xifirst + 1;
	d->output_size = oi;
}

/*
		int last_input_samplewhere_shouldbe = (index_high-1)+xifirst;
		int last_input_offset = last_input_samplewhere_shouldbe - d->last_input_samplewhere;
		if(last_input_offset < num_poly_points)
		{
			//if we can move the last_input circular buffer, we move, and add the new samples at the end
			d->last_inputs_startsat += last_input_offset;
			d->last_inputs_startsat %= num_poly_points;
			int num_copied_samples = 0;
			for(int i=0; i<last_input_offset; i++)
			{
				d->last_inputs_circbuf[i]=
			}
			d->last_input_samplewhere = d->las
		}
*/

void apply_fir_fft_cc(FFT_PLAN_T* plan, FFT_PLAN_T* plan_inverse, complexf* taps_fft, complexf* last_overlap, int overlap_size)
{
	//use the overlap & add method for filtering

	//calculate FFT on input buffer
	fft_execute(plan);

	//multiply the filter and the input
	complexf* in = plan->output;
	complexf* out = plan_inverse->input;

	for(int i=0;i<plan->size;i++) //@apply_fir_fft_cc: multiplication
	{
		iof(out,i)=iof(in,i)*iof(taps_fft,i)-qof(in,i)*qof(taps_fft,i);
		qof(out,i)=iof(in,i)*qof(taps_fft,i)+qof(in,i)*iof(taps_fft,i);
	}

	//calculate inverse FFT on multiplied buffer
	fft_execute(plan_inverse);

	//add the overlap of the previous segment
	complexf* result = plan_inverse->output;

	for(int i=0;i<plan->size;i++) //@apply_fir_fft_cc: normalize by fft_size
	{
		iof(result,i)/=plan->size;
		qof(result,i)/=plan->size;
	}

	for(int i=0;i<overlap_size;i++) //@apply_fir_fft_cc: add overlap
	{
		iof(result,i)=iof(result,i)+iof(last_overlap,i);
		qof(result,i)=qof(result,i)+qof(last_overlap,i);
	}

}

/*
           __  __       _                          _       _       _
     /\   |  \/  |     | |                        | |     | |     | |
    /  \  | \  / |   __| | ___ _ __ ___   ___   __| |_   _| | __ _| |_ ___  _ __ ___
   / /\ \ | |\/| |  / _` |/ _ \ '_ ` _ \ / _ \ / _` | | | | |/ _` | __/ _ \| '__/ __|
  / ____ \| |  | | | (_| |  __/ | | | | | (_) | (_| | |_| | | (_| | || (_) | |  \__ \
 /_/    \_\_|  |_|  \__,_|\___|_| |_| |_|\___/ \__,_|\__,_|_|\__,_|\__\___/|_|  |___/

*/

void amdemod_cf(complexf* input, float *output, int input_size)
{
	//@amdemod: i*i+q*q
	for (int i=0; i<input_size; i++)
	{
		output[i]=iof(input,i)*iof(input,i)+qof(input,i)*qof(input,i);
	}
	//@amdemod: sqrt
	for (int i=0; i<input_size; i++)
	{
		output[i]=sqrt(output[i]);
	}
}

void amdemod_estimator_cf(complexf* input, float *output, int input_size, float alpha, float beta)
{
	//concept is explained here:
	//http://www.dspguru.com/dsp/tricks/magnitude-estimator

	//default: optimize for min RMS error
	if(alpha==0)
	{
		alpha=0.947543636291;
		beta=0.392485425092;
	}

	//@amdemod_estimator
	for (int i=0; i<input_size; i++)
	{
		float abs_i=iof(input,i);
		if(abs_i<0) abs_i=-abs_i;
		float abs_q=qof(input,i);
		if(abs_q<0) abs_q=-abs_q;
		float max_iq=abs_i;
		if(abs_q>max_iq) max_iq=abs_q;
		float min_iq=abs_i;
		if(abs_q<min_iq) min_iq=abs_q;

		output[i]=alpha*max_iq+beta*min_iq;
	}
}

dcblock_preserve_t dcblock_ff(float* input, float* output, int input_size, float a, dcblock_preserve_t preserved)
{
	//after AM demodulation, a DC blocking filter should be used to remove the DC component from the signal.
	//Concept: http://peabody.sapp.org/class/dmp2/lab/dcblock/
	//output size equals to input_size;
	//preserve can be initialized to zero on first run.
	if(a==0) a=0.999; //default value, simulate in octave: freqz([1 -1],[1 -0.99])
	output[0]=input[0]-preserved.last_input+a*preserved.last_output;
	for(int i=1; i<input_size; i++) //@dcblock_f
	{
		output[i]=input[i]-input[i-1]+a*output[i-1];
	}
	preserved.last_input=input[input_size-1];
	preserved.last_output=output[input_size-1];
	return preserved;
}

float fastdcblock_ff(float* input, float* output, int input_size, float last_dc_level)
{
	//this DC block filter does moving average block-by-block.
	//this is the most computationally efficient
	//input and output buffer is allowed to be the same
	//http://www.digitalsignallabs.com/dcblock.pdf
	float avg=0.0;
	for(int i=0;i<input_size;i++) //@fastdcblock_ff: calculate block average
	{
		avg+=input[i];
	}
	avg/=input_size;

	float avgdiff=avg-last_dc_level;
	//DC removal level will change lineraly from last_dc_level to avg.
	for(int i=0;i<input_size;i++) //@fastdcblock_ff: remove DC component
	{
		float dc_removal_level=last_dc_level+avgdiff*((float)i/input_size);
		output[i]=input[i]-dc_removal_level;
	}
	return avg;
}

//#define FASTAGC_MAX_GAIN (65e3)
#define FASTAGC_MAX_GAIN 50

void fastagc_ff(fastagc_ff_t* input, float* output)
{
	//Gain is processed on blocks of samples.
	//You have to supply three blocks of samples before the first block comes out.
	//AGC reaction speed equals input_size*samp_rate*2

	//The algorithm calculates target gain at the end of the first block out of the peak value of all the three blocks.
	//This way the gain change can easily react if there is any peak in the third block.
	//Pros: can be easily speeded up with loop vectorization, easy to implement
	//Cons: needs 3 buffers, dos not behave similarly to real AGC circuits

	//Get the peak value of new input buffer
	float peak_input=0;
	for(int i=0;i<input->input_size;i++) //@fastagc_ff: peak search
	{
		float val=fabs(input->buffer_input[i]);
		if(val>peak_input) peak_input=val;
	}

	//Determine the maximal peak out of the three blocks
	float target_peak=peak_input;
	if(target_peak<input->peak_2) target_peak=input->peak_2;
	if(target_peak<input->peak_1) target_peak=input->peak_1;

	//we change the gain linearly on the apply_block from the last_gain to target_gain.
	float target_gain=input->reference/target_peak;
	if(target_gain>FASTAGC_MAX_GAIN) target_gain=FASTAGC_MAX_GAIN;
	//fprintf(stderr, "target_gain: %g\n",target_gain);

	for(int i=0;i<input->input_size;i++) //@fastagc_ff: apply gain
	{
		float rate=(float)i/input->input_size;
		float gain=input->last_gain*(1.0-rate)+target_gain*rate;
		output[i]=input->buffer_1[i]*gain;
	}

	//Shift the three buffers
	float* temp_pointer=input->buffer_1;
	input->buffer_1=input->buffer_2;
	input->peak_1=input->peak_2;
	input->buffer_2=input->buffer_input;
	input->peak_2=peak_input;
	input->buffer_input=temp_pointer;
	input->last_gain=target_gain;
	//fprintf(stderr,"target_gain=%g\n", target_gain);
}

/*
  ______ __  __        _                          _       _       _
 |  ____|  \/  |      | |                        | |     | |     | |
 | |__  | \  / |    __| | ___ _ __ ___   ___   __| |_   _| | __ _| |_ ___  _ __ ___
 |  __| | |\/| |   / _` |/ _ \ '_ ` _ \ / _ \ / _` | | | | |/ _` | __/ _ \| '__/ __|
 | |    | |  | |  | (_| |  __/ | | | | | (_) | (_| | |_| | | (_| | || (_) | |  \__ \
 |_|    |_|  |_|   \__,_|\___|_| |_| |_|\___/ \__,_|\__,_|_|\__,_|\__\___/|_|  |___/

*/


float fmdemod_atan_cf(complexf* input, float *output, int input_size, float last_phase)
{
	//GCC most likely won't vectorize nor atan, nor atan2.
	//For more comments, look at: https://github.com/simonyiszk/minidemod/blob/master/minidemod-wfm-atan.c
	float phase, dphase;
	for (int i=0; i<input_size; i++) //@fmdemod_atan_novect
	{
		phase=argof(input,i);
		dphase=phase-last_phase;
		if(dphase<-PI) dphase+=2*PI;
		if(dphase>PI) dphase-=2*PI;
		output[i]=dphase/PI;
		last_phase=phase;
	}
	return last_phase;
}

#define fmdemod_quadri_K 0.340447550238101026565118445432744920253753662109375
//this constant ensures proper scaling for qa_fmemod testcases for SNR calculation and more.

complexf fmdemod_quadri_novect_cf(complexf* input, float* output, int input_size, complexf last_sample)
{
	output[0]=fmdemod_quadri_K*(iof(input,0)*(qof(input,0)-last_sample.q)-qof(input,0)*(iof(input,0)-last_sample.i))/(iof(input,0)*iof(input,0)+qof(input,0)*qof(input,0));
	for (int i=1; i<input_size; i++) //@fmdemod_quadri_novect_cf
	{
		float qnow=qof(input,i);
		float qlast=qof(input,i-1);
		float inow=iof(input,i);
		float ilast=iof(input,i-1);
		output[i]=fmdemod_quadri_K*(inow*(qnow-qlast)-qnow*(inow-ilast))/(inow*inow+qnow*qnow);
		//TODO: expression can be simplified as: (qnow*ilast-inow*qlast)/(inow*inow+qnow*qnow)
	}
	return input[input_size-1];
}


complexf fmdemod_quadri_cf(complexf* input, float* output, int input_size, float *temp, complexf last_sample)
{
	float* temp_dq=temp;
	float* temp_di=temp+input_size;

	temp_dq[0]=qof(input,0)-last_sample.q;
	for (int i=1; i<input_size; i++) //@fmdemod_quadri_cf: dq
	{
		temp_dq[i]=qof(input,i)-qof(input,i-1);
	}

	temp_di[0]=iof(input,0)-last_sample.i;
	for (int i=1; i<input_size; i++) //@fmdemod_quadri_cf: di
	{
		temp_di[i]=iof(input,i)-iof(input,i-1);
	}

	for (int i=0; i<input_size; i++) //@fmdemod_quadri_cf: output numerator
	{
		output[i]=(iof(input,i)*temp_dq[i]-qof(input,i)*temp_di[i]);
	}
	for (int i=0; i<input_size; i++) //@fmdemod_quadri_cf: output denomiator
	{
		temp[i]=iof(input,i)*iof(input,i)+qof(input,i)*qof(input,i);
	}
	for (int i=0; i<input_size; i++) //@fmdemod_quadri_cf: output division
	{
		output[i]=(temp[i])?fmdemod_quadri_K*output[i]/temp[i]:0;
	}

	return input[input_size-1];
}

inline int is_nan(float f)
{
	//http://stackoverflow.com/questions/570669/checking-if-a-double-or-float-is-nan-in-c
    unsigned u = *(unsigned*)&f;
    return (u&0x7F800000) == 0x7F800000 && (u&0x7FFFFF); // Both NaN and qNan.
}


float deemphasis_wfm_ff (float* input, float* output, int input_size, float tau, int sample_rate, float last_output)
{
	/*
		typical time constant (tau) values:
		WFM transmission in USA: 75 us -> tau = 75e-6
		WFM transmission in EU:  50 us -> tau = 50e-6
		More info at: http://www.cliftonlaboratories.com/fm_receivers_and_de-emphasis.htm
		Simulate in octave: tau=75e-6; dt=1/48000; alpha = dt/(tau+dt); freqz([alpha],[1 -(1-alpha)])
	*/
	float dt = 1.0/sample_rate;
	float alpha = dt/(tau+dt);
	if(is_nan(last_output)) last_output=0.0; //if last_output is NaN
	output[0]=alpha*input[0]+(1-alpha)*last_output;
	for (int i=1;i<input_size;i++) //@deemphasis_wfm_ff
       output[i]=alpha*input[i]+(1-alpha)*output[i-1]; //this is the simplest IIR LPF
   	return output[input_size-1];
}

#define DNFMFF_ADD_ARRAY(x) if(sample_rate==x) { taps=deemphasis_nfm_predefined_fir_##x; taps_length=sizeof(deemphasis_nfm_predefined_fir_##x)/sizeof(float); }

int deemphasis_nfm_ff (float* input, float* output, int input_size, int sample_rate)
{
	/*
		Warning! This only works on predefined samplerates, as it uses fixed FIR coefficients defined in predefined.h
		However, there are the octave commands to generate the taps for your custom (fixed) sample rate.
		What it does:
			- reject below 400 Hz
			- passband between 400 HZ - 4 kHz, but with 20 dB/decade rolloff (for deemphasis)
			- reject everything above 4 kHz
	*/
	float* taps;
	int taps_length=0;

	DNFMFF_ADD_ARRAY(48000)
	DNFMFF_ADD_ARRAY(44100)
	DNFMFF_ADD_ARRAY(8000)
	DNFMFF_ADD_ARRAY(11025)

	if(!taps_length) return 0; //sample rate n
	int i;
	for(i=0;i<input_size-taps_length;i++) //@deemphasis_nfm_ff: outer loop
	{
		float acc=0;
		for(int ti=0;ti<taps_length;ti++) acc+=taps[ti]*input[i+ti]; //@deemphasis_nfm_ff: inner loop
		output[i]=acc;
	}
	return i; //number of samples processed (and output samples)
}

void limit_ff(float* input, float* output, int input_size, float max_amplitude)
{
	for (int i=0; i<input_size; i++) //@limit_ff
	{
		output[i]=(max_amplitude<input[i])?max_amplitude:input[i];
		output[i]=(-max_amplitude>output[i])?-max_amplitude:output[i];
	}
}

void gain_ff(float* input, float* output, int input_size, float gain)
{
	for(int i=0;i<input_size;i++) output[i]=gain*input[i]; //@gain_ff
}

float get_power_f(float* input, int input_size, int decimation)
{
  float acc = 0;
  for(int i=0;i<input_size;i+=decimation)
  {
    acc += (input[i]*input[i])/input_size;
  }
  return acc;
}

float get_power_c(complexf* input, int input_size, int decimation)
{
  float acc = 0;
  for(int i=0;i<input_size;i+=decimation)
  {
    acc += (iof(input,i)*iof(input,i)+qof(input,i)*qof(input,i))/input_size;
  }
  return acc;
}

/*
  __  __           _       _       _
 |  \/  |         | |     | |     | |
 | \  / | ___   __| |_   _| | __ _| |_ ___  _ __ ___
 | |\/| |/ _ \ / _` | | | | |/ _` | __/ _ \| '__/ __|
 | |  | | (_) | (_| | |_| | | (_| | || (_) | |  \__ \
 |_|  |_|\___/ \__,_|\__,_|_|\__,_|\__\___/|_|  |___/

*/

void add_dcoffset_cc(complexf* input, complexf* output, int input_size)
{
	for(int i=0;i<input_size;i++) iof(output,i)=0.5+iof(input,i)/2;
	for(int i=0;i<input_size;i++) qof(output,i)=qof(input,i)/2;
}

float fmmod_fc(float* input, complexf* output, int input_size, float last_phase)
{
	float phase=last_phase;
	for(int i=0;i<input_size;i++)
	{
		phase+=input[i]*PI;
		while(phase>PI) phase-=2*PI;
		while(phase<=-PI) phase+=2*PI;
		iof(output,i)=cos(phase);
		qof(output,i)=sin(phase);
	}
	return phase;
}

void fixed_amplitude_cc(complexf* input, complexf* output, int input_size, float new_amplitude)
{
	for(int i=0;i<input_size;i++)
	{
		//float phase=atan2(iof(input,i),qof(input,i));
		//iof(output,i)=cos(phase)*amp;
		//qof(output,i)=sin(phase)*amp;

		//A faster solution:
		float amplitude_now = sqrt(iof(input,i)*iof(input,i)+qof(input,i)*qof(input,i));
		float gain = (amplitude_now > 0) ? new_amplitude / amplitude_now : 0;
		iof(output,i)=iof(input,i)*gain;
		qof(output,i)=qof(input,i)*gain;
	}
}

/*
  ______        _     ______               _             _______                   __
 |  ____|      | |   |  ____|             (_)           |__   __|                 / _|
 | |__ __ _ ___| |_  | |__ ___  _   _ _ __ _  ___ _ __     | |_ __ __ _ _ __  ___| |_ ___  _ __ _ __ ___
 |  __/ _` / __| __| |  __/ _ \| | | | '__| |/ _ \ '__|    | | '__/ _` | '_ \/ __|  _/ _ \| '__| '_ ` _ \
 | | | (_| \__ \ |_  | | | (_) | |_| | |  | |  __/ |       | | | | (_| | | | \__ \ || (_) | |  | | | | | |
 |_|  \__,_|___/\__| |_|  \___/ \__,_|_|  |_|\___|_|       |_|_|  \__,_|_| |_|___/_| \___/|_|  |_| |_| |_|

*/

int log2n(int x)
{
	int result=-1;
	for(int i=0;i<31;i++)
	{
		if((x>>i)&1) //@@log2n
		{
			if (result==-1) result=i;
			else return -1;
		}
	}
	return result;
}

int next_pow2(int x)
{
	int pow2;
	//portability? (31 is the problem)
	for(int i=0;i<31;i++)
	{
		if(x<(pow2=1<<i)) return pow2; //@@next_pow2
	}
	return -1;
}

void apply_window_c(complexf* input, complexf* output, int size, window_t window)
{
	float (*window_function)(float)=firdes_get_window_kernel(window);
	for(int i=0;i<size;i++) //@apply_window_c
	{
		float rate=(float)i/(size-1);
		iof(output,i)=iof(input,i)*window_function(2.0*rate+1.0);
		qof(output,i)=qof(input,i)*window_function(2.0*rate+1.0);
	}
}

float *precalculate_window(int size, window_t window)
{
	float (*window_function)(float)=firdes_get_window_kernel(window);
	float *windowt;
	windowt = malloc(sizeof(float) * size);
	for(int i=0;i<size;i++) //@precalculate_window
	{
		float rate=(float)i/(size-1);
		windowt[i] = window_function(2.0*rate+1.0);
	}
	return windowt;
}

void apply_precalculated_window_c(complexf* input, complexf* output, int size, float *windowt)
{
	for(int i=0;i<size;i++) //@apply_precalculated_window_c
	{
		iof(output,i)=iof(input,i)*windowt[i];
		qof(output,i)=qof(input,i)*windowt[i];
	}
}


void apply_window_f(float* input, float* output, int size, window_t window)
{
	float (*window_function)(float)=firdes_get_window_kernel(window);
	for(int i=0;i<size;i++) //@apply_window_f
	{
		float rate=(float)i/(size-1);
		output[i]=input[i]*window_function(2.0*rate+1.0);
	}
}

void logpower_cf(complexf* input, float* output, int size, float add_db)
{
	for(int i=0;i<size;i++) output[i]=iof(input,i)*iof(input,i) + qof(input,i)*qof(input,i); //@logpower_cf: pass 1

	for(int i=0;i<size;i++) output[i]=log10(output[i]); //@logpower_cf: pass 2

	for(int i=0;i<size;i++) output[i]=10*output[i]+add_db; //@logpower_cf: pass 3
}

void accumulate_power_cf(complexf* input, float* output, int size)
{
	for(int i=0;i<size;i++) output[i] += iof(input,i)*iof(input,i) + qof(input,i)*qof(input,i); //@logpower_cf: pass 1
	
}

void log_ff(float* input, float* output, int size, float add_db) {
	for(int i=0;i<size;i++) output[i]=log10(input[i]); //@logpower_cf: pass 2

	for(int i=0;i<size;i++) output[i]=10*output[i]+add_db; //@logpower_cf: pass 3
}


/*
  _____        _                                            _
 |  __ \      | |                                          (_)
 | |  | | __ _| |_ __ _    ___ ___  _ ____   _____ _ __ ___ _  ___  _ __
 | |  | |/ _` | __/ _` |  / __/ _ \| '_ \ \ / / _ \ '__/ __| |/ _ \| '_ \
 | |__| | (_| | || (_| | | (_| (_) | | | \ V /  __/ |  \__ \ | (_) | | | |
 |_____/ \__,_|\__\__,_|  \___\___/|_| |_|\_/ \___|_|  |___/_|\___/|_| |_|

*/

void convert_u8_f(unsigned char* input, float* output, int input_size)
{
	for(int i=0;i<input_size;i++) output[i]=((float)input[i])/(UCHAR_MAX/2.0)-1.0; //@convert_u8_f
}

void convert_s8_f(signed char* input, float* output, int input_size)
{
	for(int i=0;i<input_size;i++) output[i]=((float)input[i])/SCHAR_MAX; //@convert_s8_f
}

void convert_s16_f(short* input, float* output, int input_size)
{
	for(int i=0;i<input_size;i++) output[i]=(float)input[i]/SHRT_MAX; //@convert_s16_f
}

void convert_f_u8(float* input, unsigned char* output, int input_size)
{
	for(int i=0;i<input_size;i++) output[i]=input[i]*UCHAR_MAX*0.5+128; //@convert_f_u8
	//128 above is the correct value to add. In any other case a DC component
	//of at least -60 dB is shown on the FFT plot after convert_f_u8 -> convert_u8_f
}

void convert_f_s8(float* input, signed char* output, int input_size)
{
	for(int i=0;i<input_size;i++) output[i]=input[i]*SCHAR_MAX; //@convert_f_s8
}

void convert_f_s16(float* input, short* output, int input_size)
{
	/*for(int i=0;i<input_size;i++)
	{
		if(input[i]>1.0) input[i]=1.0;
		if(input[i]<-1.0) input[i]=-1.0;
	}*/
	for(int i=0;i<input_size;i++) output[i]=input[i]*SHRT_MAX; //@convert_f_s16
}

void convert_i16_f(short* input, float* output, int input_size) { convert_s16_f(input, output, input_size); }
void convert_f_i16(float* input, short* output, int input_size) { convert_f_s16(input, output, input_size); }

void convert_f_s24(float* input, unsigned char* output, int input_size, int bigendian)
{
	int k=0;
	if(bigendian) for(int i=0;i<input_size;i++)
	{
		int temp=input[i]*(INT_MAX>>8);
		unsigned char* ptemp=(unsigned char*)&temp;
		output[k++]=*ptemp;
		output[k++]=*(ptemp+1);
		output[k++]=*(ptemp+2);
	}
	else for(int i=0;i<input_size;i++)
	{
		int temp=input[i]*(INT_MAX>>8);
		unsigned char* ptemp=(unsigned char*)&temp;
		output[k++]=*(ptemp+2);
		output[k++]=*(ptemp+1);
		output[k++]=*ptemp;
	}
}

void convert_s24_f(unsigned char* input, float* output, int input_size, int bigendian)
{
	int k=0;
	if(bigendian) for(int i=0;i<input_size*3;i+=3)
	{
		int temp=(input[i+2]<<24)|(input[i+1]<<16)|(input[i]<<8);
		output[k++]=temp/(float)(INT_MAX-256);
	}
	else for(int i=0;i<input_size*3;i+=3)
	{
		int temp=(input[i+2]<<8)|(input[i+1]<<16)|(input[i]<<24);
		output[k++]=temp/(float)(INT_MAX-256);
	}
}


int trivial_vectorize()
{
	//this function is trivial to vectorize and should pass on both NEON and SSE
	int a[1024], b[1024], c[1024];
	for(int i=0; i<1024; i++) //@trivial_vectorize: should pass :-)
	{
		c[i]=a[i]*b[i];
	}
	return c[0];
}