/* -*- c++ -*- */
/*
 * Copyright 2005 Free Software Foundation, Inc.
 * 
 * This file is part of GNU Radio
 * 
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 * 
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <iostream>
#include <string>
#include <fstream>
#include <unistd.h>
#include <stdlib.h>
#include <getopt.h>
#include <boost/scoped_array.hpp>
#include <gr_complex.h>
#include <gr_fxpt_nco.h>
#include "time_series.h"

#include <gri_fft.h>

#include <string.h>

gr_complex
complex_conj_dotprod(const gr_complex *a, const gr_complex *b, unsigned n)
{
  gr_complex acc = 0;
  for (unsigned i = 0; i < n; i++)
    acc += a[i] * conj(b[i]);

  return acc;
}

/*!
 * \brief frequency translate src by normalized_freq
 *
 * \param dst			destination
 * \param src			source
 * \param n			length of src and dst in samples
 * \param normalized_freq	[-1/2, +1/2]
 * \param start_phase
 * \return end_phase
 */
float
freq_xlate(gr_complex *dst, const gr_complex *src, unsigned n, float normalized_freq, float start_phase)
{
  gr_fxpt_nco	nco;
  nco.set_freq(2 * M_PI * normalized_freq);
  nco.set_phase(start_phase);

  for (unsigned int i = 0; i < n; i++){
    gr_complex	phasor(nco.cos(), nco.sin());
    dst[i] = src[i] * phasor;
    nco.step();
  }
  return nco.get_phase();//return end_phase
}

inline void
write_float(FILE *output, float x)
{
  fwrite(&x, sizeof(x), 1, output);
}

void
write_floats(FILE *output, float *x,int n)
{
  fwrite(x, sizeof(x[0]), n, output);
}

// write 8-float header
static void
write_header (FILE *output, float ndoppler_bins, float min_doppler, float max_doppler)
{
  write_float(output, ndoppler_bins);
  write_float(output, min_doppler);
  write_float(output, max_doppler);
  write_float(output, 0);
  write_float(output, 0);
  write_float(output, 0);
  write_float(output, 0);
  write_float(output, 0);
}

void calculate_wiener_filter(int N,gr_complex *estimated_blur_spectrum,double sigma /*noise power*/, gr_complex * result, double alpha=1.0 /*noise scale factor*/)
{
  gr_complex *Hf=estimated_blur_spectrum;
  const float sigma_2=alpha*sigma*sigma;

  for(int i=0;i<N;i++)
  {
    result[i] = conj(Hf[i])/(abs(Hf[i]*Hf[i])+sigma_2);
  }

  return;


}

void apply_fft_filter(int N,gr_complex * signal, gr_complex * spectrum, gri_fft_complex *fwdfft,gri_fft_complex *invfft, gr_complex * result)
{
  memcpy(fwdfft->get_inbuf(), signal, N *sizeof(gr_complex));
  fwdfft->execute();	// compute fwd xform
  gr_complex * Yf=fwdfft->get_outbuf();
  gr_complex * eXf=invfft->get_inbuf();
  for(int i=0;i<N;i++)
  {
    eXf[i]=spectrum[i]*Yf[i];
  }
  invfft->execute();	// compute fwd xform
  memcpy(result, invfft->get_outbuf(), N *sizeof(gr_complex));
}

void
autocorrelate_fft_overlap_add(gr_complex *output, time_series &ts, 
	  unsigned nranges, unsigned correlation_window_size
	  )
{
  //Do auto-correlation and output the first nlags  
  //Implemented by overlap-add fft method

  int fftsize;//Most efficient when this number can be factored into small primes
  fftsize=2;
  while(((unsigned)fftsize)<nranges*8)
    fftsize=fftsize*2;

  float scale_factor ;//= 1.0;///fftsize;
  gr_complex *out=new gr_complex[nranges];
  gr_complex* shifted(new gr_complex[fftsize]);
  const gr_complex *ref = (const gr_complex *) ts.seek(0, correlation_window_size);//autocorrelate
  const gr_complex *scat0 = ref;

  gri_fft_complex	  *fwdfft_shifted_ref;		// forward "plan"
  gri_fft_complex	  *fwdfft_scat;		// forward "plan"
  gri_fft_complex	  *invfft;		// inverse "plan"

  fwdfft_shifted_ref = new gri_fft_complex(fftsize, true);
  fwdfft_scat = new gri_fft_complex(fftsize, true);
  invfft = new gri_fft_complex(fftsize, false);

  int j = 0;

  for (j = 0; j < fftsize; j++)
  {
    fwdfft_shifted_ref->get_inbuf()[j] = 0;//Clear input of all ffts. Should only need to do this once
    fwdfft_scat->get_inbuf()[j] = 0;
    invfft->get_inbuf()[j]=0;//This is really needed
    shifted[j] = 0;//Clear array of inputs for inv_fft
  }

  gr_complex *a = fwdfft_shifted_ref->get_outbuf();
  gr_complex *b = fwdfft_scat->get_outbuf();
  gr_complex *c;// = invfft->get_inbuf();

  int offset_ref=(int)nranges;
  //Overlap-add fft implementation of correlation, outputting first nranges of correlation
  //Zero-padding is  nranges in front and back of ref-fft
  int offset_scat=0;
  for(int f=0;offset_scat<(int)correlation_window_size;f++,offset_scat+=fftsize-2*(int)nranges)
  {
    memcpy(&fwdfft_scat->get_inbuf()[0], & scat0[offset_scat], fftsize * 1 *sizeof(gr_complex));//full fft is filled, no zero padding
    fwdfft_scat->execute();	// compute fwd xform
    //we only fill the middle of fwdfft_shifted_ref, so there is zero padding in first and last nranges
    memcpy(&fwdfft_shifted_ref->get_inbuf()[nranges], &ref[offset_ref], (fftsize-2*nranges)*1 *sizeof(gr_complex));// reference, no shift since ndoppler=1
    fwdfft_shifted_ref->execute();	// compute fwd xform
    c=&shifted[0];
    for (j = 0; j < fftsize; j++)	// cross-correlate in the freq domain
      c[j] = c[j]+a[j] * conj(b[j]);//full integrator
      //c[j] = alpha*c[j]+(1.0-alpha)*a[j] * conj(b[j]);//moving average
    offset_ref+=fftsize-2*(int)nranges;
  } /* loop over nffts */

  memcpy(&invfft->get_inbuf()[0], & shifted[0], fftsize *sizeof(gr_complex));
  invfft->execute();
  gr_complex *unscaled_out=invfft->get_outbuf();
  scale_factor=float(offset_scat+nranges);//TODO Check this:offset_scat+nranges
  for (j = 0; j < fftsize; j++)
  {
    output[j]=conj(unscaled_out[j]/scale_factor);//TODO Check this
  }
  //memcpy(output,invfft->get_outbuf(), fftsize *sizeof(gr_complex));
  delete [] out;
  delete fwdfft_shifted_ref;
  delete fwdfft_scat;
  delete invfft;
  delete shifted;

}


void
main_loop_fft_overlap_add(FILE *output, time_series &ref_ts, time_series &scat0_ts,
	  unsigned nranges, unsigned correlation_window_size,
	  float min_doppler, float max_doppler, int ndoppler_bins, unsigned int skip)
{
  //Do cross-correlation and output the first nlags
  //Implemented by overlap-add fft method
  fprintf(stderr, "ndoppler_bins = %10d\n", ndoppler_bins);
  fprintf(stderr, "min_doppler   = %10f\n", min_doppler);
  fprintf(stderr, "max_doppler   = %10f\n", max_doppler);


  int fftsize;//Most efficient when this number can be factored into small primes
  fftsize=2;
  while(((unsigned)fftsize)<nranges*8)
    fftsize=fftsize*2;
  fprintf(stderr, "fftsize   = %i\n", fftsize);
  //int nffts=(int)(0.5+float(correlation_window_size)/float(fftsize));
  float scale_factor = 1.0;///fftsize;
  //boost::scoped_array<gr_complex>  shifted(new gr_complex[correlation_window_size]);
  boost::scoped_array<float>  phasedop(new float[ndoppler_bins]);
  float *out=new float[nranges*ndoppler_bins];
  gr_complex* shifted(new gr_complex[fftsize*ndoppler_bins]);
  const gr_complex *ref = (const gr_complex *) ref_ts.seek(0, correlation_window_size);//crosscorrelate
  //const gr_complex *ref = (const gr_complex *) scat0_ts.seek(0, correlation_window_size);//autocorrelate
  const gr_complex *scat0 = (const gr_complex *) scat0_ts.seek(0, correlation_window_size);

  gri_fft_complex	  *fwdfft_shifted_ref;		// forward "plan"
  gri_fft_complex	  *fwdfft_scat;		// forward "plan"
  gri_fft_complex	  *invfft;		// inverse "plan"

  fwdfft_shifted_ref = new gri_fft_complex(fftsize, true);
  fwdfft_scat = new gri_fft_complex(fftsize, true);
  invfft = new gri_fft_complex(fftsize, false);
 
  float doppler_incr = 0;
  if (ndoppler_bins == 1){
    min_doppler = 0;
    max_doppler = 0;
  }
  else
    doppler_incr = (max_doppler - min_doppler) / (ndoppler_bins - 1);

  write_header(output, ndoppler_bins, min_doppler, max_doppler);
  int j = 0;

  for (j = 0; j < fftsize; j++)
  {
    fwdfft_shifted_ref->get_inbuf()[j] = 0;//Clear input of all ffts. Should only need to do this once
    fwdfft_scat->get_inbuf()[j] = 0;
    invfft->get_inbuf()[j]=0;//This is really needed
  }
  int nd;
  for (nd = 0; nd < ndoppler_bins; nd++)
    phasedop[nd]=0.0;//startphase is zero
  for (j = 0; j < ndoppler_bins*fftsize; j++)
  {
    shifted[j] = 0;//Clear array of inputs for inv_fft
  }
  gr_complex *a = fwdfft_shifted_ref->get_outbuf();
  gr_complex *b = fwdfft_scat->get_outbuf();
  gr_complex *c;// = invfft->get_inbuf();

  //int offset_scat=0;
  int offset_ref=(int)nranges;
  //Overlap-add fft implementation of correlation, outputting first nranges of correlation
  //Zero-padding is  nranges in front and back of ref-fft
  for(int f=0,offset_scat=0;offset_scat<(int)correlation_window_size;f++,offset_scat+=fftsize-2*(int)nranges)
  {
    memcpy(&fwdfft_scat->get_inbuf()[0], & scat0[offset_scat], fftsize * 1 *sizeof(gr_complex));//full fft is filled, no zero padding
    fwdfft_scat->execute();	// compute fwd xform
    for (int nd = 0; nd < ndoppler_bins; nd++){
      float fdop = min_doppler + doppler_incr * nd;
      //we only fill the middle of fwdfft_shifted_ref, so there is zero padding in first and last nranges
      phasedop[nd]=freq_xlate(&fwdfft_shifted_ref->get_inbuf()[nranges], &ref[offset_ref], (fftsize-2*nranges)*1, fdop,phasedop[nd]);	// generated doppler shifted reference
      //freq_xlate(&fwdfft_shifted_ref->get_inbuf()[0], ref, correlation_window_size*1, fdop);	// generated doppler shifted reference

      fwdfft_shifted_ref->execute();	// compute fwd xform
      c=&shifted[nd*fftsize];
      for (j = 0; j < fftsize; j++)	// cross-correlate in the freq domain
        c[j] = c[j]+a[j] * conj(b[j]);//full integrator
        //c[j] = alpha*c[j]+(1.0-alpha)*a[j] * conj(b[j]);//moving average

    } /*loop over ndoppler_bins*/
    offset_ref+=fftsize-2*(int)nranges;
  } /* loop over nffts */


  for (nd = 0; nd < ndoppler_bins; nd++)
  {
      memcpy(&invfft->get_inbuf()[0], & shifted[nd*fftsize], fftsize * 1 *sizeof(gr_complex));
      invfft->execute();
      gr_complex *unscaled_out=invfft->get_outbuf();
      int fft_bin;
      for (j = 0; j < (int)nranges; j++)
      {
          //xcp = [ xcpw(end-nlags+1:end);xcpw(1:nlags+1) ];
          if (j<=0)
            fft_bin=-j;
          else
            fft_bin=fftsize-j;//+1 ??
          //for a complex output we should take the conjugate, but since we take the abs, we don't need this
          double curr_norm=(((unsigned)abs(j))<skip)?0.0:norm(unscaled_out[fft_bin]) * scale_factor;
          out[j*ndoppler_bins+nd]=curr_norm;//skip>0 don't show direct path peak around zero
          //out[j*ndoppler_bins+nd]=norm(unscaled_out[j]) * scale_factor;
      } /* loop over nranges */
  }/* second loop over ndoppler_bins */
  write_floats(output,out,ndoppler_bins*nranges);

  delete [] out;
  delete fwdfft_shifted_ref;
  delete fwdfft_scat;
  delete invfft;
  delete shifted;

}

void
main_loop_nodoppler_xy_fft_overlap_add(FILE *output, time_series &ref_ts, time_series &scat0_ts,
	  unsigned nranges, unsigned correlation_window_size,
	  float min_doppler, float max_doppler, int ndoppler_bins, unsigned int skip)
{
  //Do cross-correlation and output the first nlags in xy format (x,y result from range and phase, brightness from magnitude) 
  //Implemented by overlap-add fft method

  if(ndoppler_bins>1) 
  {
    fprintf(stderr, "ERROR ndoppler_bins > 1 %10d\n", ndoppler_bins);
    return;
  }
  int fftsize;//Most efficient when this number can be factored into small primes
  fftsize=2;
  while(((unsigned)fftsize)<nranges*8)
    fftsize=fftsize*2;
  fprintf(stderr, "fftsize   = %i\n", fftsize);

  float scale_factor = 1.0;///fftsize;
  //boost::scoped_array<gr_complex>  shifted(new gr_complex[correlation_window_size]);
  float *out_xy=new float[nranges*nranges];
  gr_complex* shifted(new gr_complex[fftsize]);
  const gr_complex *ref = (const gr_complex *) ref_ts.seek(0, correlation_window_size);//crosscorrelate
  //const gr_complex *ref = (const gr_complex *) scat0_ts.seek(0, correlation_window_size);//autocorrelate
  const gr_complex *scat0 = (const gr_complex *) scat0_ts.seek(0, correlation_window_size);

  gri_fft_complex	  *fwdfft_shifted_ref;		// forward "plan"
  gri_fft_complex	  *fwdfft_scat;		// forward "plan"
  gri_fft_complex	  *invfft;		// inverse "plan"

  fwdfft_shifted_ref = new gri_fft_complex(fftsize, true);
  fwdfft_scat = new gri_fft_complex(fftsize, true);
  invfft = new gri_fft_complex(fftsize, false);

  //write_header(output, ndoppler_bins, min_doppler, max_doppler);
  write_header(output, nranges, 0, 0);
  int j = 0;

  for (j = 0; j < (int)(nranges*nranges); j++)
  {
       out_xy[j]=0;
  }
  for (j = 0; j < fftsize; j++)
  {
    fwdfft_shifted_ref->get_inbuf()[j] = 0;//Clear input of all ffts. Should only need to do this once
    fwdfft_scat->get_inbuf()[j] = 0;
    invfft->get_inbuf()[j]=0;//This is really needed
    shifted[j] = 0;//Clear array of inputs for inv_fft
  }

  gr_complex *a = fwdfft_shifted_ref->get_outbuf();
  gr_complex *b = fwdfft_scat->get_outbuf();
  gr_complex *c;// = invfft->get_inbuf();

  int offset_ref=(int)nranges;
  //Overlap-add fft implementation of correlation, outputting first nranges of correlation
  //Zero-padding is  nranges in front and back of ref-fft
  for(int f=0,offset_scat=0;offset_scat<(int)correlation_window_size;f++,offset_scat+=fftsize-2*(int)nranges)
  {
    memcpy(&fwdfft_scat->get_inbuf()[0], & scat0[offset_scat], fftsize * 1 *sizeof(gr_complex));//full fft is filled, no zero padding
    fwdfft_scat->execute();	// compute fwd xform
    //we only fill the middle of fwdfft_shifted_ref, so there is zero padding in first and last nranges
    memcpy(&fwdfft_shifted_ref->get_inbuf()[nranges], &ref[offset_ref], (fftsize-2*nranges)*1 *sizeof(gr_complex));// reference, no shift since ndoppler=1
    fwdfft_shifted_ref->execute();	// compute fwd xform
    c=&shifted[0];
    for (j = 0; j < fftsize; j++)	// cross-correlate in the freq domain
      c[j] = c[j]+a[j] * conj(b[j]);//full integrator
      //c[j] = alpha*c[j]+(1.0-alpha)*a[j] * conj(b[j]);//moving average
    offset_ref+=fftsize-2*(int)nranges;
  } /* loop over nffts */

  memcpy(&invfft->get_inbuf()[0], & shifted[0], fftsize * 1 *sizeof(gr_complex));
  invfft->execute();
  gr_complex *unscaled_out=invfft->get_outbuf();
  int fft_bin;
  for (j = 0; j < (int)nranges; j++)
  {
      //xcp = [ xcpw(end-nlags+1:end);xcpw(1:nlags+1) ];
      if (j<=0)
        fft_bin=-j;
      else
        fft_bin=fftsize-j;//+1 ??
      gr_complex curr_val=conj(unscaled_out[fft_bin]);
      float curr_norm=(((unsigned)abs(j))<skip)?0.0:abs(curr_val);
      //double curr_phase=arg(curr_val);
      //out[j*ndoppler_bins+nd]=curr_norm;//skip>0 don't show direct path peak around zero
      gr_complex xy = (curr_norm>0)?curr_val/curr_norm:0;
      float x= float(j)*(xy.real()*0.5)+float(nranges)*0.5;
      float y= float(j)*(xy.imag()*0.5)+float(nranges)*0.5;
      //double x=double(j)*(cos(curr_phase)+1.0)*0.5;//x-range is 0 to nranges-1
      //double y=double(j)*(sin(curr_phase)+1.0)*0.5;//y-range is 0 to nranges-1
      out_xy[int(y+0.5)*nranges+int(x+0.5)]=curr_norm * scale_factor;
      //out[j*ndoppler_bins+nd]=norm(unscaled_out[j]) * scale_factor;
  } /* loop over nranges */

  write_floats(output,out_xy,nranges*nranges);

  delete [] out_xy;
  delete fwdfft_shifted_ref;
  delete fwdfft_scat;
  delete invfft;
  delete shifted;

}

void
main_loop_doppler_xy_fft_overlap_add(FILE *output, time_series &ref_ts, time_series &scat0_ts,
	  unsigned nranges, unsigned correlation_window_size,
	  float doppler, unsigned int skip)
{
  //Do cross-correlation and output the first nlags in xy format (x,y result from range and phase, brightness from magnitude) 
  //Implemented by overlap-add fft method

  int fftsize;//Most efficient when this number can be factored into small primes
  fftsize=2;
  while(((unsigned)fftsize)<nranges*8)
    fftsize=fftsize*2;
  fprintf(stderr, "fftsize   = %i\n", fftsize);

  float scale_factor = 1.0;///fftsize;
  //boost::scoped_array<gr_complex>  shifted(new gr_complex[correlation_window_size]);
  float *out_xy=new float[nranges*nranges];
  gr_complex* shifted(new gr_complex[fftsize]);
  const gr_complex *ref = (const gr_complex *) ref_ts.seek(0, correlation_window_size);//crosscorrelate
  //const gr_complex *ref = (const gr_complex *) scat0_ts.seek(0, correlation_window_size);//autocorrelate
  const gr_complex *scat0 = (const gr_complex *) scat0_ts.seek(0, correlation_window_size);

  gri_fft_complex	  *fwdfft_shifted_ref;		// forward "plan"
  gri_fft_complex	  *fwdfft_scat;		// forward "plan"
  gri_fft_complex	  *invfft;		// inverse "plan"

  fwdfft_shifted_ref = new gri_fft_complex(fftsize, true);
  fwdfft_scat = new gri_fft_complex(fftsize, true);
  invfft = new gri_fft_complex(fftsize, false);

  //write_header(output, ndoppler_bins, min_doppler, max_doppler);
  write_header(output, nranges, 0, 0);
  int j = 0;

  for (j = 0; j < (int)(nranges*nranges); j++)
  {
       out_xy[j]=0;
  }
  for (j = 0; j < fftsize; j++)
  {
    fwdfft_shifted_ref->get_inbuf()[j] = 0;//Clear input of all ffts. Should only need to do this once
    fwdfft_scat->get_inbuf()[j] = 0;
    invfft->get_inbuf()[j]=0;//This is really needed
    shifted[j] = 0;//Clear array of inputs for inv_fft
  }

  gr_complex *a = fwdfft_shifted_ref->get_outbuf();
  gr_complex *b = fwdfft_scat->get_outbuf();
  gr_complex *c;// = invfft->get_inbuf();

  int offset_ref=(int)nranges;
  //Overlap-add fft implementation of correlation, outputting first nranges of correlation
  //Zero-padding is  nranges in front and back of ref-fft
  float phasedop=0.0;
  for(int f=0,offset_scat=0;offset_scat<(int)correlation_window_size;f++,offset_scat+=fftsize-2*(int)nranges)
  {
    memcpy(&fwdfft_scat->get_inbuf()[0], & scat0[offset_scat], fftsize * 1 *sizeof(gr_complex));//full fft is filled, no zero padding
    fwdfft_scat->execute();	// compute fwd xform
    //we only fill the middle of fwdfft_shifted_ref, so there is zero padding in first and last nranges
    //memcpy(&fwdfft_shifted_ref->get_inbuf()[nranges], &ref[offset_ref], (fftsize-2*nranges)*1 *sizeof(gr_complex));// reference, no shift since ndoppler=1
      //we only fill the middle of fwdfft_shifted_ref, so there is zero padding in first and last nranges
    phasedop=freq_xlate(&fwdfft_shifted_ref->get_inbuf()[nranges], &ref[offset_ref], (fftsize-2*nranges)*1, doppler,phasedop);

    fwdfft_shifted_ref->execute();	// compute fwd xform
    c=&shifted[0];
    for (j = 0; j < fftsize; j++)	// cross-correlate in the freq domain
      c[j] = c[j]+a[j] * conj(b[j]);//full integrator
      //c[j] = alpha*c[j]+(1.0-alpha)*a[j] * conj(b[j]);//moving average
    offset_ref+=fftsize-2*(int)nranges;
  } /* loop over nffts */

  memcpy(&invfft->get_inbuf()[0], & shifted[0], fftsize * 1 *sizeof(gr_complex));
  invfft->execute();
  gr_complex *unscaled_out=invfft->get_outbuf();
  int fft_bin;
  for (j = 0; j < (int)nranges; j++)
  {
      //xcp = [ xcpw(end-nlags+1:end);xcpw(1:nlags+1) ];
      if (j<=0)
        fft_bin=-j;
      else
        fft_bin=fftsize-j;//+1 ??
      gr_complex curr_val=conj(unscaled_out[fft_bin]);
      float curr_norm=(((unsigned)abs(j))<skip)?0.0:abs(curr_val);
      //double curr_phase=arg(curr_val);
      //out[j*ndoppler_bins+nd]=curr_norm;//skip>0 don't show direct path peak around zero
      gr_complex xy = (curr_norm>0)?curr_val/curr_norm:0;
      float x= float(j)*(xy.real()*0.5)+float(nranges)*0.5;
      float y= float(j)*(xy.imag()*0.5)+float(nranges)*0.5;
      //double x=double(j)*(cos(curr_phase)+1.0)*0.5;//x-range is 0 to nranges-1
      //double y=double(j)*(sin(curr_phase)+1.0)*0.5;//y-range is 0 to nranges-1
      out_xy[int(y+0.5)*nranges+int(x+0.5)]=curr_norm * scale_factor;
      //out[j*ndoppler_bins+nd]=norm(unscaled_out[j]) * scale_factor;
  } /* loop over nranges */

  write_floats(output,out_xy,nranges*nranges);

  delete [] out_xy;
  delete fwdfft_shifted_ref;
  delete fwdfft_scat;
  delete invfft;
  delete shifted;

}

void
main_loop(FILE *output, time_series &ref_ts, time_series &scat0_ts,
	  unsigned nranges, unsigned correlation_window_size,
	  float min_doppler, float max_doppler, int ndoppler_bins)
{
  fprintf(stderr, "ndoppler_bins = %10d\n", ndoppler_bins);
  fprintf(stderr, "min_doppler   = %10f\n", min_doppler);
  fprintf(stderr, "max_doppler   = %10f\n", max_doppler);


  int fftsize=correlation_window_size;//Should this multiplied by two because of the pos/neg freqs in the fft and we only use the positive ones?
                                      //Most efficient when this number can be factored into small primes
                                      //TODO maybe increase to the next power of two if it is not.
                                      //Then also pad with zero's or use some other window
  // float scale_factor = 1.0/correlation_window_size;	// FIXME, not sure this is right
  //float scale_factor = 1.0;				// FIXME, not sure this is right
  float scale_factor = 1.0;///fftsize;

  boost::scoped_array<gr_complex>  shifted(new gr_complex[correlation_window_size]);

  //const gr_complex *ref = (const gr_complex *) ref_ts.seek(0, correlation_window_size);//crosscorrelate
  const gr_complex *ref = (const gr_complex *) scat0_ts.seek(0, correlation_window_size);//autocorrelate
  const gr_complex *scat0 = (const gr_complex *) scat0_ts.seek(0, correlation_window_size);
  // gr_complex shifted[correlation_window_size];		// doppler shifted reference

  //The FFt's
  /*FFTW computes an unnormalized transform, in that there is no coefficient in front of the summation in the DFT. In other words, applying the forward and then the backward transform will multiply the input by n.

From above, an FFTW_FORWARD transform corresponds to a sign of -1 in the exponent of the DFT. Note also that we use the standard “in-order” output ordering—the k-th output corresponds to the frequency k/n (or k/T, where T is your total sampling period). For those who like to think in terms of positive and negative frequencies, this means that the positive frequencies are stored in the first half of the output and the negative frequencies are stored in backwards order in the second half of the output. (The frequency -k/n is the same as the frequency (n-k)/n.) 
  */
  gri_fft_complex	  *fwdfft_shifted_ref;		// forward "plan"
  gri_fft_complex	  *fwdfft_scat;		// forward "plan"
  gri_fft_complex	  *invfft;		// inverse "plan"

  fwdfft_shifted_ref = new gri_fft_complex(fftsize, true);
  fwdfft_scat = new gri_fft_complex(fftsize, true);
  invfft = new gri_fft_complex(fftsize, false);
  /*fft implementation pseudocode:
     fft_ref=fft(ref_ts)
     loop over dopppler
       shifted=translatefreq(ref,current_doppler)
       fft_shifted=fft(shifted)
       loop over freqs
         fft_out[freq]=conjugatemult(fft_shifted[current_freq],fft_ref[current_freq])
       out[current_doppler]=ifft(fft_out)
   */
  float doppler_incr = 0;
  if (ndoppler_bins == 1){
    min_doppler = 0;
    max_doppler = 0;
  }
  else
    doppler_incr = (max_doppler - min_doppler) / (ndoppler_bins - 1);

  write_header(output, ndoppler_bins, min_doppler, max_doppler);
  int j = 0;
  for (j = 0 /*correlation_window_size*/; j < fftsize; j++)
    fwdfft_scat->get_inbuf()[j] = 0;
  memcpy(&fwdfft_scat->get_inbuf()[0], scat0, correlation_window_size * 1 *sizeof(gr_complex));
  //for (j = correlation_window_size; j < fftsize; j++)
  //  fwdfft_scat->get_inbuf()[j] = 0;
  fwdfft_scat->execute();	// compute fwd xform

  for (j = 0; j < fftsize; j++)
      fwdfft_shifted_ref->get_inbuf()[j] = 0;//Should only need to do this once
  float *out=new float[nranges*ndoppler_bins];
  for (int nd = 0; nd < ndoppler_bins; nd++){
    float fdop = min_doppler + doppler_incr * nd;
    //fprintf(stderr, "fdop = %10g\n", fdop);
    freq_xlate(&fwdfft_shifted_ref->get_inbuf()[0], ref, correlation_window_size*1, fdop,0.0);	// generated doppler shifted reference
    //fft_shifted=fft(shifted)

    fwdfft_shifted_ref->execute();	// compute fwd xform
    gr_complex *a = fwdfft_shifted_ref->get_outbuf();
    gr_complex *b = fwdfft_scat->get_outbuf();
    gr_complex *c = invfft->get_inbuf();

    for (j = 0; j < fftsize; j++)	// cross-correlate in the freq domain
      c[j] = a[j] * conj(b[j]);
    invfft->execute();
    gr_complex *unscaled_out=invfft->get_outbuf();
    int fft_bin;
    for (j = 0; j < (int)nranges; j++)
    {
        if (j<=0)
          fft_bin=j;
        else
          fft_bin=correlation_window_size-j;
        out[j*ndoppler_bins+nd]=(j<4)?0.0:norm(unscaled_out[fft_bin]) * scale_factor;//don't show direct path peak at zero
        //out[j*ndoppler_bins+nd]=norm(unscaled_out[j]) * scale_factor;
    }
  }
  write_floats(output,out,ndoppler_bins*nranges);
  delete [] out;
  delete fwdfft_shifted_ref;
  delete fwdfft_scat;
  delete invfft;
  /*unsigned long long ro = 0;	// reference offset
  unsigned long long so = 0;	// scatter offset

  for (unsigned int n = 0; n < nranges; n++){
    if (0){
      fprintf(stdout, "n =  %6d\n", n);
      fprintf(stdout, "ro = %6lld\n", ro);
      fprintf(stdout, "so = %6lld\n", so);
    }
    const gr_complex *ref = (const gr_complex *) ref_ts.seek(ro, correlation_window_size);
    const gr_complex *scat0 = (const gr_complex *) scat0_ts.seek(so, correlation_window_size);
    if (ref == 0 || scat0 == 0)
      return;

    for (int nd = 0; nd < ndoppler_bins; nd++){
      float fdop = min_doppler + doppler_incr * nd;
      //fprintf(stderr, "fdop = %10g\n", fdop);
      freq_xlate(&shifted[0], ref, correlation_window_size, fdop);	// generated doppler shifted reference

      gr_complex ccor = complex_conj_dotprod(&shifted[0], scat0, correlation_window_size);
      float out = norm(ccor) * scale_factor;

      // fprintf(output, "%12g\n", out);
      write_float(output, out);
    }

    so += 1;
  }*/
}

static void
usage(const char *argv0)
{
  const char *progname;
  const char *t = strrchr(argv0, '/');
  if (t != 0)
    progname = t + 1;
  else
    progname = argv0;
    
  fprintf(stderr, "usage: %s [options] ref_file scatter_file\n", progname);
  fprintf(stderr, "    -o OUTPUT_FILENAME [default=eb-xambi.out]\n");
  fprintf(stderr, "    -m MIN_RANGE [default=0]\n");
  fprintf(stderr, "    -M MAX_RANGE [default=300]\n");
  fprintf(stderr, "    -w CORRELATION_WINDOW_SIZE [default=2500]\n");
  fprintf(stderr, "    -s NSAMPLES_TO_SKIP [default=0]\n");
  fprintf(stderr, "    -S RANGES_TO_SKIP [default=0]\n");
  fprintf(stderr, "    -d max_doppler (normalized*1e6: [0, +0.5*1e6)) [default=.0012*1e6]\n");
  fprintf(stderr, "    -D min_doppler (normalized*1e6: [0, -0.5*1e6)) [default=.0012*1e6]\n");
  fprintf(stderr, "    -n ndoppler_bins [default=31]\n");
  fprintf(stderr, "    -x output in xy format, shows phase of reflections (implies -n 1) [default no xy]\n");
}

int
main(int argc, char **argv)
{
  int	ch;
  int min_range =    0;
  int max_range =  300;
  const char *ref_filename = 0;
  const char *scatter_filename = 0;
  const char *output_filename = "mdvh-xambi.out";
  unsigned int correlation_window_size = 2500;
  long long int nsamples_to_skip = 0;
  int skip=0;
  double max_doppler = 0.0012;
  double min_doppler = 1.0;
  int ndoppler_bins = 31;
  bool do_xy=false;


  while ((ch = getopt(argc, argv, "m:M:ho:w:s:S:d:D:n:x:")) != -1){
    switch (ch){
    case 'm':
      min_range = strtol(optarg, 0, 0);
      break;

    case 'M':
      max_range = strtol(optarg, 0, 0);
      break;

    case 'w':
      correlation_window_size = strtol(optarg, 0, 0);
      if (correlation_window_size <= 1){
	usage(argv[0]);
	fprintf(stderr, "    correlation_window_size must be >= 1\n");
	exit(1);
      }
      break;

    case 'o':
      output_filename = optarg;
      break;
      
    case 's':
      nsamples_to_skip = (long long) strtof(optarg, 0);
      if (nsamples_to_skip < 0){
	usage(argv[0]);
	fprintf(stderr, "    nsamples_to_skip must be >= 0\n");
	exit(1);
      }
      break;

    case 'S':
      skip = strtol(optarg, 0, 0);
      if (skip < 0){
	usage(argv[0]);
	fprintf(stderr, "    ranges_to_skip must be >= 0\n");
	exit(1);
      }
      break;

    case 'd':
      max_doppler = strtof(optarg, 0);
      if (max_doppler <= -0.5 || max_doppler >= 0.5){
	usage(argv[0]);
	fprintf(stderr, "    max_doppler must be in [-0.5e6, 0.5e6)\n");
	exit(1);
      }
      break;

    case 'D':
      min_doppler = strtof(optarg, 0);
      if (min_doppler <= -0.5|| min_doppler >= 0.5){
	usage(argv[0]);
	fprintf(stderr, "    min_doppler must be in [-0.5e6, 0.5e6)\n");
	exit(1);
      }
      break;

    case 'n':
      ndoppler_bins = strtol(optarg, 0, 0);
      if (ndoppler_bins < 1){
	usage(argv[0]);
	fprintf(stderr, "    ndoppler_bins must >= 1\n");
	exit(1);
      }
      break;

    case 'x':
      ndoppler_bins = 1;
      do_xy=true;
      break;

    case '?':
    case 'h':
    default:
      usage(argv[0]);
      exit(1);
    }
  } // while getopt

  if (argc - optind != 2){
    usage(argv[0]);
    exit(1);
  }

  if (max_range < min_range){
    usage(argv[0]);
    fprintf(stderr, "    max_range must be >= min_range\n");
    exit(1);
  }
  unsigned int nranges = max_range - min_range + 1;

  ref_filename = argv[optind++];
  scatter_filename = argv[optind++];

  FILE *output = fopen(output_filename, "wb");
  if (output == 0){
    perror(output_filename);
    exit(1);
  }

  unsigned long long ref_starting_offset = 0;
  unsigned long long scatter_starting_offset = 0;

  if (min_range < 0){
    ref_starting_offset = -min_range;
    scatter_starting_offset = 0;
  }
  else {
    ref_starting_offset = 0;
    scatter_starting_offset = min_range;
  }

  ref_starting_offset += nsamples_to_skip;
  scatter_starting_offset += nsamples_to_skip;

if (min_doppler> 0.5) min_doppler= -max_doppler;
  try {
    time_series ref(sizeof(gr_complex), ref_filename, ref_starting_offset, 0);
    time_series scat0(sizeof(gr_complex), scatter_filename, scatter_starting_offset, 0);
    if(do_xy)
    {
     // main_loop_nodoppler_xy_fft_overlap_add(output, ref, scat0, nranges, correlation_window_size,
	//      -max_doppler, max_doppler, ndoppler_bins,(unsigned) skip);
      printf("xy\n");
      main_loop_doppler_xy_fft_overlap_add(output, ref, scat0, nranges, correlation_window_size,
	      max_doppler, (unsigned) skip);
    }
    else
    {
      printf("1dim\n");
      main_loop_fft_overlap_add(output, ref, scat0, nranges, correlation_window_size,
	      min_doppler, max_doppler, ndoppler_bins,(unsigned) skip);
    }

  }
  catch (std::string &s){
    std::cerr << s << std::endl;
    exit(1);
  }

  return 0;
}


/*wienerfilter.m
function ex = wienerFilter(y,h,sigma,gamma,alpha);
%
% ex = wienerFilter(y,h,sigma,gamma,alpha);
%
% Generalized Wiener filter using parameter alpha. When
% alpha = 1, it is the Wiener filter. It is also called
% Regularized inverse filter.
%
% Reference: Richb's paper
% Created: Tue May 4 16:24:06 CDT 1999, Huipin Zhang

N = size(y,1);
Yf = fft2(y);//y=measured signal
Hf = fft2(h,N,N);//h=pointspread function of measuring system
//Pxf=pointspread function of clean signal
Pyf = abs(Yf).^2/N^2;

% direct implementation of the regularized inverse filter, 
% when alpha = 1, it is the Wiener filter
% Gf = conj(Hf).*Pxf./(abs(Hf.^2).*Pxf+alpha*sigma^2);*/


//gr_complex *Pxf=estimated_signal_spectrum;//==fft(delta function) = constant

//Gf = conj(Hf).*Pxf./(abs(Hf.^2).*Pxf+alpha*sigma^2);

//Gf = conj(Hf)/(abs(Hf.^2)+alpha*sigma^2);
//eXf = Gf.*Yf;
//ex = real(ifft2(eXf));*/

/*

Step 2: Simulate a Motion Blur

Simulate a a real-life image that could be blurred e.g., by camera motion. The example creates a point-spread function, PSF, corresponding to the linear motion across 31 pixels (LEN=31), at an angle of 11 degrees (THETA=11). To simulate the blur, the filter is convolved with the image using imfilter.

LEN = 31;
THETA = 11;
PSF = fspecial('motion',LEN,THETA);
Blurred = imfilter(I,PSF,'circular','conv');
figure; imshow(Blurred);
title('Blurred');

Step 4: Simulate Additive Noise

Simulate additive noise by using normally distributed random numbers and add it to the blurred image, Blurred, created in Step 2.
noise = 0.1*randn(size(I));
BlurredNoisy = imadd(Blurred,im2uint8(noise));
figure;imshow(BlurredNoisy);title('Blurred & Noisy');

To control the noise amplification, provide the noise-to-signal power ratio, NSR.

NSR = sum(noise(:).^2)/sum(im2double(I(:)).^2);
wnr5 = deconvwnr(BlurredNoisy,PSF,NSR);
figure;imshow(wnr5);
title('Restored with NSR');


Step 6: Use Autocorrelation to Improve Image Restoration

To improve the restoration of the blurred and noisy images, supply the full autocorrelation function (ACF) for the noise, NCORR, and the signal, ICORR.

NP = abs(fftn(noise)).^2;
NPOW = sum(NP(:))/prod(size(noise)); % noise power
NCORR = fftshift(real(ifftn(NP))); % noise ACF, centered
IP = abs(fftn(im2double(I))).^2;
IPOW = sum(IP(:))/prod(size(I)); % original image power
ICORR = fftshift(real(ifftn(IP))); % image ACF, centered

wnr7 = deconvwnr(BlurredNoisy,PSF,NCORR,ICORR);
figure;imshow(wnr7);
title('Restored with ACF');

Explore the restoration given limited statistical information: the power of the noise, NPOW, and a 1-dimensional autocorrelation function of the true image, ICORR1.

ICORR1 = ICORR(:,ceil(size(I,1)/2));
wnr8 = deconvwnr(BlurredNoisy,PSF,NPOW,ICORR1);
figure;imshow(wnr8);
title('Restored with NP & 1D-ACF');

wienerdriver.m
clear;clf;

% Input and display the binary image
N = 256;
x = readbin('lenna.256',N,N);
figure(1)
imshow(x,gray(256))

% Blur the image, corrupt the image using WGN and display it
% h is the blurring filter, and sigma is the noise std
h = ones(4,4)/16;
sigma = 10;

Xf = fft2(x);
Hf = fft2(h,N,N);
y = real(ifft2(Hf.*Xf))+sigma*randn(N,N); % circular convolution
%y = filter2(h,x)+sigma*randn(N,N);	  % linear convolution

figure(2)
imshow(y,gray(256))

% restoration using generalized inverse filtering
gamma = 2;
eix = inverseFilter(y,h,gamma);
figure(3)
imshow(eix,gray(256))

% restoration using generalized Wiener filtering
gamma = 1;
alpha = 1;
ewx = wienerFilter(y,h,sigma,gamma,alpha);
figure(4)
imshow(ewx,gray(256))

PSNR = [psnr(y,x) psnr(eix,x) psnr(ewx,x)]
return

wienerfilter.m
function ex = wienerFilter(y,h,sigma,gamma,alpha);
%
% ex = wienerFilter(y,h,sigma,gamma,alpha);
%
% Generalized Wiener filter using parameter alpha. When
% alpha = 1, it is the Wiener filter. It is also called
% Regularized inverse filter.
%
% Reference: Richb's paper
% Created: Tue May 4 16:24:06 CDT 1999, Huipin Zhang

N = size(y,1);
Yf = fft2(y);//y=measured signal
Hf = fft2(h,N,N);//h=pointspread function of measuring system
Pyf = abs(Yf).^2/N^2;

% direct implementation of the regularized inverse filter, 
% when alpha = 1, it is the Wiener filter
% Gf = conj(Hf).*Pxf./(abs(Hf.^2).*Pxf+alpha*sigma^2);
%
% Since we don't know Pxf, the following 
% handle singular case (zero case)
sHf = Hf.*(abs(Hf)>0)+1/gamma*(abs(Hf)==0);
iHf = 1./sHf;
iHf = iHf.*(abs(Hf)*gamma>1)+gamma*abs(sHf).*iHf.*(abs(sHf)*gamma<=1);

Pyf = Pyf.*(Pyf>sigma^2)+sigma^2*(Pyf<=sigma^2);
Gf = iHf.*(Pyf-sigma^2)./(Pyf-(1-alpha)*sigma^2);

% max(max(abs(Gf).^2)) % should be equal to gamma^2
% Restorated image without denoising
eXf = Gf.*Yf;
ex = real(ifft2(eXf));

return
*/
