#include <stdlib.h>

/* Contains the various covariance functions. They generally are
   called by the function covfun(), which will know how to handle the
   information given in the covfunconfig. */

float covfun_dyn(float const x, float const y, float const z, float const t,
		 float const x0, float const y0, float const z0, float const t0,
		 float const tau, float const invllat2, float const invllon2, float const invlt) {
  /* This is the covariance function used. x,y,z are the first point's
     Cartesian coordinates on unit ball, and x0, y0, z0 are the second
     ones. invllat2 is the inverse square of the correlation length in
     the latitude direction, invllon same for the longitudinal
     direction, and invlt is the inverse correlation length NOT
     squared (for computational reasons) of the time correlation
     parameter. tau^2 is the maximum correlation available */

  /* For single exponential; FIXME make this into a parameter of covfun_dyn */
  /* return tau*tau*expf(-(fabsf(x-x0) + fabsf(y-y0))*sqrt(invllon2) - fabsf(z-z0)*sqrt(invllat2) - fabsf(t-t0)*invlt); */

  const float exponent = 2.;

  return tau*tau*expf(-(powf(x-x0, exponent) + powf(y-y0, exponent))*invllon2 - powf(z-z0, exponent)*invllat2 - powf((t-t0)*invlt, exponent));
}

float const covfun_periodic(float const x, float const y, float const z, float const t,
		      float const x0, float const y0, float const z0, float const t0,
		      float const tau, float const invllat2, float const invllon2, float const invlperiodic2) {
  /* This is the covariance function used for the periodic
     kernel. x,y,z are the first point's Cartesian coordinates on unit
     ball, and x0, y0, z0 are the second ones. The rest are
     parameters, see the formulas below for how they function. */
  float s;
  s = sinf(M_PI*(t-t0)/365.25/24/3600);
  s = s*s;
  return tau*tau*expf(-(powf(x-x0, 2) + powf(y-y0, 2))*invllon2 - powf(z-z0, 2)*invllat2 -
		       2*s*invlperiodic2);
}

float covfun_wind(float x, float y, float z, float t, float x0, float y0, float z0, float t0,
		  float tau, float invl2, float invlt, float rho,  float u, float v) {
  /* Wind-informed kernel. With no wind (or rho == 0) it reduces to
     covfun_dyn() with the same length scale in both horizontal
     directions. Otherwise the lat-lon difference between the two
     points is split into a component along the wind (u,v) and one
     perpendicular to it; the inverse squared length scale of the
     along-wind component is multiplied by (1 + W*rho)^(-1/2), where W
     is the wind speed, so covariance decays more slowly along the
     wind. The time dependence is squared exponential.

     Distances are taken in lat-lon (via xyz_to_latlon()), not in
     x,y,z. A latitude-dependent correction (1/sin of colatitude) is
     deliberately NOT applied: latitude changes very little between
     observations in one covariance matrix and the correction would
     make things ill-conditioned. A mechanism that applies the same
     factor to every pair would be needed instead; until then the
     factor is fixed to 1. */
  const float latscale = 1.;

  /* Wind vector length */
  const float W = powf(u*u + v*v, 0.5);

  /* Unit vector along the wind */
  float un, vn;
  float lat, lon, lat0, lon0;
  /* Components of the point difference along / across the wind */
  float along, across;

  /* No wind effect at all: plain isotropic dynamic kernel */
  if (rho*W == 0) {
    return covfun_dyn(x, y, z, t, x0, y0, z0, t0, tau, invl2, invl2, invlt);
  }

  if ((u == 0) || (v == 0)) {
    /* Wind exactly along one axis. Only the direction matters, not
       its sign, because both components get squared below. */
    un = (v == 0) ? 1 : 0;
    vn = (u == 0) ? 1 : 0;
  } else {
    un = u/W;
    vn = v/W;
  }

  xyz_to_latlon(x,y,z,&lat,&lon);
  xyz_to_latlon(x0,y0,z0,&lat0,&lon0);

  /* The perpendicular direction is (vn, -un) */
  along = un*(lon-lon0) + vn*(lat-lat0);
  across = vn*(lon-lon0) - un*(lat-lat0);

  /* Scale by 2*pi/360 (presumably degrees to radians, i.e. distance
     on the unit ball -- confirm against xyz_to_latlon()) */
  along = 2*M_PI/360*along*latscale;
  across = 2*M_PI/360*across*latscale;

  return tau*tau*expf(-along*along*invl2*powf(1+W*rho, -.5)
		      - across*across*invl2 - powf((t-t0)*invlt, 2));
}

float get_matern_b_fact(int n, int j) {
  /* This function gives the b factors for the Matérn covariance
     kernel computation; see e.g. Santner et al: The Design and
     Analysis of Computer Experiments (2003), p. 43. */
  const float nu = n + .5;
  /* Note that tgamma(n) = n! */

  /* Also note that the b factors depend on nu, but not the theta
     parameters, meaning that these could be actually
     precomputed... */
  return sqrt(M_PI)*powf(nu, 0.5*(n-j))*powf(0.25, j)/tgammaf(nu)*tgamma(n+j+1)/tgamma(j+1)/tgamma(n-j+1);
}

float const covfun_matern52(float const x, float const y, float const z, float const t,
		      float const x0, float const y0, float const z0, float const t0,
		      float const tau, float const invllat2, float const invllon2, float const invlt,
		      float * restrict bfact) {
  /* This is the Matérn covariance function with smoothness parameter
     5/2, maximum covariance tau^2 and directional covariances of llat
     and llon. The half-integer covariance functions have nice forms
     (finite sums). The time depedence is the squared exponential
     one. Arguments are inverses of the length scale parameters in
     both space and time. */
  float S = 0;
  float B = 0;
  int j;
  const int n = 2; /* nu = n + 1/2; due to that we have nu=5/2 */
  const float nu = n + 0.5;

  /* ht corresponds to |h|/theta in Santern et al. so that we scale
     the eucleidian dimensions separately with different
     parameters. We could take a product of the different Matern
     kernels, but I rather do it in one go and then calculate the
     kernel only once. */
  const float ht = sqrtf((powf(x-x0, 2) + powf(y-y0, 2))*invllon2 +
			powf(z-z0, 2)*invllat2 +
			powf((t-t0)*invlt, 2));

  for (j=0; j<=n; j++) {
    B += bfact[j]*powf(ht, n-j);
  };

  S = tau*tau*B*expf(-2*sqrt(nu)*ht);
  return S;

}

float covfun_stat(float x, float y, float z,
		  float x0, float y0, float z0, float tau, float invllat2, float invllon2) {
  /* Same as above, but without the time component - so assuming that
     everything happens at the same time. */

  return tau*tau*expf(-(powf(x-x0, 2) + powf(y-y0, 2))*invllon2 - powf(z-z0, 2)*invllat2);
}

float const covfun(float const x1, float const y1, float const z1, float const t1,
	     float const x2, float const y2, float const z2, float const t2, struct covfunconfig * restrict cfconfig,
	     float * u, float * v) {
  /* Dispatcher: evaluates the covariance between (x1,y1,z1,t1) and
     (x2,y2,z2,t2) with the kernel selected by cfconfig->covftype:

        1  covfun_dyn        2  covfun_stat       3  covfun_periodic
        4  covfun_matern52   5  covfun_wind (reads *u and *v)
       -1..-9  compound kernel: sum of the first -covftype kernels in
               cfconfig->kernels

     u and v are only dereferenced for type 5, but are handed on
     unchanged to the sub-kernels of a compound kernel. Any other type
     prints a message and terminates the program with exit(-3). */
  const int type = cfconfig->covftype;

  if ((type <= -1) && (type >= -9)) {
    /* Compound kernel: add up the sub-kernels */
    const size_t nsub = -type;
    float total = 0;
    size_t k;
    for (k=0; k<nsub; k++) {
      total += covfun(x1, y1, z1, t1, x2, y2, z2, t2, cfconfig->kernels[k], u, v);
    }
    return total;
  }

  switch (type) {
  case 1 :
    return covfun_dyn(x1, y1, z1, t1, x2, y2, z2, t2, cfconfig->tau,
		      cfconfig->invllat2, cfconfig->invllon2, cfconfig->invlt);
  case 2 :
    return covfun_stat(x1, y1, z1, x2, y2, z2, cfconfig->tau,
		       cfconfig->invllat2, cfconfig->invllon2);
  case 3 :
    return covfun_periodic(x1, y1, z1, t1, x2, y2, z2, t2, cfconfig->tau,
			   cfconfig->invllat2, cfconfig->invllon2, cfconfig->invlperiodic2);
  case 4 :
    return covfun_matern52(x1, y1, z1, t1, x2, y2, z2, t2, cfconfig->tau,
			   cfconfig->invllat2, cfconfig->invllon2, cfconfig->invlt,
			   cfconfig->bfact);
  case 5 :
    /* The wind kernel has a single spatial scale: invllat2 is used */
    return covfun_wind(x1, y1, z1, t1, x2, y2, z2, t2, cfconfig->tau,
		       cfconfig->invllat2, cfconfig->invlt, cfconfig->rho, *u, *v);
  default :
    printf("Invalid covftype: %d, exiting!\n", type);
    exit(-3);
  }
}


void set_kernel_parameters(struct covfunconfig *cfconf, float tau,
			   float llat, float llon, float lt, float lperiodic, float rho) {
  /* Stores the kernel parameters in cfconf together with the derived
     inverse quantities that the covariance functions consume:
     1/l^2 for llat, llon and lperiodic, and 1/l (not squared) for
     lt. rho is only stored for the wind kernel (covftype 5). A zero
     length scale is not checked for here. */

  /* Parameters as given */
  cfconf->tau = tau;
  cfconf->llat = llat;
  cfconf->llon = llon;
  cfconf->lt = lt;
  cfconf->lperiodic = lperiodic;

  /* Precomputed inverses */
  cfconf->invllat2 = 1./llat/llat;
  cfconf->invllon2 = 1./llon/llon;
  cfconf->invlt = 1./lt;
  cfconf->invlperiodic2 = 1./lperiodic/lperiodic;

  if (cfconf->covftype != 5) {
    return;
  }
  cfconf->rho = rho;
}

void set_kernel_parameter_limits_low(struct covfunconfig *cfconf, float tau,
				     float llat, float llon, float lt, float lperiodic, float rho) {
  /* Records the LOWER bound of every kernel parameter in cfconf
     (the *_low fields), so that optimization or MCMC can be kept
     within bounds. The values are stored as given; nothing is
     derived from them. */

  /* Length scales */
  cfconf->llat_low = llat;
  cfconf->llon_low = llon;
  cfconf->lt_low = lt;
  cfconf->lperiodic_low = lperiodic;

  /* Amplitude and wind factor */
  cfconf->tau_low = tau;
  cfconf->rho_low = rho;
}

void set_kernel_parameter_limits_high(struct covfunconfig *cfconf, float tau,
				      float llat, float llon, float lt, float lperiodic, float rho) {
  /* Records the UPPER bound of every kernel parameter in cfconf
     (the *_high fields), so that optimization or MCMC can be kept
     within bounds. The values are stored as given; nothing is
     derived from them. */

  /* Length scales */
  cfconf->llat_high = llat;
  cfconf->llon_high = llon;
  cfconf->lt_high = lt;
  cfconf->lperiodic_high = lperiodic;

  /* Amplitude and wind factor */
  cfconf->tau_high = tau;
  cfconf->rho_high = rho;
}


float get_radius(struct covfunconfig *cfc, float min_cov) {
  /* Returns the radius beyond which the covariance of kernel cfc
     is no longer above min_cov, i.e. the maximum distance at which
     observations are considered for the K matrix. Never returns less
     than 0.02.

       compound (covftype < 0): largest radius of the sub-kernels
       types 1-3: closed form for the squared exponential, using the
                  longer of llat and llon, plus a 1e-4 margin
       type 4:    approximate search, stepping 1e-4 at a time along
                  the z axis until the Matérn covariance is no longer
                  above min_cov
       type 5:    same search with the wind kernel, with a fixed wind
                  of (0, 20) ("max wind 20") and invlt set to 1
       any other type: the 0.02 floor

     FIXME: Actually, a cleaner way might be to just do covfun to x
     and z directions separately and take max of those. */
  const int type = cfc->covftype;
  float radius = 0;
  int k;

  if (type < 0) {
    for (k=0; k<-type; k++) {
      radius = fmax(radius, get_radius(cfc->kernels[k], min_cov));
    }
  } else if ((type == 1) || (type == 2) || (type == 3)) {
    /* These are all the same. */
    radius = fmax(cfc->llat, cfc->llon)*sqrt(-log(min_cov*powf(cfc->tau,-2)));
    radius += 1e-4;
  } else if (type == 4) {
    for (;;) {
      const float cov = covfun_matern52(0, 0, 0, 0, 0, 0, radius, 0, cfc->tau, cfc->invllat2,
					cfc->invllon2, cfc->invlt, cfc->bfact);
      if (!(cov - min_cov > 0)) {
	break;
      }
      radius += 1e-4;
    }
  } else if (type == 5) {
    for (;;) {
      const float cov = covfun_wind(0, 0, 0, 0, 0, 0, radius, 0, cfc->tau,
				    cfc->invllat2, 1, cfc->rho, 0, 20);
      if (!(cov - min_cov > 0)) {
	break;
      }
      radius += 1e-4;
    }
  }

  if (DEBUG) {
    printf("Got max rad: %f\n", radius);
  }
  return fmaxf(0.02, radius);
}

float get_max_tdist(struct covfunconfig *cfconf, float min_cov) {
  /* Returns the time separation, in whole days, at which the
     covariance of two spatially coincident points is no longer above
     min_cov. The separation is increased one day at a time and
     passed to covfun() as days*24*3600.

     If the first kernel in cfconf->kernels is the static kernel
     (covftype 2), 1e9 is returned because that kernel does not care
     about time. Periodic kernels can stay above a very small min_cov
     forever, so the search gives up once it has passed 50000 days. */
  float days = 0;
  /* Zero wind for kernels that read it */
  float u = 0;
  float v = 0;

  if (cfconf->kernels[0]->covftype == 2) {
    return 1e9;
  }

  for (;;) {
    if (!(covfun(0, 0, 0, 0, 0, 0, 0, days*24*3600, cfconf, &u, &v) - min_cov > 0)) {
      break;
    }
    days += 1;
    /* Hard limit */
    if (days>50000) {
      break;
    }
  }

  if (DEBUG) {
    printf("Got max time distance: %f days\n", days);
  }
  return days;
}

void set_max_distances_in_config(struct config *E) {
  /* Derives the observation search limits from the kernel E->cfc and
     the covariance threshold E->min_cov, and stores them in E. */

  /* Max distance in days to look at observations. Beyond this no
     observation anywhere contributes more than min_cov. */
  const float tdist_days = get_max_tdist(E->cfc, E->min_cov);
  E->max_tdist = (double) tdist_days;

  /* Correlation radius on the unit sphere, and the same radius
     expressed in degrees (radius/pi*180). */
  const float radius = get_radius(E->cfc, E->min_cov);
  E->max_rad = (double) radius;
  E->max_rad_equator_degrees = E->max_rad/M_PI*180;
}

/* FIXME: the lperiodic should be called lmisc! */
struct covfunconfig *initialize_covfunconfig(int covftype, int nmaxobs, float tau, float llat,
					     float llon, float lt, float lperiodic, float rho) {
  /* Allocates and initializes a covfunconfig struct.

     covftype > 0: a simple kernel. The parameters are stored with
       set_kernel_parameters(), both the lower and the upper limits
       are set to the same given values, and kernels[0] points back
       to the struct itself so that a simple kernel can be iterated
       over like a compound one. For the Matérn kernel (covftype 4)
       the b-factors are precomputed into bfact.

     covftype < 0: a compound kernel with -covftype sub-kernels. All
       parameters are zero and every kernels[] slot is NULL; the
       caller fills in the sub-kernels.

     Every field that is not set explicitly is zero (bfact is NULL
     unless covftype is 4), so nothing in the returned struct is
     indeterminate.

     Returns NULL if memory allocation fails. Note that for simple
     kernels kernels[0] is the struct itself and must not be freed
     separately. Also note that max_rad and max_t_distance still need
     to be set in E, e.g. by calling set_max_distances_in_config().

     FIXME: the lperiodic should be called lmisc! */

  struct covfunconfig *cfconf;
  int i;

  /* calloc, not malloc: fields not assigned below must not be garbage */
  cfconf = calloc(1, sizeof *cfconf);
  if (!cfconf) {
    return NULL;
  }

  cfconf->covftype = covftype;
  cfconf->nmaxobs = nmaxobs;
  cfconf->bfact = NULL;
  cfconf->kernels = NULL;

  if (cfconf->covftype < 0) {
    cfconf->tau = 0;
    cfconf->llat = 0;
    cfconf->invllat2 = 0;
    cfconf->llon = 0;
    cfconf->invllon2 = 0;
    cfconf->lt = 0;
    cfconf->invlt = 0;
    cfconf->lperiodic = 0;
    cfconf->invlperiodic2 = 0;
    cfconf->rho = 0;
    cfconf->kernels = malloc(-covftype*sizeof(struct covfunconfig*));
    if (!cfconf->kernels) {
      free(cfconf);
      return NULL;
    }
    for (i=0; i<-covftype; i++) {
      cfconf->kernels[i] = NULL;
    }
  } else {
    /* Actually set kernel parameters */
    set_kernel_parameters(cfconf, tau, llat, llon, lt, lperiodic, rho);
    /* Set the limits of the kernel parameters here */
    set_kernel_parameter_limits_low(cfconf, tau, llat, llon, lt, lperiodic, rho);
    set_kernel_parameter_limits_high(cfconf, tau, llat, llon, lt, lperiodic, rho);
    cfconf->kernels = malloc(sizeof(struct covfunconfig*));
    if (!cfconf->kernels) {
      free(cfconf);
      return NULL;
    }
    cfconf->kernels[0] = cfconf;

    if (cfconf->covftype == 4) {
      int n = 2; /* this must change for changing the matern nu */
      int j;
      /* We need to precalculate the b-factors for performance */
      cfconf->bfact = malloc((n+1)*sizeof(float));
      if (!cfconf->bfact) {
	free(cfconf->kernels);
	free(cfconf);
	return NULL;
      }
      for (j=0; j<=n; j++) {
	cfconf->bfact[j] = get_matern_b_fact(n, j);
      }
    }
  }

  return cfconf;
}

float * pick_observations(struct state * restrict S, struct  config * restrict E, size_t const r, float tt, size_t *obsind, size_t *nobs, float *prior_var, int const disregard_time) {
  /* Convenience function for picking observations from a state object
     S and returning an index with which to pick the observations from
     the S->{xyz...} objects.

     if (disregard_time), then time is set to be the time of
     observations in the covariance function when picking
     observations, and time-based prefiltering is disabled. This is
     for fitting the beta parameters, where a covariance matrix also
     needs to be formed.
  */

  size_t i, k, k1, k2, l, ii, jj, gg, ll, l2, mm, nn, ncloseenough, nobs_i, ind;
  int dn, dn2, day, already_included, cat, ncats, outnext;
  float w, cf, one_over_sin_lat;

  int latidx, lonidx, max_latidx, min_latidx, max_lonidx, min_lonidx;

  /* We'll need to get back to this after reassigning cfconf in a loop */
  struct covfunconfig *cfc_i;

  float const min_cov = (S->min_cov_override) ? S->min_cov_override : E->min_cov;
  float max_dyn_tdist = 0;
  float max_per_tdist = 0;
  float const min_dyn_tdist = 3*24*3600;

  size_t *obsind_i, *sorting;
  float *cfvalues;
  size_t deficit = 0;

  float zf = 0;
  size_t g;
  size_t **catlists;
  size_t *catcounts;

  /* Number of kernels for the for-loop */
  int nkernels = (E->cfc->covftype < 0) ? -E->cfc->covftype : 1;
  size_t offset = 0; /* For adding compound kernels' subkernels */

  size_t NLIMIT = 1e6; // max number of obs that we have
  *prior_var = 0; /* Calculate prior_var from zero */
  obsind_i = malloc(NLIMIT*sizeof(size_t));
  cfvalues = calloc(NLIMIT, sizeof(float));
  sorting = malloc(NLIMIT*sizeof(size_t));
  *nobs = 0;

  float proxfactor;
  /* How many times cfc_i->nmaxobs we consider for each one? Needs to
     be more for wind-informed kernels when using winds for
     prediction, i.e. E->dependent_variable == 0. */
  proxfactor = ((E->use_wind) && (!E->dependent_variable)) ? 8 : 1;

  latidx = r/E->nlon;
  lonidx = r%E->nlon;

  int d0, d1;
  int dd = (int) (E->max_rad_equator_degrees/E->gridres + 0.4999);

  max_latidx = latidx + dd;
  max_latidx = (E->nlat - 1 < max_latidx) ? E->nlat - 1 : max_latidx;
  min_latidx = latidx - dd;
  min_latidx = (min_latidx > 0) ? min_latidx : 0;

  one_over_sin_lat = 1. / sinf((90 - E->lat[r/E->nlon])/180*M_PI);
  min_lonidx = (int) (fmax(0, (lonidx - dd*one_over_sin_lat)));
  max_lonidx = (int) (fmin(E->nlon - 1, lonidx + dd*one_over_sin_lat));

  /* printf("extents idx: %d, %d, %d, %d\n", min_latidx, max_latidx, min_lonidx, max_lonidx); */

  /* We iterate either over all the kernels in a compound kernel or
     then we just deal with the single kernel that was specified. */
  for (k=0; k<nkernels; k++) {
    outnext = 0;

    cfc_i = E->cfc->kernels[k];
    *prior_var += cfc_i->tau*cfc_i->tau;

    /* If E->mode != 0, ie. we are not doing gridGP, then E->gi_inc
       should equal at least E->ngp */
    if (E->mode) { assert(E->gi_inc >= E->ngp); }

    /* Find the number of observations that are close enough to count,
       as determined by min_cov. */
    ncloseenough = 0;

    if (!disregard_time) {

      max_dyn_tdist = fmaxf(min_dyn_tdist, sqrt(-cfc_i->lt/86400.*cfc_i->lt/86400.*
						log(min_cov/cfc_i->tau/cfc_i->tau)));

      if (cfc_i->covftype == 3) {
	max_per_tdist = 365.25/M_PI*
	  asinf(sqrt(-.5*cfc_i->lperiodic*log(min_cov/cfc_i->tau/cfc_i->tau)));
      }

      /* The day that we are in (how many 24h periods from midnight of the
	 first day). */
      day = daynumber_from_t(tt);
    }


    d0 = ((disregard_time) || (cfc_i->covftype == 3)) ? 0 : (int) fmaxf(day - max_dyn_tdist, 0);
    d1 = (disregard_time) ? E->maxdays : (int) fminf(day + max_dyn_tdist, E->maxdays);

    if (!E->mode) { /* gridGP or sampling */

      /* Observations are searched according to proximity to current
	 point. Once we have enough, we no longer mind the
	 others. Covariance halves category by category. The added 2 is a fudge factor*/
      ncats = (int) -floor(log2(E->min_cov/E->cfc->kernels[0]->tau/E->cfc->kernels[0]->tau/2.));
      /* Make sure we have at least one category - ncats above can be
	 negative with very small tau. In that case no observations
	 are enough near. */
      ncats = (ncats < 0) ? 1 : ncats;

      catlists = calloc(ncats, sizeof(size_t*));
      catcounts = calloc(ncats, sizeof(size_t));
      for (i=0; i<ncats; i++) {
	catlists[i] = malloc(NLIMIT*sizeof(size_t));
	/* Zero these before starting with a new kernel */
	catcounts[i] = 0;
      }

      for (dn=d0; dn<d1; dn++) {
	if (!disregard_time) {
	  w = fabs(dn - day);
	  /* Skip if no obs, if outside time range for dynamic kernel,
	     or if outside day range for periodic kernel. */
	  if (((cfc_i->covftype == 3) && (fmin(fmod(w, 365.25), fmod(365.25 - fmod(w, 365.25), 365.25))) > max_per_tdist)) {
	    continue;
	  }
	} else {
	  tt = S->first_day_noon + 86400*dn;
	}

	/* Category counts calculated only for gridded runs */
	for (ii=0; ii<S->obs_grid_mask_counts[dn]; ii++) {

	  g = S->obs_grid_mask[dn][ii];

	  if ((g/E->nlon < min_latidx) || (g%E->nlon < min_lonidx) ||
	      (g/E->nlon > max_latidx) || (g%E->nlon > max_lonidx)) {continue;}

	  // FIXME there should be tolerance of half grid point here
	  float aa, bb, cc;
	  aa = covfun(E->x[r], E->y[r], E->z[r], tt,
		      E->x[g], E->y[g], E->z[g], S->first_day_noon + 86400*dn,
		      cfc_i, &zf, &zf);
	  //printf("cat, ii, cf: %d,%zu, %f\n", cat,ii,aa);
	  if (!aa) {continue;}
	  bb = log2(aa/cfc_i->tau/cfc_i->tau);
	  cc = -floor(bb);
	  cat = (int) cc;
	  cat = cat < ncats ? cat : ncats - 1;
	  catlists[cat][catcounts[cat]++] = E->ngp*dn+g;
	}
      }
      /* Category counts done */

      for (ii=0; ii<ncats-1; ii++) { // Skip the leftover category with not enough covariance

	for (jj=0; jj<catcounts[ii]; jj++) {
	  gg = catlists[ii][jj];
	  dn2 = gg/E->ngp;
	  g = gg%E->ngp;

	  /* cfvalues below can be e.g. used to pick the points with most
	     covariance, instead of just even thinning */
	  for (i=0; i<S->griddaycounts[g][dn2]; i++) {
	    ind = S->gridobsind[g][dn2][i];

	    /* Set pointers if needed to get wind fields. Uncomment
	       and use u/v in covfun below if you want to do already
	       first observation selection based on winds. Actually
	       having winds of 0 here is probably more robust and also
	       faster.

	    float *u = NULL, *v = NULL;

	    if (E->use_wind) {
	      u = &(S->u[ind]);
	      v = &(S->v[ind]);
	    } */

	    /* Time axis does not matter if training local meanfunction
	       betas, which is when disregard_time is used. Therefore,
	       make all time differences in covariance function
	       evaluations zero.*/
	    if (disregard_time) {
	      tt = S->t[ind];
	    }

	    cf = covfun(E->x[r], E->y[r], E->z[r], tt, S->x[ind], S->y[ind],
			S->z[ind], S->t[ind], cfc_i, &zf, &zf);

	    /* Record the obs that were informative */
	    cfvalues[ncloseenough] = cf;
	    obsind_i[ncloseenough] = ind;
	    sorting[ncloseenough] = ncloseenough;
	    ncloseenough += (cf > min_cov);
	  }
	}
	/* We already have lots of observations so we can skip next
	   categories for this kernel */
	if (outnext) {
	  outnext = 0;
	  break;
	}
	/* If we have this many observations, we can just skip the rest
	   - but let's read the next batch just to be on the safe
	   side. */
	if (ncloseenough > proxfactor*cfc_i->nmaxobs) {
	  outnext = 1;
	}
      }
    } else { // E->mode is nonzero. No categories are used.
      for (dn=d0; dn<d1; dn++) {
	if (!disregard_time) {
	  w = fabs(dn - day);
	  /* Skip if no obs, if outside time range for dynamic kernel,
	     or if outside day range for periodic kernel. */
	  if (((cfc_i->covftype == 3) && (fmin(fmod(w, 365.25), fmod(365.25 - fmod(w, 365.25), 365.25))) > max_per_tdist)) {
	    continue;
	  }
	}
	for (i=0; i<S->griddaycounts[r][dn]; i++) {
	  ind = S->gridobsind[r][dn][i];

	  /* Set pointers if needed to get wind fields. FIXME, now
	     winds are taken from individual observations, instead of
	     e.g. the center. On the other hand, if gridded
	     observations are not available, this is what's available,
	     so it might not be so bad after all.

	  if (E->use_wind) {
	    u = &(S->u[ind]);
	    v = &(S->v[ind]);
	  }
	  */

	  /* Time axis does not matter if training local meanfunction
	     betas, which is when disregard_time is used. Therefore,
	     make all time differences in covariance function
	     evaluations zero.*/
	  if (disregard_time) {
	    tt = S->t[ind];
	  }

	  cf = covfun(E->x[r], E->y[r], E->z[r], tt, S->x[ind], S->y[ind],
		      S->z[ind], S->t[ind], cfc_i, &zf, &zf);

	  /* Record the obs that were informative */
	  cfvalues[ncloseenough] = cf;
	  obsind_i[ncloseenough] = ind;
	  sorting[ncloseenough] = ncloseenough;
	  ncloseenough += (cf > min_cov);
	}
      }
    }

    assert(ncloseenough<=NLIMIT); /* sanity check */

    /* If we had more observations nearby than our prescribed maximum
       number cfc_i->nmaxobs + any leftover from previous kernels, we
       will thin the data. With best_data we take the ones having most
       covariance with the candidate point. Otherwise we take evenly
       spaced data. If there is leftover space in the covariance
       kernel, then that space will be filled here. */

    nobs_i = fmin(ncloseenough, cfc_i->nmaxobs + deficit);
    /* FIXME make obs selection method user-configurable */

    /* FIXME we could still in the end of function pick the very best
       of all data, make into switch case and update comments
       accordingly. */

    /* (1) for choosing the closest points, (0) for random thinning */
    const int select_closest = 1;

    if (select_closest) {
      /* Use points with highest covariance for current kernel */
      qsort_r(sorting, ncloseenough, sizeof(size_t), &compare_inverse, cfvalues);
    } else {
      thin_data_size_t(obsind_i, ncloseenough, nobs_i);
    }

    if (DEBUG) {
      /* Show the picked observation indexes in S->grid*[gi] and their
	 covariances */
      for (l=0; l<ncloseenough; l++) {
	printf("Picked obs; (obsind_i[sorting[%zu]], cov) = %zu, %f\n",
	       l, obsind_i[sorting[l]], cfvalues[sorting[l]]);
      }
    }

    /* Add data if it was not added by previous kernels */
    k1 = 0;
    for (l=0; l<ncloseenough; l++) {
      already_included = 0;
      for (l2=0; l2<offset; l2++) {
	if (obsind[l2] == obsind_i[sorting[l]]) {
	  already_included = 1;
	  break;
	}
      }
      if (!already_included) {
	obsind[offset + k1] = obsind_i[sorting[l]];
	k1++;
      }
      if (k1 == nobs_i) {
	break;
      }
    }
    offset += k1;
    /* Add deficit if we did not fill the full covariance kernel */
    deficit = 0;
    for (k2=0; k2<=k; k2++) {
      deficit += E->cfc->kernels[k2]->nmaxobs;
    }
    deficit -= offset;

    if (DEBUG) {
      printf("DEFICIT of %ld!\n", deficit);
    }
    /* Each kernel had these allocated */
    if (!E->mode) {
      for (i=0; i<ncats; i++) {
	free(catlists[i]);
      }
      free(catlists);
      free(catcounts);
    }
  }

  float *sorted_cfvalues = malloc(ncloseenough*sizeof(float));
  for (i=0; i<ncloseenough; i++) {
    sorted_cfvalues[i] = cfvalues[sorting[i]];
  }

  /* Done with choosing observations */
  free(obsind_i);
  free(cfvalues);
  free(sorting);

  /* Make your array such that it's unique in that we have no
     duplicates; otherwise cholesky will fail.

     FIXME move this into a size_t unique(float *arr, size_t N)
     function. */
  nn = 0;
  int b;
  for (ll=0; ll<offset; ll++) {
    b = 0;
    for (mm=ll+1; mm<offset; mm++) {
      if (obsind[ll] == obsind[mm]) {
	b = 1;
	break;
      }
      if (b) {continue;}
    }
    obsind[nn] = obsind[ll];
    nn++;
  }

  *nobs = nn;

  return sorted_cfvalues;
}

struct cholstruct* create_cholstruct(struct state *S, struct  config *E, size_t r, float tt, struct state *S_predicted, int disregard_time, float *uwind_in, float *vwind_in) {
  /* Create the cholstruct object containing e.g. the gaussian process
     covariance matrix K. The object will be created according to the
     configuration in the E->cfc object. The argument r is the global
     grid index number

     With disregard_time, time is not taken to account when _picking_
     observations, but it still counts in the construction of the
     covariance matrix. This functionality exists for training local
     meanfunction beta coefficients.
  */

  /* gi is the index of the grid point in S.grid{x,y,z} */
  size_t i, j;

  /* observation index in S->gridx[gi] */
  size_t NLIMIT = 1e6;
  size_t *obsind1 = malloc(NLIMIT*sizeof(size_t));

  size_t nobs1 = 0;
  size_t nobs2 = 0;
  size_t *obsind2 = NULL;

  size_t *oiptr1, *oiptr2;

  float prior_var;

  float u = -10000.;
  float v = -10000.; /* Initialize to nonsense */

  float *cfvalues1 = NULL;
  float *cfvalues2 = NULL;

  prior_var = 0;

  /* Get the indexes of the actual observations used, which are
     returned in obsind1 with nobs1 giving the number of those. */
  cfvalues1 = pick_observations(S, E, r, tt, obsind1, &nobs1, &prior_var, disregard_time);
  /* if (nobs1) {printf("nobs1: %zu\n", nobs1);} */

   /* If we have another state object with observations - namely, if
      we are sampling realizations of the Gaussian process. */
  if (S_predicted) {
    /* FIXME: would we want to keep a different number of these obs
       than the ones from S? */
    obsind2 = malloc(NLIMIT*sizeof(size_t));
    cfvalues2 = pick_observations(S_predicted, E, r, tt, obsind2, &nobs2, &prior_var, disregard_time);
    exit(3); // THIS DOES NOT WORK WITH THE NEW OBS SEARCHING, NEEDS TO BE UPDATED
  }

  /* Get total number of observations regardless to whether we have
     one or two state objects to deal with */
  size_t ntotobs = ((S_predicted != NULL) ? nobs1 + nobs2 : nobs1);

  struct cholstruct *C = malloc(sizeof(struct cholstruct));

  initialize_cholstruct(C, (size_t) ntotobs);
  /* Record total max kernel size for the cholstruct */
  C->max_tot_ksize = 0;
  for (i=0; i<-E->cfc->covftype; i++) {
    C->max_tot_ksize += E->cfc->kernels[i]->nmaxobs;
  }

  float avguwind = 0, avgvwind = 0;
  /* float uzero = 0, vzero = 0;
     float ui, vi; */

  size_t k;

  float normalizing = 0;

  /* 1 for covfun-weighted average from initially-picked points, 2 for
     nearest neighbor (unstable and fluctuates fast, leading to
     temporally discontinuous fields) */
  const int wind_method = 1;

  if (E->use_wind) {
    if ((uwind_in) && (vwind_in)) {
      C->uwind = *uwind_in;
      C->vwind = *vwind_in;
    } else if (nobs1) {
      if (wind_method == 1) {
	for (k=0; k<nobs1; ++k) {
	  avguwind += cfvalues1[k]*S->u[obsind1[k]];
	  avgvwind += cfvalues1[k]*S->v[obsind1[k]];
	  normalizing += cfvalues1[k];
	}
	avguwind = avguwind/normalizing;
	avgvwind = avgvwind/normalizing;
	C->uwind = avguwind;
	C->vwind = avgvwind;
      } else if (wind_method == 2) {
	C->uwind = S->u[obsind1[0]];
	C->vwind = S->v[obsind1[0]];
      }
    }
  }
  if (DEBUG) {
    printf("uwind/vwind for grid point %zu is %f/%f\n ", r, C->uwind, C->vwind);
  }

  /* Temporary pointers for dealing with two state objects */
  struct state *Sptr1, *Sptr2;
  size_t oi1, oi2;

  /* Set prior covariance to get errors right */
  C->prior_var = prior_var;

  /* Fill the cholstruct with the prior covariances. We only fill the
     upper triangle of the covariance matrix C->K. */
  for (i=0; i<C->nobs; ++i) {
    /* Get observations from the right state and correct indexing */
    Sptr1 = ((i < nobs1) ? S : S_predicted);
    oiptr1 = ((i < nobs1) ? obsind1 : obsind2);
    oi1 = ((nobs1) ? i%nobs1 : i);

    for (j=i; j<C->nobs; ++j) {
      /* Get observations from the right state and correct indexing */
      Sptr2 = ((j < nobs1) ? S : S_predicted);
      oiptr2 = ((j < nobs1) ? obsind1 : obsind2);
      oi2 = ((nobs1) ? j%nobs1 : j);
      C->K[i*C->nobs+j] = (double)
	covfun(Sptr2->x[oiptr2[oi2]], Sptr2->y[oiptr2[oi2]],
	       Sptr2->z[oiptr2[oi2]], Sptr2->t[oiptr2[oi2]],
	       Sptr1->x[oiptr1[oi1]], Sptr1->y[oiptr1[oi1]],
	       Sptr1->z[oiptr1[oi1]], Sptr1->t[oiptr1[oi1]], E->cfc,
	       &C->uwind, &C->vwind);

      /* This line is the transpose lower triangle part of the
	 covariance matrix. LAPACKE does not use it. */
      /* C->K[j*C->nobs+i]  = C->K[i*C->nobs+j]; */
    }
    /* printf("avguwind, avgvwind: %f, %f\n", avguwind, avgvwind); */

    /* Fill Gaussian process covariances between the observations and
       the grid location where we evaluate the Gaussian process. */
    if (!(DEBUG-2)) {printf("covfun args: %f %f %f %f %f %f %f %f %f %f\n",
			    Sptr1->x[oiptr1[oi1]], Sptr1->y[oiptr1[oi1]],
			    Sptr1->z[oiptr1[oi1]], Sptr1->t[oiptr1[oi1]],
			    E->x[r], E->y[r], E->z[r], tt, u, v);}

    C->Kxx[i] = (double) covfun(Sptr1->x[oiptr1[oi1]], Sptr1->y[oiptr1[oi1]],
				Sptr1->z[oiptr1[oi1]], Sptr1->t[oiptr1[oi1]],
				E->x[r], E->y[r], E->z[r], tt, E->cfc,
				&C->uwind, &C->vwind);

    /* After this we have the data in those first ntotobs slots, the rest
       are still there, but useless. */
    C->dv[i] = (double) Sptr1->dv[oiptr1[oi1]];

    /* CAREFUL! Only for diagnostics/debug. If you use S_predicted !=
    NULL, then these values likely do something you do not want them
    to do. Should be ok if only S is used. */
    C->obsind[i] = oiptr1[oi1];

    if (!(DEBUG-2)) { /* Fixme debug level handling */
      printf("Kxx and dv:, %f, %f\n", C->Kxx[i], C->dv[i]);
      printf("Time diff: %e\n", Sptr1->t[oiptr1[oi1]] - tt);
    }

    /* Add the diagonal uncertainty from the observational data, and
       always add a little, to regularize. */
    C->K[i*(C->nobs+1)] += (double) fmax(1e-3, Sptr1->dv_unc[oiptr1[oi1]]);
  }

  /* Free the allocated index array */
  free(obsind1);
  free(obsind2);
  if (cfvalues1) {
    free(cfvalues1);
  }
  if (cfvalues2) {
    free(cfvalues2);
  }

  return C;
}
