#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>
#include <math.h>
#include <string.h>
#include <assert.h>
#include <omp.h>
#include <time.h>

/* For reading input netcdf filenames */
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <unistd.h>
#include <limits.h>

#define LAPACK_DISABLE_NAN_CHECK = 0
#include <lapacke.h>
#include <cblas.h>

#include <gpstructs.h>
#include <gputils.h>
#include <covariance_functions.h>
#include <mean_functions.h>
#include <gpopt.h>

/* Allocate a zero-filled array of nelem elements of elsize bytes for
   a config field. Prints an error and exits if the allocation
   fails, so callers never see a NULL array. */
static void *alloc_config_array(size_t nelem, size_t elsize)
{
  /* Ask for at least one element so that a NULL return always means
     failure, also for empty arrays. */
  void *p = calloc(nelem ? nelem : 1, elsize);

  if (p == NULL) {
    fprintf(stderr, "create_config: out of memory\n");
    exit(1);
  }
  return p;
}

/* Initializes the configuration of the model.

   area                named region; selects the lat/lon bounding box
                       from the table below. An unknown name prints an
                       error and exits.
   gridres             grid resolution in degrees (grid mode only).
   maxdays             number of days to handle.
   gi_per_sweep        approximate number of gridpoints handled in one
                       sweep of the data (grid mode only). A value <= 0
                       means "all gridpoints in one sweep".
   obs_dist            mean distance of consecutive observations used
                       (discarding is done when reading).
   mode                0 for a gridGP; otherwise model training with
                       mode == number of test/reference points.
   cfc_compound        compound covariance function config (its
                       covftype must be negative). If NULL, the config
                       is treated as a plain grid specification and
                       the fields set after that point are left
                       zeroed.
   refpoint_arr        used only when mode != 0: array with 4 floats
                       per reference point, read as x, y, z, t. If
                       NULL, fill_E_testdata_randomly() is called
                       instead.
   dependent_variable  stored in the config; a nonzero value also
                       switches use_wind on.
   daylist             indexes of the days to simulate, or NULL for
                       everything up to maxdays.
   ndays_in_daylist    number of entries in daylist.

   Returns the config by value. All fields start out zeroed, so
   anything this function does not set is 0/NULL. */
struct config create_config(char *area, double gridres, int maxdays, int gi_per_sweep,
			    float obs_dist, size_t mode, struct covfunconfig *cfc_compound,
			    float *refpoint_arr, int dependent_variable,
			    int *daylist, int ndays_in_daylist)
{
  /* Known areas with their bounding boxes. The longitude values are
     the ones used by the original code; 180 is subtracted from them
     below when the grid longitudes are generated. */
  static const struct {
    const char *name;
    double minlat, maxlat, minlon, maxlon;
  } areas[] = {
    { "EastAsia",       0,  60, 245,       325 },
    { "SouthAsia",      0,  20, 245,       275 },
    { "TESTAREA",     -10,  10, 170,       190 }, /* 20x20 deg square */
    { "Shanghai",      30,  33, 118 + 180, 126 + 180 },
    { "Helsinki",      57,  63, 200,       215 },
    { "ChinaSea",      27,  40, 295,       325 },
    { "Europe",        35,  60, 170,       222 },
    { "MiddleEast",    11,  37, 209,       240 },
    { "EuropeAfrica",   0,  70, 160,       240 },
    { "Africa",         0,  30, 160,       240 },
    { "equator",      -10,  10,   0,       360 },
    { "NorthAmerica",   0,  60,  50,       120 },
    { "World",        -85,  85,   0,       360 },
    { "Beijing",       28,  35, 298,       307 },
  };
  const size_t nareas = sizeof areas / sizeof areas[0];

  struct config E;
  size_t a, k;
  int i, j;

  /* Start from an all-zero config so that no field is ever returned
     with an indeterminate value. */
  memset(&E, 0, sizeof E);

  /* Set the grid info based on what area name was given. */
  for (a = 0; a < nareas; a++) {
    if (!strncmp(area, areas[a].name, 24)) {
      break;
    }
  }
  if (a == nareas) {
    printf("Invalid area: %s", area); exit(1);
  }
  E.minlat = areas[a].minlat;
  E.maxlat = areas[a].maxlat;
  E.minlon = areas[a].minlon;
  E.maxlon = areas[a].maxlon;

  E.mfs = alloc_config_array(1, sizeof(struct mfstruct));

  /* gridGP gets the config with mode == 0. */
  if (mode == 0) {
    E.nlat = (int) ((E.maxlat - E.minlat)/gridres + 0.5);
    E.nlon = (int) ((E.maxlon - E.minlon)/gridres + 0.5);
    E.gridres = gridres;
    E.ngp = E.nlat*E.nlon;
    E.lat = alloc_config_array(E.nlat, sizeof(double));
    E.lon = alloc_config_array(E.nlon, sizeof(double));
    E.x = alloc_config_array(E.ngp, sizeof(double));
    E.y = alloc_config_array(E.ngp, sizeof(double));
    E.z = alloc_config_array(E.ngp, sizeof(double));

    E.t = NULL; E.dv = NULL; E.dv_unc = NULL; E.u = NULL; E.v = NULL;

    /* Cell centres: half a cell in from the lower edge. */
    for (i = 0; i < E.nlat; i++) {
      E.lat[i] = E.minlat + (i + .5)*E.gridres;
      if (DEBUG) {
        printf("lat: %f\n", E.lat[i]);
      }
    }

    for (i = 0; i < E.nlon; i++) {
      E.lon[i] = E.minlon - 180 + (i + .5)*E.gridres;
      if (DEBUG) {
        printf("lon: %f\n", E.lon[i]);
      }
    }

    /* Create the array of lat-lon coordinates in Cartesian
       coordinates; the grid is stored row by row (latitude major). */
    for (i = 0; i < E.nlat; i++) {
      for (j = 0; j < E.nlon; j++) {
        latlon_to_xyz(E.lat[i], E.lon[j], &E.x[i*E.nlon+j], &E.y[i*E.nlon+j], &E.z[i*E.nlon+j]);
      }
    }
  } else {
    /* For training the GP; most variables get phony values. */
    E.ngp = mode;
    E.lat = alloc_config_array(E.ngp, sizeof(double));
    E.lon = alloc_config_array(E.ngp, sizeof(double));
    E.x = alloc_config_array(E.ngp, sizeof(double));
    E.y = alloc_config_array(E.ngp, sizeof(double));
    E.z = alloc_config_array(E.ngp, sizeof(double));
    E.t = alloc_config_array(E.ngp, sizeof(double));
    E.dv = NULL;
    E.dv_unc = NULL;
    E.u = NULL;
    E.v = NULL;

    E.gridres = -1; E.nlat = -1; E.nlon = -1;

    if (refpoint_arr) {
      /* Get reference points from the array: 4 floats per point. */
      for (k = 0; k < mode; k++) {
        E.x[k] = refpoint_arr[k*4];
        E.y[k] = refpoint_arr[k*4 + 1];
        E.z[k] = refpoint_arr[k*4 + 2];
        E.t[k] = refpoint_arr[k*4 + 3];
      }
    } else {
      /* Get random points inside the domain to be the reference points. */
      fill_E_testdata_randomly(&E, maxdays);
    }
  }

  E.cfc = cfc_compound;
  if (E.cfc == NULL) {
    printf("Treating config as a gridspec => not initializing other variables.\n");
    return E;
  }

  /* Make sure the cfc, if given, is a compound kernel, even if it has
     just one member */
  assert(E.cfc->covftype < 0);

  E.dependent_variable = dependent_variable;

  /* Record whether we use wind information or not - by checking if
     any of the kernels are type 5. For a compound kernel -covftype
     is used as the number of member kernels. */
  E.use_wind = 0;
  for (i = 0; i < -E.cfc->covftype; i++) {
    if (E.cfc->kernels[i]->covftype == 5) {
      E.use_wind = 1;
      break;
    }
  }

  /* If we are doing GP on wind vectors */
  if (E.dependent_variable) {E.use_wind = 1;}

  /* With 0.001 or less, no edges are seen in posterior mean with
     even thinning when creating cholesky.*/
  E.min_cov = 1e-9;

  /* Would be best to get this from the observation files but whatever. */
  E.first_day_noon_unix = 1410004800;
  E.maxdays = maxdays;

  /* This is an arbitrary number describing how much space we give to
     each observation vector in the data grid. We realloc more as
     needed - at minimum this amount. The larger this is, the better
     our performance is, as we do not need to memmove as much. But of
     course that is more wasteful. Setting this to a small value will
     be size-wise optimal but slow. */
  E.malloc_chunk = 1024*128;

  set_max_distances_in_config(&E);

  /* Number of gridpoints we deal with in one sweep of data. In
     training mode all points are done at the same time. In grid mode
     the requested amount is rounded to equalize the number of
     gridpoints in the last sweep. A non-positive request is treated
     as "everything in one sweep", which also keeps the division
     below from dividing by zero. */
  E.mode = mode;
  if (E.mode || gi_per_sweep <= 0 || (size_t) gi_per_sweep >= E.ngp) {
    E.gi_inc = E.ngp;
  } else {
    E.gi_inc = E.ngp/(E.ngp/gi_per_sweep + 1) + 1;
  }

  /* We need to thin the data in any case, so we get performance and
     consistency by doing it as early as possible. Randomness makes it
     so that we do not do unfair data selection by accident. */
  E.obs_dist = obs_dist;

  E.daylist = daylist;
  E.ndays_in_daylist = ndays_in_daylist;

  print_config(&E);

  return E;
}

void add_datapoint_to_S_for_calibration(struct config *E, struct state *S, float *lat, float *lon, float *t, float *u, float *v, float *dv, float *dv_unc) {
  /* Adds a single data point to S for calibration in the situation
     where E->mode > 0. This point can come from a daydata object, or
     it can come from sampling from the prior, where after predicting
     the GP at a certain location, we want to condition on that for
     the next ones.

     lat, lon, t     location and time of the point.
     u, v            wind components; passed on to covfun() and copied
                     to S only when E->use_wind is set. Callers in
                     this file pass NULL when wind is not used.
     dv, dv_unc      observed value and its uncertainty.

     The point is stored in S (and S->nobs grows by one) only if its
     covariance with at least one reference point in E exceeds
     E->min_cov. For each such reference point the observation index
     is appended to that point's index list. */

  /* Do not use this with a regular grid. For that you should use
     add_datapoint_to_S() instead */
  assert(E->mode > 0);

  float *cfvals = malloc(E->ngp*sizeof(float));
  int add, dn;
  size_t oi, gi;
  double xd, yd, zd;
  float x, y, z;

  /* The parallel loop below writes into cfvals, so a failed
     allocation has to be caught here. */
  if (cfvals == NULL && E->ngp > 0) {
    fprintf(stderr, "add_datapoint_to_S_for_calibration: out of memory\n");
    exit(1);
  }

  latlon_to_xyz(*lat, *lon, &xd, &yd, &zd);
  x = (float) xd;
  y = (float) yd;
  z = (float) zd;

  /* First calculate the covariance function values between the new
     point and every reference point, since this way we can do it in
     parallel (adding them later needs to be done sequentially). This
     for-loop would be very nice for GPU... */
#pragma omp parallel for private(gi) shared(x,y,z,t,u,v,E) num_threads(NTHREADS) schedule(runtime)
  for (gi=0; gi<E->ngp; gi++) { /* Note that E->ngp is the number of training points */
    cfvals[gi] = covfun(x, y, z, *t, E->x[gi], E->y[gi], E->z[gi], E->t[gi], E->cfc, u, v);
  }

  add = 0; /* We set this flag to 1 when an observation needs to be added to S */
  oi = S->nobs; /* stands for "observation index" */
  for (gi=0; gi<E->ngp; gi++) {
    if (cfvals[gi] > E->min_cov) {
      /* The index list is selected by the day number of the
         reference point's time, not of the observation's time. */
      dn = daynumber_from_t(E->t[gi]);
      /* Get more space if needed; the list grows 128 entries at a time */
      if (S->griddaycounts[gi][dn] == S->allocated[gi][dn]) {
        size_t newsize = S->allocated[gi][dn] + 128;
        size_t ns = newsize*sizeof(size_t);
        S->gridobsind[gi][dn] = (size_t*) reallocate_S_element(S->gridobsind[gi][dn], ns);
        S->allocated[gi][dn] = newsize;
      }
      /* The obs is close to the reference point: record its index */
      S->gridobsind[gi][dn][S->griddaycounts[gi][dn]] = oi;
      S->griddaycounts[gi][dn]++;
      add = 1;
    }
  }

  /* Check if we need more space for the observation itself */
  if (S->nobs == S->allocated_for_all_obs) {
    reallocate_S(S, E, E->malloc_chunk);
  }

  /* If the datapoint was close to any reference point in E, the
     observation itself is stored at index oi. */
  if (add) {
    S->nobs++;
    S->x[oi] = x;
    S->y[oi] = y;
    S->z[oi] = z;
    S->dv[oi] = *dv;
    S->dv_unc[oi] = *dv_unc;
    S->t[oi] = *t;
    if (E->use_wind) {
      S->u[oi] = *u;
      S->v[oi] = *v;
    }
  }

  free(cfvals);
}

void add_daydata_for_calibration(struct daydata *D, struct state *S, struct config *E) {
  /*
    Feed every observation of the daydata object D to
    add_datapoint_to_S_for_calibration(), one at a time. In
    calibration the "grid indexes" gi are the individual reference
    points, and an observation is recorded for every reference point
    it is close to.

    Rows of D->data as read here: 0 = value, 1 = its uncertainty,
    2 = time, 3 and 4 = wind components (touched only when
    E->use_wind is set). The time handed on is the stored time minus
    1400000000, as a float.
  */

  size_t k = 0;

  while (k < D->ngobs) {
    /* Wind pointers stay NULL when wind is not in use */
    float *wind_u = E->use_wind ? &D->data[3][k] : NULL;
    float *wind_v = E->use_wind ? &D->data[4][k] : NULL;
    float shifted_t = (float) (D->data[2][k]  - 1400000000);

    add_datapoint_to_S_for_calibration(E, S, &D->lat[k], &D->lon[k], &shifted_t,
                                       wind_u, wind_v,
                                       &D->data[0][k], &D->data[1][k]);
    k++;
  }
}

void add_data_optimized(struct daydata *D, struct state *S, struct config E) {
  /* Add data to state arrays from the daydata object D (grid mode).
     Every observation is passed to add_datapoint_to_S(); wind
     pointers are passed only when E.use_wind is set, otherwise they
     are NULL. Afterwards the per-gridpoint index arrays of that day
     are shrunk to the size actually used. */

  size_t i;
  float *u = NULL;
  float *v = NULL;

  /* Nothing to add, and no first observation to take the day number
     from: leave S untouched. */
  if (D->ngobs == 0) {
    return;
  }

  for (i=0; i<D->ngobs; i++) {
    if (E.use_wind) {
      u = &D->data[3][i];
      v = &D->data[4][i];
    }
    add_datapoint_to_S(&D->lat[i], &D->lon[i], &D->data[2][i], &D->data[0][i], &D->data[1][i], u, v, &E, S);
  }

  /* We added a full day; usually no more is added later so we can
     just shrink arrays to minimum size. The day number is taken from
     the time of the first observation. */
  int dn = daynumber_from_t(D->data[2][0]);
  for (i=0; i<E.ngp; i++) {
    S->gridobsind[i][dn] = (size_t*) reallocate_S_element(S->gridobsind[i][dn], S->griddaycounts[i][dn]*sizeof(size_t));
    S->allocated[i][dn] = S->griddaycounts[i][dn];
  }
}

void add_datafile(struct state *S, struct config *E, char *fname) {
  /* Reads one netCDF file with readnetCDF() and adds its contents to
     the state S: through add_daydata_for_calibration() in training
     mode (E->mode != 0), and through add_data_optimized() in grid
     mode, where the chosen data is additionally saved to a text
     file. The daydata object is torn down before returning. */

  /* Latitude window handed to readnetCDF() to speed up reading.
     Tolerances are added in readnetCDF(), so these should be just the
     min and max latitudes we are interested in. Longitudes don't
     matter since each sweep has all the longitudes of the area. */
  float lat_lo, lat_hi;
  struct daydata *day;

  if (E->mode) {
    /* Training (find_parameters()): read all latitudes of the area in one go */
    lat_lo = E->minlat;
    lat_hi = E->maxlat;
  } else {
    /* gridGP: latitude rows of the first and last gridpoint of the
       current sweep, [S->gi0, S->gi1) */
    lat_lo = E->lat[S->gi0/E->nlon];
    lat_hi = E->lat[(S->gi1-1)/E->nlon]; /* -1 needed to avoid getting 0 */
  }

  if (DEBUG) {
    printf("Adding file: minlat, maxlat: %f, %f\n", lat_lo, lat_hi);
  }

  day = readnetCDF(fname, E, lat_lo, lat_hi);

  if (E->mode) {
    add_daydata_for_calibration(day, S, E);
  } else if (day->ngobs > 0) {
    /* In grid mode a file without good data is skipped altogether */
    add_data_optimized(day, S, *E);
    /* Here we save the data to a file */
    save_chosen_daydata_to_txt(day, daynumber_from_t(day->data[2][0]));
  }

  teardown_daydata(day);
}

/* scandir() filter: keep every directory entry except "." and "..". */
static int keep_real_dir_entries(const struct dirent *entry)
{
  return strcmp(entry->d_name, ".") != 0 && strcmp(entry->d_name, "..") != 0;
}

/* Adds the data files found in directory dirname to the state S, in
   alphabetical order, by calling add_datafile() on each of them, and
   then rebuilds the observation grid mask of S.

   dirname    directory path. Entry names are appended to it directly,
              so it needs to end with a path separator. The string is
              not modified.
   maxfiles   at most this many files are added.

   If the directory cannot be scanned, an error is printed and no
   files are added; the final summary and mask rebuild still run. */
void add_all_datafiles_in_dir(struct state *S, struct config *E, char *dirname, uint maxfiles) {

  struct dirent **filelist = NULL;
  int n;
  size_t i;
  const size_t dirlen = strlen(dirname);

  if (DEBUG) {
    printf("dirname: %s\n", dirname);
    printf("E.maxdays: %zu\n", E->maxdays);
  } else {
    printf("===Adding files===\n");
  }

  /* "." and ".." are filtered out by name instead of assuming that
     they sort first. */
  n = scandir(dirname, &filelist, keep_real_dir_entries, alphasort);
  if (n < 0) {
    perror("scandir error");
  } else {
    const size_t nfiles = (size_t) n;

    for (i = 0; i < nfiles && i < maxfiles; i++) {
      const char *name = filelist[i]->d_name;
      const size_t namelen = strlen(name);
      char *path;

      if (DEBUG) {
        printf("\n============= adding file %zu =============\n", i + 1);
        printf("datafilename: %s\n", name);
      } else {
        printf("Adding file %zu/%d, ", i + 1, n);
        fflush(stdout);
      }

      /* Build the full path in a buffer of our own, sized for this
         entry, instead of appending to the caller's string. */
      path = malloc(dirlen + namelen + 1);
      if (path == NULL) {
        perror("malloc");
        break;
      }
      memcpy(path, dirname, dirlen);
      memcpy(path + dirlen, name, namelen + 1); /* includes the '\0' */

      add_datafile(S, E, path);
      free(path);
    }

    /* Free filelist; only valid when scandir() succeeded */
    for (i = 0; i < nfiles; i++) {
      free(filelist[i]);
    }
    free(filelist);
  }

  printf("\n");

  printf("Total number of observations in S: %zu\n", S->nobs);

  /* Build this so that we know where the obs are */
  rebuild_S_obs_grid_mask(S, E);
}

double predict(struct cholstruct *C, float *result_dv, float *result_dv_unc) {
  /* Calculates the Gaussian process based on the information in the
     cholstruct C. The cfconf argument is needed just for adding the
     constant to the uncertainty. */

  size_t i;
  double loglik = 0;

  /* Penalize for no observations. */
  if (C->nobs == 0) {
    printf("No obs! ");
    return -100.;
  }

  /* Let T = K + sigma_n^2*I_n. The log likelihood calculated with
     log(p(y|X)) = -.5 y^T T^-1 y - .5 log|T| - .5n log(2pi)       (1)
     See Rasmussen, Eq. (2.30).

     The posterior mean at x* is:
     f* = Kxx T^-1 y, (2)

     where Kxx = C->Kxx is the covariance vector between training
     points and the testing point, see Rasmussen, (2.25)

     The posterior variance is calculated from

     Var[f*] = k(x*,x*) - Kxx^T T^-1 Kxx, (3)

     where k(x*,x*) is the prior variance. See Rasmussen, (2.26)
  */

  // loglik = -.5*C->nobs*log(2*M_PI); /* Last term in (1) */
  loglik -= 0; /* Cutoff should affect only predictions, not likelihood */

  /*
    write_1d_array_to_txt("Kxx", C->Kxx, 1, C->nobs, 2);
    write_1d_array_to_txt("dv", C->dv, 1, C->nobs, 2);
    write_1d_array_to_txt("original_cov", C->K, 1, C->nobs*C->nobs, 2);
    print_matrix(C->K, C->nobs, C->nobs, "original_cov", 2);
    printf("loglik 1: %f\n", loglik);
  */

  /* Calculate cholesky decomposition of the covariance K */
  LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'U', (lapack_int)C->nobs, C->K, (lapack_int)C->nobs);

  /*
    print_matrix(C->K, C->nobs, C->nobs, "cholesky_of_cov", 2);
    write_1d_array_to_txt("cholesky_of_cov", C->K, 1, C->nobs*C->nobs, 2);
  */

  /* Add to loglik the log diagonal elements, term 2 in (1) */
  for (i=0; i<C->nobs; i++) {
    loglik -= log(C->K[i*(C->nobs + 1)]);
  }
  /* Invert the covariance, to get T^-1 in (1) and (2) */
  LAPACKE_dpotri(LAPACK_ROW_MAJOR, 'U', (lapack_int)C->nobs, C->K, (lapack_int)C->nobs);
  /*
  printf("loglik 2: %f\n", loglik);
  print_matrix(C->K, C->nobs, C->nobs, "inv_of_cholesky_of_cov", 2);
  write_1d_array_to_txt("inverse_of_cholesky_of_cov", C->K, 1, C->nobs*C->nobs, 2);
  */

  /*
    Compare two ways of generating the weights - different ordering of
    the matrix multiplication in calculating the covariance. These
    have been tested to be equal - but needed the transposition to be
    uncommented in create_cholstruct().

    double first = 0, second = 0;
    double *test = calloc(C->nobs, sizeof(double));
    cblas_dsymv(CblasRowMajor, CblasUpper, C->nobs, 1., C->K, C->nobs, C->Kxx, 1, 0, test, 1);
    print_vector(test, C->nobs, "weights", 2);
    for (i=0; i<C->nobs; i++) {
      printf("first method: ADD: %f\n", test[i]*C->dv[i]);
      first += test[i]*C->dv[i];
    }
    free(test);
  */

  /* This does T^-1 y, which is the latter part of (2) and first term
     of (1). Output vector goes to C->x */
  cblas_dsymv(CblasRowMajor, CblasUpper, C->nobs, 1., C->K, C->nobs, C->dv, 1, 0, C->x, 1);

  /* This does T^-1 Kxx, which is in (3). Output vector goes to tmp. */
  double tmp[C->nobs];
  cblas_dsymv(CblasRowMajor, CblasUpper, C->nobs, 1., C->K, C->nobs, C->Kxx, 1, 0, tmp, 1);

  /*
    print_vector(tmp, C->nobs, "tmp_vector", 2);
    print_vector(C->x, C->nobs, "T_inv_times_dv", 2);
    print_vector(C->dv, C->nobs, "XCO2", 2);
  */

  /* Starting values for these */
  *result_dv = (float) 0;
  *result_dv_unc = (float) C->prior_var;

  /* Finish calculation of all the variables */
  for (i=0; i<C->nobs; i++) {
    loglik -= .5*(C->dv[i])*(C->x[i]); /* First term in (1) */
    *result_dv += (float) C->Kxx[i]*C->x[i]; /* Left multiplication in (2) */
    *result_dv_unc -= (float) C->Kxx[i]*tmp[i]; /* Second term in (3) */
  }

  assert(*result_dv_unc > 0);

  return loglik;
}


void subtract_meanfun_from_obs_in_cholstruct(struct state *S, struct cholstruct *C,
					     struct config E) {
  /* Removes the mean function from every observation held in the
     cholstruct C. For the k'th entry the mean function is evaluated
     with E.mfs at the location and time of the observation it refers
     to (index C->obsind[k] in S), and that value is subtracted from
     C->dv[k]. Effectively, this function exists to use a previous
     dynamic GP as a mean function to a time-independent GP; the
     previous GP needs to have its results in the mfs->arr1d.

     CHECKME NOT TESTED AFTER REFACTORING. */

  size_t k;

  for (k = 0; k < C->nobs; k++) {
    const size_t obs = C->obsind[k];
    /* Kept in a float on purpose: the value is subtracted in float
       precision */
    float mean_at_obs = meanfun(S->x[obs], S->y[obs], S->z[obs],
                                S->t[obs], E.mfs);

    C->dv[k] -= mean_at_obs;
  }
}


double single_point_dyn_GP(size_t r, float tt, struct state *S,
			   struct config E, float *result, float *result_unc, float *uwind, float *vwind, struct state *S_auxiliary) {
  /* Calculates the dynamic GP for a single point.

     r              index of the point in E.x, E.y and E.z.
     tt             time at which to predict.
     S              state holding the observations.
     E              configuration (passed by value).
     result         out: predicted value (mean function added back).
     result_unc     out: predicted uncertainty as returned by
                    predict(), or the prior variance of the cholstruct
                    if there are no observations.
     uwind, vwind   out: written from the cholstruct only when
                    E.use_wind is set.
     S_auxiliary    passed through to create_cholstruct(); callers in
                    this file pass either NULL or a state holding
                    earlier predictions.

     Returns the log likelihood from predict(), or -5000 if there were
     no observations for this point.

     S->total_choleskys and S->nancount are updated atomically since
     this function is called from OpenMP parallel loops. */

  struct cholstruct *C;
  float diffdv;
  size_t j;
  double loglik = -5000; /* Value returned when there are no observations */
  float *uwind_in = NULL;
  float *vwind_in = NULL;

  /* NOTE(review): elsewhere in this file the static kernel is
     detected through E.cfc->kernels[0]->covftype, and create_config()
     asserts that E.cfc->covftype is negative; confirm whether this
     guard can ever trigger as written. */
  if (E.cfc->covftype == 2) {
    printf("covftype cannot be 2 for dynamic GP! Exiting now.\n");
    exit(249);
  }

  /* If we have gridded winds given in S, then use them */
  if ((S->uwinds_gridded) && (S->vwinds_gridded)) {
    int dn = daynumber_from_t(tt);
    uwind_in = S->uwinds_gridded+(dn*E.ngp +r);
    vwind_in = S->vwinds_gridded+(dn*E.ngp +r);
  }

  C = create_cholstruct(S, &E, r, tt, S_auxiliary, 0, uwind_in, vwind_in);

  /* Subtract the mean function at the desired location from all
     the observations. Uncertainties should not get this
     treatment.*/
  diffdv = meanfun(E.x[r], E.y[r], E.z[r], tt, E.mfs);
  /* isfinite() rather than isnormal(): a mean of exactly zero is a
     perfectly valid value. */
  if (!isfinite(diffdv)) {
    printf("Wait! NaN in diffdv, this should not happen!\n");
  }

  for (j=0; j<C->nobs; j++) {
    C->dv[j] -= diffdv;
  }

  /* Run the Gaussian Process formulas for the current point */
  if (C->nobs > 0) {
    loglik = predict(C, result, result_unc);

#pragma omp atomic
    S->total_choleskys++;

    /* Add back the subtracted meanfunction value */
    *result += diffdv;
  } else {
    /* If we do not have any observations, we say that our
       prediction is just our mean function with a large
       uncertainty */
    *result = diffdv;
    *result_unc = C->prior_var; /* FIXME SHOULD WE HAVE THE PRIOR VARIANCE HERE? SHOULD WE TAKE THE PRIOR VAR FROM THE FIRST GP HERE? */
  }

  /* Report non-finite results. The result is left as it is; only the
     counter is updated and the matrix printed for inspection. */
  if (!isfinite(*result)) {
    printf("NaN from cholesky, replacing with meanf'n value. This should be extremely rare!\n");
    printf("result = %f, meanf = %f, nobs = %zu\n", *result, diffdv, C->nobs);
    print_matrix(C->K, C->nobs, C->nobs, "FAILED MATRIX", 2);
#pragma omp atomic
    S->nancount++;
  }

  if (E.use_wind) {
    *uwind = C->uwind;
    *vwind = C->vwind;
  }
  teardown_cholstruct(C);

  return loglik;
}

double loss(struct state *S, struct config *E) {
  /* Calculate the loss function for the set of points in E; the GP
     mean function and covariance function parameters are given also
     in E. The loss is minus the sum of the log likelihoods returned
     by single_point_dyn_GP() for each of the E->ngp points, evaluated
     at the time E->t[r] of each point.

     If the sum is not a normal floating point number (isnormal() is
     false: NaN, infinite, but also zero or subnormal), a fixed value
     of 1e9 is returned instead. */

  size_t r;
  double total = 0;
  float *res = malloc(E->ngp*sizeof(float));
  float *unc = malloc(E->ngp*sizeof(float));

  if ((res == NULL || unc == NULL) && E->ngp > 0) {
    fprintf(stderr, "loss: out of memory\n");
    exit(1);
  }

  /* The per-point log likelihoods are summed with a reduction so
     that threads do not race on the accumulator. */
#pragma omp parallel for private(r) shared(res,unc,S,E) reduction(+:total) num_threads(NTHREADS) schedule(runtime)
  for (r=0; r<E->ngp; r++) {
    /* Needed for the call interface only; local to the iteration so
       that every thread writes to its own copy. */
    float uwind, vwind;

    /* Only dynamic GP is ok here */
    total += single_point_dyn_GP(r, E->t[r], S, *E, &(res[r]), &(unc[r]), &uwind, &vwind, NULL);
  }

  /* The predictions themselves are not needed, only the likelihood */
  free(res);
  free(unc);

  /* We minimize loss - meaning maximize likelihood; and therefore one more minus sign */
  if (!isnormal(total)) total = -1e9;
  return -total;
}

void gridGP(struct state *S, struct config E) {

  /* Calculates the Gaussian Process in a grid for the gridpoints
     S->gi0 ... S->gi1-1, and writes the results to the text files
     gp_mean.txt and gp_unc.txt (plus u_winds.txt and v_winds.txt when
     E.use_wind is set).

     The process should be zero-mean, so the mean function is
     subtracted first. For the static case (first kernel of E.cfc has
     covftype 2), the mean function is subtracted from all
     observations separately, and one field is computed in total. For
     the dynamic cases, one field is computed per day, and the mean
     function value subtracted is the same for all observations
     involved in the evaluation of the GP at a certain phase-space
     point.

     The days are taken from E.daylist if it is given, otherwise days
     0 ... S->total_days-1 are computed.

     The result arrays start out zeroed, so entries that are never
     computed (a static-GP gridpoint without observations, winds in
     the static case, gridpoints outside [gi0, gi1)) are written as
     0. */

  long int r, counter;
  int current_offset = 0;
  int maxday, ndays_in_batch;

  /* One day at a time _may_ or may not be faster when days are far from each other. */
  const int batchsize_days = 1;

  /* Static GP: only spatial distance in the GP kernel. */
  const int static_gp = (E.cfc->kernels[0]->covftype == 2);

  /* Number of days to be simulated */
  const int total_days = (E.daylist) ? E.ndays_in_daylist: S->total_days;

  const size_t nresults = (size_t) E.ngp*batchsize_days;
  float *results = calloc(nresults, sizeof(float));
  float *results_unc = calloc(nresults, sizeof(float));
  float *u_winds = NULL;
  float *v_winds = NULL;

  printf("====Calculating gridGP====\n");

  if (E.use_wind) {
    u_winds = calloc(nresults, sizeof(float));
    v_winds = calloc(nresults, sizeof(float));
  }
  if (nresults > 0 && (results == NULL || results_unc == NULL ||
                       (E.use_wind && (u_winds == NULL || v_winds == NULL)))) {
    fprintf(stderr, "gridGP: out of memory\n");
    exit(1);
  }

  /* FIXME OUTPUTTING WHEN USING SUBDOMAINS WILL NEED POSTPROCESSING
     OR SEVERAL FILES */

  while (current_offset < total_days) {
    maxday = current_offset + batchsize_days;
    maxday = (maxday < total_days) ? maxday : total_days;
    /* Number of fields written out after this batch. Fixed before
       the parallel loop so that nothing inside it can change it. */
    ndays_in_batch = maxday - current_offset;

    printf("Computing days %d - %d\n", current_offset, current_offset + batchsize_days);
    counter = 0;
#pragma omp parallel for private(r) shared(S,results,results_unc,counter) num_threads(NTHREADS) schedule(runtime)
    for (r=S->gi0; r<S->gi1; r++) {
      /* Everything declared here is private to the iteration */
      long int done;
      float u = 0;
      float v = 0;

      /* Only for printing on the following line */
#pragma omp atomic capture
      done = ++counter;
      printf("GP calculation for gridpoint: %li/%zu...  \r", done, E.ngp);
      fflush(stdout);

      if (static_gp) {
        /* Requires that meanfunction is subtracted from all the
           observations separately, as they can be far apart in
           time. Note that only one kernel will be evaluated in this
           case. No gridded winds can be used with static GP (it
           probably would work but does not make sense) */
        struct cholstruct *C = create_cholstruct(S, &E, r, S->first_day_noon, NULL, 0, NULL, NULL);
        subtract_meanfun_from_obs_in_cholstruct(S, C, E);

        if (C->nobs > 0) {
          /* FIXME Penalty term is zero so deficient kernels are not
             penalized - on the other hand there will likely be enough
             data to fill the kernel since all of the local timeseries
             is considered */
          predict(C, &results[r], &results_unc[r]);
#pragma omp atomic
          S->total_choleskys++;
        }
        teardown_cholstruct(C);
      } else {
        /* GP calculated for each day separately */
        int i;
        for (i=current_offset; i<maxday; i++) {
          /* N.B. maxday is how manyth day is the last if daylists are used */
          int j = (E.daylist) ? E.daylist[i] : i;
          float tt = S->first_day_noon + j*24*3600;
          long int ri = (i - current_offset)*E.ngp + r; /* result index in the result array */

          single_point_dyn_GP(r, tt, S, E, &results[ri], &results_unc[ri], &u, &v, NULL);
          if (E.use_wind) {
            u_winds[ri] = u;
            v_winds[ri] = v;
          }
        }
      }
    }

    /* output when done. For E->mode != 0 and  loss(), this does not make sense */
    write_1d_array_to_txt("gp_mean.txt", results, ndays_in_batch, E.ngp, 1);
    write_1d_array_to_txt("gp_unc.txt", results_unc, ndays_in_batch, E.ngp, 1);
    if (E.use_wind) {
      write_1d_array_to_txt("u_winds.txt", u_winds, ndays_in_batch, E.ngp, 1);
      write_1d_array_to_txt("v_winds.txt", v_winds, ndays_in_batch, E.ngp, 1);
    }

    /* The static GP does not depend on the day: one pass is all there is */
    if (static_gp) {
      break;
    }
    current_offset += batchsize_days;
  }
  printf("\n");

  free(results);
  free(results_unc);
  free(u_winds);
  free(v_winds);
}

void set_parameters(struct covfunconfig *cfc, float new_llat, float new_llon, float new_lt, float new_tau, float new_rho) {
  /* Stores a new set of kernel parameters in cfc together with the
     derived inverse quantities. Which fields new_lt and new_rho end
     up in depends on cfc->covftype. */

  /* Temporal scale. For a periodic kernel (type 3) the lt argument
     actually means the lperiodic, which is the width of the periodic
     covariance; lt and invlt are then left untouched. */
  if (cfc->covftype != 3) {
    cfc->lt = new_lt;
    cfc->invlt = 1/new_lt;
  } else {
    cfc->lperiodic = new_lt;
    cfc->invlperiodic2 = 1./new_lt/new_lt;
  }

  /* Spatial length scales and their inverse squares */
  cfc->llat = new_llat;
  cfc->invllat2 = pow(new_llat, -2);
  cfc->llon = new_llon;
  cfc->invllon2 = pow(new_llon, -2);

  cfc->tau = new_tau;

  /* rho is stored only for the wind informed kernel (type 5), which
     gets its length scale from new_llat only. */
  if (cfc->covftype == 5) {
    cfc->rho = new_rho;
  }
}

void fill_E_testdata(struct config *E, float *testdata) {
  /* Used in training: fills in the points at which the GP is to be
     evaluated. testdata holds 5 floats for each of the E->ngp points,
     read as latitude, longitude, t, dv and dv_unc. Latitude and
     longitude are converted to the Cartesian E->x, E->y, E->z; the
     remaining three are copied to E->t, E->dv and E->dv_unc, which
     are not used for gridGP. All of these arrays must have been
     allocated by the caller. */

  const float *row = testdata;
  size_t k;

  for (k = 0; k < E->ngp; k++) {
    latlon_to_xyz(row[0], row[1], &E->x[k], &E->y[k], &E->z[k]);

    E->t[k] = row[2];
    E->dv[k] = row[3];
    E->dv_unc[k] = row[4];

    row += 5; /* on to the next point's record */
  }
}

float *sample_from_GP(struct config *E, struct state *S) {

  /* This function generates samples from a GP according to a grid
     specification in E, using observations read into state S (if
     any). The following scenarios are possible with this function:

     1. We have observations from e.g. OCO-2 and we want to predict in
     a grid. This would be like calculating a gridGP, but instead of
     recording the mean and uncertainty, we draw from the distribution
     defined by them and then add that point to S_predicted, so that
     it will be used for generating the cholstruct for the next
     prediction - effectively meaning that we condition the next
     evaluation on the previous one.

     2. We have observations and want to predict random points,
     specified by spec E. This works identically to #1, but now
     E->mode != 0 and the i'th "gridpoint" refers to the i'th
     prediction's location instead of a regular grid location.

     3. We have no observations, and we generate from the prior a grid
     of points based on spec E. Same as #1, but now there are no
     observations a priori in S.

     4. We have no observations and we generate a random set of points
     based on spec E. Same as #2, without observations a priori in S.

     In short: When we want to generate realizations of the GP (sample
     random functions), for videos or so, we use #1. When we need to
     generate random synthetic data, for e.g. testing maximum
     likelihood estimation, we use #2. When we want to sample from the
     prior for a video, we do #3, and when we want to sample prior at
     random locations, we do #4.

     For generating more realistic synthetic data, all you need to do
     is to take the result array of this and then add noise, and
     record that noise in another array.

     This function does NOT address the situation where we have a
     pre-computed GP, and we just want to generate realizations based
     on that. For that, a function filling a state object from the
     result and result_unc arrays would be needed, but that is not
     much work.

     Return value: a malloc'd array that is not freed here. In grid
     mode (E->mode == 0) it has E->ngp*E->maxdays sampled values, day
     by day, and is also written to data_from_gridded_GP_sample.txt.
     Otherwise it has 4 floats per point (lat, lon, t, sampled value)
     and is also written to data_from_random_GP_sample.txt.

     WARNING: Posterior simulation results depend on the thread
     count. In the grid-mode loop only the addition of a prediction to
     S_predicted is inside a critical section; the predictions
     themselves read S_predicted concurrently. Prior sampling on the
     ChinaSea region (100 days, 0.5deg res) produced max differences
     smaller than 0.02 ppm between thread counts (mean -0.012 ppm and
     std 0.022), due to the different order of the sampling. When the
     thread count is constant, the results are repeatable. It would be
     nice to fully understand this, though..
  */

  size_t j, J;
  float *newdata = NULL;
  float zf = 0; /*  For passing zero floats to functions when needed */
  struct boxmullerstruct *BM; /* For getting normally distributed random numbers */
  struct state *S_predicted = NULL; /* For adding predicted values */

  BM = initialize_boxmullerstruct();
  S_predicted = initialize_state(S_predicted, E, 0, E->ngp, 0.0001);

  /* We iterate so that we predict here and there all the time: the
     points are visited in steps of a prime stride modulo ndata. */
  size_t ndata = E->ngp * E->maxdays;
  size_t stride = get_largest_prime_under(ndata*9/10);
  size_t count = 0;

  /* The standard normal draws are made up front, in a fixed order,
     so that the number of threads does not affect which draw is used
     for which point. */
  float *normalrands = calloc(ndata, sizeof(float));
  if (normalrands == NULL && ndata > 0) {
    fprintf(stderr, "sample_from_GP: out of memory\n");
    exit(1);
  }
  for (J=0; J<ndata; J++) {
    normalrands[J] = normal(0, 1, BM);
  }

  /* Hide cursor so that it does not annoyingly blink */
  fputs("\e[?25l", stdout);

  if (E->mode == 0) { /* This is when we generate a regular grid */
    newdata = malloc(ndata*sizeof(float));
    if (newdata == NULL && ndata > 0) {
      fprintf(stderr, "sample_from_GP: out of memory\n");
      exit(1);
    }
#pragma omp parallel for private(j,J) shared(S,E,S_predicted,newdata,BM,count,stride,ndata,zf) num_threads(NTHREADS) schedule(runtime)
    for (J=0; J<ndata; J++) {
      /* Everything declared here is private to the iteration. The
         winds are needed for the call interface even if they are not
         used nor recorded. */
      size_t ind = (stride*J)%ndata;
      size_t day = ind/E->ngp; /* day index */
      size_t done;
      float tt, lat, lon, res_mean, res_unc;
      float uwind = 0;
      float vwind = 0;

      j = ind%E->ngp; /* grid index */
      tt = S->first_day_noon + day*24*3600;
      single_point_dyn_GP(j, tt, S, *E, &res_mean, &res_unc, &uwind, &vwind, S_predicted);
      newdata[ind] = res_mean + normalrands[ind]*sqrt(res_unc);

      /* add_datapoint_to_S() takes float pointers */
      lat = (float) E->lat[j/E->nlon];
      lon = (float) E->lon[j%E->nlon];

#pragma omp atomic capture
      done = ++count;
      if (!(done%10)) {
        printf("\rProgress: %f%% (%zu/%zu)               ", (float) done / (float) ndata * 100, done, ndata);
      }

#pragma omp critical
      add_datapoint_to_S(&lat, &lon, &tt, &(newdata[ind]), &zf, &zf, &zf, E, S_predicted);
    }
    write_1d_array_to_txt("data_from_gridded_GP_sample.txt", newdata, E->maxdays, E->ngp, 1);
  } else if (E->mode > 0) { /* Here we create a random sample of data */
    /* Only latitude, longitude, time, and dv are
       recorded. Uncertainty plays no role, and winds are zero for
       now. Of course winds could be set in create_cholstruct...*/
    newdata = malloc(4*E->ngp*sizeof(float));
    if (newdata == NULL && E->ngp > 0) {
      fprintf(stderr, "sample_from_GP: out of memory\n");
      exit(1);
    }
    for (j=0; j<E->ngp; j++) {
      float res_mean, res_unc, res_sampled;
      float uwind = 0;
      float vwind = 0;
      /* Float copies, since the function called below takes float pointers */
      float tt = (float) E->t[j];
      float lat = E->lat[j];
      float lon = E->lon[j];

      single_point_dyn_GP(j, E->t[j], S, *E, &res_mean, &res_unc, &uwind, &vwind, S_predicted);
      res_sampled = normal(res_mean, sqrt(res_unc), BM);
      /* Note: uncertainty is still zero here. Noise needs to be added later if wanted */
      add_datapoint_to_S_for_calibration(E, S_predicted, &lat, &lon, &tt,
                                         &zf, &zf, &res_sampled, &zf);
      newdata[j*4] = E->lat[j];
      newdata[j*4+1] = E->lon[j];
      newdata[j*4+2] = E->t[j];
      newdata[j*4+3] = res_sampled;
      /* FIXME added wind above should be the avgwind from
         create_cholstruct when wind kernel is used. But this is maybe
         difficult in practice. Can we discard zero-wind entries in
         cholstruct creation? */
    }
    write_1d_array_to_txt("data_from_random_GP_sample.txt", newdata, E->ngp, 4, 1);
  }
  fputs("\e[?25h", stdout); /* show cursor again */

  free(normalrands);
  /* NOTE(review): BM is not released here; how a boxmullerstruct is
     torn down is not visible in this file. */
  teardown_state(S_predicted, E);
  return newdata;
}
