/* This file contains mean function specifications for the Gaussian
   Process and associated routines, including finding local regression
   coefficients for the mean function. */

struct mfstruct {

  /* Selects the function that is subtracted from the data values
     (e.g. dv) to make the field close to zero - so that we can
     calculate it as a zero-mean GP. The values handled by
     initialize_mfstruct() and meanfun() are:

     0 - subtract nothing,
     1 - subtract a global mean, from a curve fitted to OCO-2 data
     2 - subtract an interpolation of results from a previous GP run
     3 - like 1, but with location-dependent beta coefficients for
	 the function global_mean_on_day() below
  */
  int mftype;

  /* Mean function to be evaluated (mftype 1 and 3). The first
     argument is the time as a day number (see meanfun()), and the
     second is the array of beta coefficients. Note that no gradient
     is stored in this struct: fit_beta_parameters() hard-wires
     grad_helper_global_mean_on_day() as the gradient. */
  float (*meanfunc)(float, float*);

  /* Number of beta coefficients taken by meanfunc, see
     e.g. global_mean_on_day call interface. It sizes the coefficient
     buffers in mf_from_local_betas() and in the fitting routines, and
     is the outer dimension of arr_2d. */
  uint ncoeff;

  /* With mftype == 1, arr_1d holds the coefficients beta that are
     constant everywhere.

     With mftype == 2, when a previous GP or other array is used as a
     mean function, then this is a pointer to that result array. It
     is read as arr_1d[day*refconfig->ngp + latidx*refconfig->nlon +
     lonidx], see mf_from_previous_GP().
  */
  float *arr_1d;

  /* Despite the name, this is a flattened 1d coefficient array: for a
     Markov random field we use this to access the beta coefficients
     for each location. The outer indexes are the ncoeff beta
     coefficients and the inner ones the grid points. This means that
     the second beta coefficient of grid index gi is accessed with
     mfs->arr_2d[mfs->refconfig->ngp + gi].
  */
  float *arr_2d;
  float *arr_2d_prec; /* For storing marginal mean function parameter
			precision matrices when optimizing. Element
			(j, k) of the matrix of grid index gi is
			written to arr_2d_prec[(j*ncoeff + k)*ngp + gi]
			in fit_beta_parameters_with_unc(). */

  /* config object containing the grid specification that is used when
     looking up / interpolating the mean function from a previous GP
     iteration (arr_1d, mftype == 2) or from the local beta
     coefficients (arr_2d, mftype == 3). */
  struct config *refconfig;

  /* These are the limits for optimizing parameters of the
     meanfunc(). Their length is ncoeff. initialize_mfstruct() stores
     them only for mftype == 3 and leaves them NULL otherwise.

  NOTE! For any meanfunction betas that are calibrated with the
  regularized least-squares approach (method == 2 in fitting betas
  with unc), those betas multiplying basis functions need to be first,
  and the phase etc. parameters that do not conform to the F(x)^T beta
  form need to be last! */

  double *beta_high, *beta_low;
};

void initialize_mfstruct(struct mfstruct *mfs, int mftype, float (*meanfunc)(float, float*),
			 uint ncoeff, double beta_low[], double beta_high[],
			 float *arr_1d, float *arr_2d, float *arr_2d_prec, struct config *refconfig) {

  /* Fills in *mfs for the requested mean function type. Only the
     members that the given mftype uses are taken from the arguments;
     every other member is reset to NULL / 0 so that the struct never
     carries indeterminate values.

     mftype 0: nothing else is stored.
     mftype 1: meanfunc, ncoeff and arr_1d (global coefficients).
     mftype 2: arr_1d (previous result array).
     mftype 3: meanfunc, ncoeff, arr_2d, arr_2d_prec and the
               optimization limits beta_low / beta_high.

     refconfig is stored for every mftype. The pointers are stored as
     given; nothing is copied. */

  mfs->mftype = mftype;
  mfs->refconfig = refconfig;
  mfs->meanfunc = NULL;
  mfs->ncoeff = 0;
  mfs->arr_1d = NULL;
  mfs->arr_2d = NULL;
  mfs->arr_2d_prec = NULL;
  mfs->beta_low = NULL;
  mfs->beta_high = NULL;

  switch (mftype) {
  case 0 : /* Subtract zero */
    break;
  case 1 : /* Subtract a global value from a fitted function */
    mfs->meanfunc = meanfunc;
    mfs->ncoeff = ncoeff;
    mfs->arr_1d = arr_1d;
    break;
  case 2: /* Subtract an array - such as previous GP stage results */
    mfs->arr_1d = arr_1d;
    break;
  case 3: /* Array of local values */
    mfs->meanfunc = meanfunc;
    mfs->ncoeff = ncoeff;
    mfs->arr_2d = arr_2d;
    mfs->arr_2d_prec = arr_2d_prec;
    mfs->beta_low = beta_low;
    mfs->beta_high = beta_high;
    break;
  default: /* Unknown types are rejected later by meanfun() */
    break;
  }

  /* Echo the limits that were actually stored. Testing the struct
     members (not the arguments) avoids dereferencing NULL when limits
     are passed together with an mftype that does not keep them, and
     looping to ncoeff avoids reading past arrays shorter than five. */
  if (mfs->beta_low && mfs->beta_high) {
    for (uint l=0; l<mfs->ncoeff; l++) {
      printf("%g, %g\n", mfs->beta_low[l], mfs->beta_high[l]);
    }
  }
}

float mf_from_previous_GP(float x, float y, float z, float daynumber, struct mfstruct *mfs) {
  /* Looks up the mean function value from the array of a previous
     Gaussian process stored in mfs->arr_1d. mfs->refconfig has to
     describe the grid of that previous GP. daynumber is truncated to
     an integer and selects the day slice of ngp values. */

  float lat, lon;
  uint latidx, lonidx;
  int dn = (int) daynumber;

  /* Convert the point to lat/lon and find the grid indexes returned
     for it by latlon_to_latidxlonidx(). */
  xyz_to_latlon(x, y, z, &lat, &lon);
  latlon_to_latidxlonidx(lat, lon, &latidx, &lonidx, mfs->refconfig);

  /* Off-grid points (either coordinate further than 1e-4 from the
     indexed grid coordinate) are interpolated within the day slice. */
  if (!(fabs(mfs->refconfig->lat[latidx] - lat) < 1e-4) ||
      !(fabs(mfs->refconfig->lon[lonidx] + 180 - lon) < 1e-4)) {
    return interp_xyzcoords(x, y, z, mfs->arr_1d + dn*mfs->refconfig->ngp, mfs->refconfig);
  }

  /* The point coincides with a grid point: read the value directly. */
  return mfs->arr_1d[dn*mfs->refconfig->ngp + latidx*mfs->refconfig->nlon + lonidx];
}

float mf_from_local_betas(float x, float y, float z, float daynumber, struct mfstruct *mfs) {
  /* This is a Markov random field-type case, where we have local
     betas for each spatial location. Each of the mfs->ncoeff
     coefficient fields in mfs->arr_2d (ngp values per coefficient) is
     interpolated to the point xyz according to mfs->refconfig, and
     mfs->meanfunc is then evaluated at daynumber with the
     interpolated coefficients.

     Returns the mean function value. Exits the program if the
     temporary coefficient buffer cannot be allocated. */

  size_t i;
  float val;
  float *coeff = malloc(mfs->ncoeff * sizeof *coeff);

  /* malloc(0) is allowed to return NULL, so only treat NULL as a
     failure when coefficients were actually requested. */
  if ((coeff == NULL) && (mfs->ncoeff > 0)) {
    fprintf(stderr, "mf_from_local_betas: could not allocate %u coefficients, exiting!\n",
	    (unsigned) mfs->ncoeff);
    exit(EXIT_FAILURE);
  }

  for (i=0;i<mfs->ncoeff;i++) {
    /* Field i starts at offset i*ngp of the flattened array */
    coeff[i] = interp_xyzcoords(x, y, z, &mfs->arr_2d[i*mfs->refconfig->ngp], mfs->refconfig);
  }

  val = mfs->meanfunc(daynumber, coeff);
  free(coeff);
  return val;

}

float global_mean_on_day(float daynumber, float *beta) {
  /* This is a mean function for the Gaussian process. The functional
     form fits global means of OCO-2 XCO2 data relatively well. See
     meanfun() for coefficients beta used in that case. */

  float x = daynumber/365.25*2*M_PI;
  return beta[0]*sinf(x + beta[4]) + beta[1]*cosf(2*x + beta[4]) + beta[3]*x + beta[2];
}

void grad_helper_global_mean_on_day(float daynumber, float *beta, double *grad) {
  /* Calculate the gradient of the function global_mean_on_day. Note
     that this still needs to be multiplied with twice the return
     value of global_mean_on_day(). */
  float x = daynumber/365.25*2*M_PI;
  grad[0] = -sinf(x + beta[4]);
  grad[1] = -cosf(2*x + beta[4]);
  grad[2] = beta[0]*cosf(x + beta[4]) + beta[1]*sinf(2*x + beta[4]);
  grad[3] = (double) -x;
  grad[4] = -1;
  /* printf("%f, %f, %f, %f, %f\n", grad[0], grad[1], grad[2], grad[3], grad[4]); */
}

float meanfun(float x, float y, float z, float tt, struct mfstruct  *mfs) {
  /* Mean function of the Gaussian process, evaluated at the point
     (x, y, z) and time tt according to the specification in *mfs.
     The values are subtracted from the observations (in
     e.g. gridGP()) to make the GP closer to zero-mean.

     Returns the mean function value; prints a message and exits with
     status -4 if mfs->mftype is not one of 0..3. */

  const int kind = mfs->mftype;

  /* Day number: how manyth day after the first day. First day noon is
     10004800 s. FIXME REMOVE THIS CONSTANT, TIME SHIFTING SHOULD BE
     NORMALIZED TO FIRST DAY STARTING FROM 0 WHEN DATA IS READ */
  const float daynum = (tt - 10004800 + 43200.)/86400;

  if ((kind < 0) || (kind > 3)) {
    printf("Invalid meanftype %i, exiting!", kind);
    exit(-4);
  }

  if (kind == 0) { /* Subtract zero */
    return 0;
  }
  if (kind == 1) { /* Global fitted function with constant betas FIXME */
    return mfs->meanfunc(daynum, mfs->arr_1d);
  }
  if (kind == 2) { /* Value from a previous time-dependent Gaussian Process */
    return mf_from_previous_GP(x, y, z, daynum, mfs);
  }

  /* kind == 3: fitted function with location-dependent betas */
  return mf_from_local_betas(x, y, z, daynum, mfs);
}


/* Routines and code for learning the beta parameters here */
#include <nlopt.h>

/* Auxiliary data handed to the NLopt objective meanfloss() through its
   void pointer argument. It is filled in by fit_beta_parameters(). */
struct beta_opt_auxpars{
  uint ndata;  /* Number of datapoints in arrays t and data */
  float *t;    /* Independent variable: observation times, converted
		  to day numbers in meanfloss() before they are passed
		  to meanfunc */
  float *data; /* Dependent variable: observed values that the mean
		  function output is compared against */
  uint ncoeff; /* Number of betas, this is hackish. */
  int nnbs;    /* Number of neighbors, whose values we take as priors */
  float **nbs; /* Arrays of the betas for the grid indexes of those
		  neighbors; nbs[j][i] is beta i of neighbor j */
  double *gradtmp; /* Array of ncoeff values for holding temporary data
		      for gradient if loss function requires a gradient
		      evaluation. After such an evaluation it holds a
		      copy of the returned gradient. */

  float (*meanfunc)(float, float*); /*mean function to optimize the parameters for */
  void (*meanfunc_grad)(float, float*, double*); /* gradient of the mean function */
};


double meanfloss(uint n, const double *x, double *grad, void *aux) {
  /* NLopt objective: MRF likelihood contribution from one vertex and
     its adjacent edges. Both parts are squared errors (so Gaussian),
     but no normalization is done - not even for the edge
     potentials. FIXME: The different betas should ultimately have
     different weights, and edge weights should vary inversely with
     vertex distances from each other.

     x holds the current parameter values and aux points to a struct
     beta_opt_auxpars. The number of parameters is taken from that
     struct; n is not used. When NLopt runs this with derivative-free
     algorithms, grad is NULL and the gradient sections are skipped;
     otherwise the gradient is written to grad and a copy of it is
     left in gradtmp of the aux struct.

     Returns the weighted data misfit plus the edge potentials. */

  struct beta_opt_auxpars *pars = (struct beta_opt_auxpars *) aux;
  const int want_grad = (grad != NULL);
  size_t c, k, obs;
  double loss = 0;
  double resid, day;
  float prior_penalty = 0;

  /* The mean function takes floats, so the parameters are narrowed */
  float beta_f[pars->ncoeff];

  /* Edge potentials: squared distance to each neighbor's betas */
  for (c=0; c<pars->ncoeff; c++) {
    beta_f[c] = x[c];
    if (want_grad) {
      grad[c] = 0;
    }
    for (k=0; k<pars->nnbs; k++) {
      const float diff = beta_f[c] - pars->nbs[k][c];
      prior_penalty += powf(diff, 2);
      if (want_grad) {
	grad[c] += 2*diff;
      }
    }
  }

  /* The data term gets full weight up to 30 observations and is
     scaled down as 30/ndata beyond that. */
  const float weight = fminf(1., 30./pars->ndata);

  for (obs=0; obs<pars->ndata; ++obs) {
    /* Daynumber = (t - 10004800 + 43200.)/86400, as meanfunc requires */
    day = (pars->t[obs] - 10004800 + 43200.)/86400;
    resid = -(pars->meanfunc(day, beta_f) - pars->data[obs]);
    loss += weight*resid*resid;
    if (!want_grad) {
      continue;
    }
    /* meanfunc_grad fills gradtmp; scale by twice the residual */
    pars->meanfunc_grad(day, beta_f, pars->gradtmp);
    for (c=0; c<pars->ncoeff; c++) {
      grad[c] += (double) pars->gradtmp[c]*2*resid*weight;
    }
  }

  /* Keep a copy of the final gradient for reporting by the caller */
  if (want_grad) {
    for (c=0; c<pars->ncoeff; c++) {
      pars->gradtmp[c] = grad[c];
    }
  }

  /* Dividing the data misfit by ndata was done in optimization-only
     but leads to Gibbs phenomenon if prior is weighted too much with
     proper MRF calculation, so it is not done here. */
  return loss + (double) prior_penalty;
}

void fit_beta_parameters(struct mfstruct *mfs, float *x0, uint ndata, float *t, float *data, int nnbs, float **nbs, int npars, int *paridx) {

  /* Fits the mfs->ncoeff parameters beta of mfs->meanfunc to data by
     minimizing meanfloss() with NLopt (L-BFGS).

     x0     - initial parameter values (ncoeff of them); overwritten
              with the optimized values on return.
     ndata  - number of (t, data) pairs in the training set.
     t      - independent variable: times of the observations.
     data   - dependent variable, e.g. CO2 concentration close to the
              grid location at the moments t.
     nnbs   - number of neighbors whose betas act as priors.
     nbs    - nbs[j] is the beta array of neighbor j.
     npars  - if -1, all ncoeff parameters are optimized within
              mfs->beta_low / mfs->beta_high. If a positive integer,
              only that many parameters are optimized, and their
              indexes are given in paridx; every other parameter is
              pinned to its x0 value by setting both limits to it.
     paridx - indexes of the parameters to optimize when npars > 0.

     After the optimization the fifth parameter (index 4) is treated
     as a trigonometric phase and shifted by pi towards zero if it is
     further than 0.85*pi away; this is skipped if there are fewer
     than five parameters. */

  const uint ncoeff = mfs->ncoeff;
  uint i;
  int j;
  double x[ncoeff];
  double optlim_low[ncoeff];
  double optlim_high[ncoeff];
  struct beta_opt_auxpars aux;
  nlopt_opt opt;

  /* Make sure npars and paridx were set right */
  assert((npars == -1) ||
	 ((npars > 0) && ((uint) npars <= ncoeff) && (paridx != NULL)));

  aux.ndata = ndata;
  aux.t = t;
  aux.data = data;
  aux.meanfunc = mfs->meanfunc;
  aux.ncoeff = ncoeff;
  aux.nnbs = nnbs;
  aux.nbs = nbs;
  /* Scratch space for meanfloss() gradient evaluations; afterwards it
     holds the last gradient, which is reported below. */
  aux.gradtmp = calloc(ncoeff, sizeof *aux.gradtmp);
  if ((aux.gradtmp == NULL) && (ncoeff > 0)) {
    fprintf(stderr, "fit_beta_parameters: could not allocate gradient buffer, exiting!\n");
    exit(EXIT_FAILURE);
  }

  /* Set the meanfunction gradient here
     FIXME, SHOULD THIS BE MOVED TO MFSTRUCT? */
  aux.meanfunc_grad = &grad_helper_global_mean_on_day;

  /* Set initial values and limits. A parameter is free if all
     parameters are optimized (npars == -1) or if it is listed in
     paridx; otherwise both limits are its initial value. */
  for (i=0; i<ncoeff; i++) {
    int is_free = (npars == -1);
    for (j=0; !is_free && (j<npars); j++) {
      if ((uint) paridx[j] == i) {
	is_free = 1;
      }
    }
    x[i] = x0[i];
    optlim_low[i] = is_free ? mfs->beta_low[i] : x0[i];
    optlim_high[i] = is_free ? mfs->beta_high[i] : x0[i];
  }

  /* Algorithm and dimensionality. Looks like BFGS and Nelder-Mead
     work well, but since BFGS uses gradients, it's possibly a little
     faster. Nelder-Mead (NLOPT_LN_NELDERMEAD) is pretty fast too. */
  opt = nlopt_create(NLOPT_LD_LBFGS, ncoeff);
  nlopt_set_lower_bounds(opt, optlim_low);
  nlopt_set_upper_bounds(opt, optlim_high);
  nlopt_set_min_objective(opt, meanfloss, &aux);
  nlopt_set_ftol_rel(opt, 1e-9);
  nlopt_set_maxeval(opt, 1000);

  if (DEBUG) {
    for (i=0; i<ncoeff; i++) {
      printf("VALS / xlow, x, xhigh IN: %g, %g, %g\n", mfs->beta_low[i], x[i], mfs->beta_high[i]);
    }
  }

  double minf; /* the minimum objective value, upon return */
  int s = nlopt_optimize(opt, x, &minf);

  /* No one wants to read all this stuff anyway, so it is debug only. */
  if (DEBUG) {
    if (s < 0) {
      printf("nlopt failed with status %d!\n", s);
    } else {
      printf("found minimum at f(");
      for (i=0; i<ncoeff; i++) {
	printf("%s%g", i ? ", " : "", x[i]);
      }
      printf(") = %0.10g\n", minf);
    }
    printf("last gradient: ");
    for (i=0; i<ncoeff; i++) {
      printf("%s%g", i ? ", " : "", aux.gradtmp[i]);
    }
    printf("\n");
  }

  /* Copy data to original array */
  for (i=0; i<ncoeff; i++) {
    x0[i] = (float) x[i];
  }

  // FIXME THIS SHOULD BE A CONFIG OPTION!
  /* Let's keep the periodic functions close to zero. The values of
     the trigonometric components have period of 2*M_PI, but then
     again the corresponding betas can just change sign, so they can
     compensate after. Smoother fields are produced this way. */
  int changed = 0;
  if (ncoeff > 4) {
    if (x0[4] > M_PI*.85) {
      x0[4] = x0[4] - M_PI;
      changed = 1;
    }
    if (x0[4] < -M_PI*.85) {
      x0[4] = x0[4] + M_PI;
      changed = 1;
    }
  }
  if (DEBUG && changed) {
    printf("Fifth function parameter changed. If your fifth parameter is NOT a phase parameter of a trigonometric function (with phase of 2 PI, 1 PI, .5 PI etc.), you'll need to uncomment the lines around this comment in mean_functions.h.\n");
  }

  if (DEBUG) {
    for (i=0; i<ncoeff; i++) {
      printf("VALS / xlow, x, xhigh OUT: %g, %g, %g\n", mfs->beta_low[i], x0[i], mfs->beta_high[i]);
    }
  }

  nlopt_destroy(opt);
  free(aux.gradtmp);
}


void multiply_gaussians(uint n, float *mu1, float *P1, float *mu2, float *P2) {
  /* Multiplies two Gaussians of dimension n, given by their means mu1,
     mu2 and their n x n row-major precision matrices P1, P2 (only the
     upper triangles are read by the symmetric products).

     On return
       mu1 holds the mean of the product, (P1 + P2)^-1 (P1 mu1 + P2 mu2),
       P1  holds the precision of the product, P1 + P2 (all n*n elements),
       P2  holds what the LU inversion of P1 + P2 left in it, i.e. the
           inverse when the inversion succeeded,
       mu2 is not modified.

     The LAPACK status is not acted upon: if P1 + P2 is singular, mu1
     is still computed from whatever is left in P2. A message is
     printed in that case when DEBUG is set. Exits the program if the
     work arrays cannot be allocated. */

  uint i;

  if (DEBUG) {
    print_matrix(P1, n, n, "P1", 0);
    print_matrix(P2, n, n, "P2", 0);
    print_matrix(mu1, 1, n, "mu1", 0);
    print_matrix(mu2, 1, n, "mu2", 0);
  }

  float *v1 = malloc(n*sizeof(float));
  float *v2 = malloc(n*sizeof(float));
  int *ipiv = malloc(n*sizeof(int));
  if ((n > 0) && (!v1 || !v2 || !ipiv)) {
    fprintf(stderr, "multiply_gaussians: could not allocate work arrays, exiting!\n");
    exit(EXIT_FAILURE);
  }

  /* Calculate v1 = P1@mu1 and v2 = P2@mu2 */
  cblas_ssymv(CblasRowMajor, CblasUpper, n, 1, P1, n, mu1, 1, 0, v1, 1);
  cblas_ssymv(CblasRowMajor, CblasUpper, n, 1, P2, n, mu2, 1, 0, v2, 1);

  if (DEBUG) {
    print_matrix(v1, 1, n, "v1", 0);
    print_matrix(v2, 1, n, "v2", 0);
  }

  /* After this, both P1 and P2 are sums of P1 and P2. This is
     needed, since P1 is the updated precision, and P2 is overwritten
     by the inversion below. */
  for (i=0; i<n*n; i++) {
    P2[i] += P1[i];
    P1[i] = P2[i];
  }
  for (i=0; i<n; i++) {
    v1[i] += v2[i];
  }

  if (DEBUG) {
    print_matrix(P1, n, n, "P12", 0);
    print_matrix(P2, n, n, "P12 second", 0);
    print_matrix(v1, 1, n, "v12", 0);
  }

  /* FIXME LATER. Cholesky (spotrf/spotri) failed sometimes due to
     instability, as the precision matrix was not always positive
     definite, so LU factorization is used for the inversion
     instead. Solving with sgesv would be faster; check that it works
     correctly before using it. */
  int info_factor = LAPACKE_sgetrf(LAPACK_ROW_MAJOR, n, n, P2, n, ipiv);
  int info_invert = LAPACKE_sgetri(LAPACK_ROW_MAJOR, n,  P2, n, ipiv);
  if (DEBUG && ((info_factor != 0) || (info_invert != 0))) {
    printf("multiply_gaussians: LU inversion failed (sgetrf %d, sgetri %d)\n",
	   info_factor, info_invert);
  }

  /* mu1 = (P1 + P2)^-1 @ (P1@mu1 + P2@mu2). The last scalar (beta) is
     zero, so the previous contents of mu1 are not used. */
  cblas_ssymv(CblasRowMajor, CblasUpper, n, 1, P2, n, v1, 1, 0, mu1, 1);

  if (DEBUG) {
    print_matrix(P2, n, n, "InvP3", 0);
    print_matrix(mu1, 1, n, "mu1 after", 0);
  }

  free(ipiv);
  free(v1);
  free(v2);
}


void fit_beta_parameters_with_unc(struct state *S, struct config *E, size_t r, int nnbs, float **nbcoeffs,
				  float **nbcoeffs_prec, float *nbdistanceweights, float dscale) {

  /*
     Updates the mean function parameters of grid index r, and the
     precision matrix of their beta part, in E->mfs->arr_2d and
     E->mfs->arr_2d_prec.

     With F_ij the jth basis function of the mean function at the ith
     observation, K the observation covariance matrix, and beta0 /
     Prec the prior mean and precision collected from the neighbors,
     the betas are set to

       beta0 + (F^T K^-1 F + Prec)^-1 F^T K^-1 (dv - F beta0)

     and the stored precision is F^T K^-1 F + Prec.

     S      - state holding the observations (x, y, z, t, dv).
     E      - config holding the grid (ngp, gridres) and mfs.
     r      - grid index whose parameters are updated.
     nnbs   - number of neighbors acting as priors.
     nbcoeffs[k]      - parameter means of neighbor k.
     nbcoeffs_prec[k] - precision matrix of neighbor k, row-major
                        with a row length of ncoeff.
     nbdistanceweights[k] - distance weight of neighbor k.
     dscale - the distance scale. It is gridres-independent.

     The steps are: 1. optimize all parameters with NLopt to get a
     point estimate (notably of the non-beta phase parameter),
     2. combine the neighbor priors, 3. update the betas with the
     observations, 4. re-optimize only the non-beta parameters.
     If there are no more than ncoeff observations, only step 2 is
     done and the prior values are stored.

     NOTE: the construction of F below assumes the form of
     global_mean_on_day(), i.e. four betas followed by one phase
     parameter at index 4.

     As a side effect, the observations used are written to a text
     file named after r.
  */

  size_t i, j, k;
  float t;
  float *arr_2d = E->mfs->arr_2d; /* shorthand */
  float *arr_2d_prec = E->mfs->arr_2d_prec; /* shorthand */

  int nnonbetas = 1; /* Number of mean function parameters that are
			NOT to be taken to account in F - e.g. any
			phase shifts in trigonometric functions. */
  int nbetas = E->mfs->ncoeff - nnonbetas;

  /* Prior precision matrix, scratch precision for one neighbor, and
     prior mean */
  float *Prec = calloc(nbetas*nbetas, sizeof(float));
  float *tmp = calloc(nbetas*nbetas, sizeof(float));
  float *priormeanbetas = calloc(nbetas, sizeof(float));
  /* df for distance factor. Controls how much we listen to the prior:
     This should change according to grid resolution */
  float df;
  /* Don't calculate coefficients if we have less than this amount of
     data. The optimization can give crazy values and just fail
     otherwise. */
  size_t nobs_threshold = E->mfs->ncoeff;

  /* Observations used for fitting come from the cholstruct */
  struct cholstruct *C = create_cholstruct(S, E, r, -999, NULL, 1, NULL, NULL);

  /* Output the observations in C to a file*/
  if (1) { /* FIXME make this into a config option */
    size_t ncol = 4;
    /* get the number r as str */
    int length = snprintf(NULL, 0, "%zu", r);
    char* str = malloc(length + 1);
    snprintf(str, length + 1, "%zu", r);
    float tmplat;
    float tmplon;

    /* One row of lat, lon, time, value per observation */
    float *buf = malloc(C->nobs*ncol*sizeof(float));
    for  (i=0; i<C->nobs; i++) {
      xyz_to_latlon(S->x[C->obsind[i]], S->y[C->obsind[i]],
		    S->z[C->obsind[i]], &tmplat, &tmplon);
      buf[ncol*i] = tmplat;
      buf[ncol*i+1] = tmplon;
      buf[ncol*i+2] = S->t[C->obsind[i]];
      buf[ncol*i+3] = S->dv[C->obsind[i]];
    }

    write_1d_array_to_txt(str, buf, C->nobs, ncol, 0);
    free(str);
    free(buf);
  }

  /* nbetas numbers needed for computing the beta coefficients, and
     with optimization ncoeff space is needed */
  float *means = malloc(E->mfs->ncoeff * sizeof(float));

  float *t2 = NULL;
  float *dv_orig = NULL;

  /* Fill all the means with the current best estimate of the
     parameters. (Starting from the average of the neighbor values
     instead was tried, but failed for an unknown reason - possibly a
     race condition.) */
  for (j=0; j<E->mfs->ncoeff; j++) {
    means[j] = arr_2d[j*E->ngp + r];
  }


  /* --- 1. Find optimal starting value of all parameters with NLOpt --- */

  if (C->nobs > nobs_threshold) {
    /* Construct the time and dv arrays. */
    t2 = malloc(C->nobs * sizeof(float));
    /* prior mean has been subtracted from dv, so it's no good for
       this calculation. For cleanness and clarity we create a new
       array for the optimization. */
    dv_orig = malloc(C->nobs * sizeof(float));
    for (j=0; j<C->nobs; j++) {
      t2[j] = S->t[C->obsind[j]];
      dv_orig[j] = S->dv[C->obsind[j]];
    }

    /* Run optimization over ALL parameters, to get point estimates of
       the non-beta type parameters */

    /* FIXME no distance weighting currently for prior in
       optimization, no proper prior weighting */
    fit_beta_parameters(E->mfs, means, (uint) C->nobs, t2, dv_orig, nnbs, nbcoeffs, -1, NULL);

    /* Record the optimized coefficients in arr_2d. The beta part will
       be still used for calculation of the F matrix used in the
       generalized Tikhonov regularization below.

       status tells whether the optimization stayed off the limits
       and produced no nans; it is currently not acted upon, see
       below. */

    int status = 1;
    for (j=0; j<E->mfs->ncoeff; j++) {
      if ((means[j] == E->mfs->beta_high[j]) || (means[j] == E->mfs->beta_low[j])
	  || (!isnormal(means[j]))) {
	status = 0;
	break;
      }
    }
    /* You can try to restrict the values by putting if (status) here
       instead, but almost certainly the fit is not going to be good
       then */
    if (1) {
      if (DEBUG) {
	printf("optimized values for gi %zu: ", r);
      }
      for (j=0; j<E->mfs->ncoeff; j++) {
	arr_2d[j*E->ngp + r] = means[j];
	if (DEBUG) {
	  printf("%f, ", means[j]);
	}
      }
      if (DEBUG) {
	printf("\n");
      }
    }
  }

  /* --- 2. Calculate the effect of the prior and edges --- */

  /* These parameters determine the distance scaling factor
     limits. Notably, if dfmax is too large, the algorithm starts to
     misbehave (somehow precision gets too large and accumulates
     leading to diagonal bands.). The value of 0.3 is safe, and 0.5
     sometimes already fails. Of course this depends on the actual way
     of calculating the distance scaling below. */
  float dfmin = 0.0001;
  float dfmax = 1.;

  if (nnbs > 0) { /* Unpack first neighbor to prec matrix */
    df = fmax(dfmax*expf(-fabsf(nbdistanceweights[0]*E->gridres/dscale)), dfmin);
    if (DEBUG) {
      printf("distance weight, r,dscale, nbdw: %f %zu, %f, %f\n", df, r, dscale, nbdistanceweights[0]);
    }

    for (i=0; i<nbetas; i++) {
      priormeanbetas[i] = nbcoeffs[0][i];
      if (DEBUG) {
	printf("Neighbor priormeanbeta: %f\n", priormeanbetas[i]);
      }
      for (j=0; j<nbetas; j++) {
	/* Neighbor precisions have ncoeff columns; take the beta part */
	Prec[i*nbetas + j] =  df*nbcoeffs_prec[0][i*E->mfs->ncoeff + j];
      }
    }
  }
  /* Update Prec with other neighbors to get total prior precision */
  for (k=1; k<nnbs; k++) {
    df = fmax(dfmax*expf(-fabsf(nbdistanceweights[k]*E->gridres/dscale)), dfmin);
    if (DEBUG) {
      printf("distance weight, r,dscale, nbdw: %f %zu, %f, %f\n", df, r, dscale, nbdistanceweights[k]);
    }
    for (i=0; i<nbetas; i++) {
      for (j=0; j<nbetas; j++) {
	tmp[i*nbetas + j] = df*nbcoeffs_prec[k][i*E->mfs->ncoeff + j];
      }
    }

    multiply_gaussians((uint) nbetas, priormeanbetas, Prec, nbcoeffs[k], tmp);
    if (DEBUG) {
      print_matrix(Prec, nbetas, nbetas, "Prec", 0);
      print_matrix(priormeanbetas, 1, nbetas, "Prior Mean", 0);
    }
  }

  /* Set prior for nonlinear regression here */
  float *constpriorprec = calloc(nbetas*nbetas, sizeof(float));
  float *constpriormean = calloc(nbetas, sizeof(float));

  /*  Using flat prior by setting prior precision to zero. Use
      something nonzero to use a prior, e.g.
      powf(.01*(beta_high[i] - beta_low[i]), -2) on the diagonal. */
  for (i=0; i<nbetas; i++) {
    constpriormean[i] = .5*(E->mfs->beta_high[i] + E->mfs->beta_low[i]);
    constpriorprec[i*(nbetas + 1)] = 0;
  }

  multiply_gaussians((uint) nbetas, priormeanbetas, Prec, constpriormean, constpriorprec);

  /* --- 3. Update mean and precision with observations --- */

  /* Do matrix calculations here only if we have at least some
     data. Otherwise will get a million warnings from LAPACK */
  if (C->nobs > nobs_threshold) {
    float *dv = malloc(C->nobs*sizeof(float));
    float *K = calloc(C->nobs*C->nobs, sizeof(float));
    for (i=0; i<C->nobs; i++) {
      dv[i] = (float) C->dv[i];
      for (j=0; j<C->nobs; j++) {
	K[i*C->nobs + j] = (float) C->K[i*C->nobs + j];
      }
    }
    if (DEBUG) {
      write_1d_array_to_txt("K.txt", K, C->nobs, C->nobs, 0);
    }

    /* Invert K in place through its Cholesky factorization */
    LAPACKE_spotrf(LAPACK_ROW_MAJOR, 'U', (lapack_int)C->nobs, K, (lapack_int)C->nobs);
    LAPACKE_spotri(LAPACK_ROW_MAJOR, 'U', (lapack_int)C->nobs, K, (lapack_int)C->nobs);
    if (DEBUG) {
      write_1d_array_to_txt("Kinv.txt", K, C->nobs, C->nobs, 0);
    }
    /* Construct F. This is tied to the particular form of mf used,
       unfortunately. */
    float *F = calloc(C->nobs*nbetas, sizeof(float));
    for (i=0; i<C->nobs; i++) {
      /* Get the time of the observation that we are looking at */
      t = (float) (S->t[C->obsind[i]] - 10004800 + 43200)/86400./365.25*2*M_PI;
      F[i*nbetas] = (float)sinf(t + arr_2d[r + 4*E->ngp]);
      F[i*nbetas + 1] = cosf(2*t + arr_2d[r + 4*E->ngp]);
      F[i*nbetas + 2] = 1.;
      F[i*nbetas + 3] = t;
    }

    /* matrix sizes:
       F: nobs x nbetas
       C->K nobs x nobs
       T: nbetas x nobs
	 => Note! T transpose is used instead,
	 with dimensions nobs x nbetas
       D: nbetas x nbetas
       DT: nbetas x nobs
	 => Note! DT transpose is used instead,
	 with dimensions nobs x nbetas
       Prec: nbetas x nbetas (prior precision from messages)
    */

    if (DEBUG) {
      write_1d_array_to_txt("F.txt", F, C->nobs, nbetas, 0);
      print_matrix(F, C->nobs, nbetas, "F MATRIX", 0);
    }

    /* Note! LAPACK Leading dimension seems to refer to number of
       _columns_ in row-major ordering. */

    /* Calculate T = F^T K^-1. What is stored is K^-1@F, i.e. the
       transpose of T. */
    float *T = calloc(C->nobs*nbetas, sizeof(float));
    cblas_ssymm(CblasRowMajor, CblasLeft, CblasUpper, C->nobs,  nbetas, 1., K, (int) C->nobs, F, nbetas, 0, T, nbetas);

    if (DEBUG) {
      write_1d_array_to_txt("T.txt", T, C->nobs, nbetas, 0);
    }

    /* Calculate D = T F, the data part of the precision */
    float *D = calloc(nbetas*nbetas, sizeof(float));
    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, nbetas, nbetas, C->nobs, 1., T, nbetas, F, nbetas, 0., D, nbetas);
    if (DEBUG) {
      print_matrix(D, nbetas, nbetas, "PREC1 TO BE WRITTEN",  0);
    }

    /* Add prior precision matrix to D before inverting to get
       covariance of betas */
    for (i=0; i<nbetas*nbetas; i++) {
      D[i] += Prec[i];
    }

    /* Store precision before it is turned into covariance */
    for (j=0; j<nbetas; j++) {
      for (k=0; k<nbetas; k++) {
	/* Symmetric, but we pack a nbetas x nbetas matrix into one of
	   size ncoeff x ncoeff */
	arr_2d_prec[(j*E->mfs->ncoeff + k)*E->ngp + r] = D[j*nbetas + k];
	arr_2d_prec[(k*E->mfs->ncoeff + j)*E->ngp + r] = D[j*nbetas + k]; /* transpose */
      }
    }

    /* Invert D to get covariance. Should be symmetric so this function
       is fine again */
    LAPACKE_spotrf(LAPACK_ROW_MAJOR, 'U', nbetas, D, nbetas);
    LAPACKE_spotri(LAPACK_ROW_MAJOR, 'U', nbetas, D, nbetas);
    if (DEBUG) {
      write_1d_array_to_txt("DwithInv.txt", D, nbetas, nbetas, 0);
    }
    /* Dinv is roughly ok. With 150 days data the relative error is
       around 1e-5. Morale of the story: use enough data / enough
       spread in the time stamps of observations. */

    /* Calculate D@T. This is really T.T@D, and yields DT.T */
    float *DT = calloc(C->nobs*nbetas, sizeof(float));
    cblas_ssymm(CblasRowMajor, CblasRight, CblasUpper, C->nobs, nbetas, 1., D, nbetas, T, nbetas, 0, DT, nbetas);
    if (DEBUG){
      write_1d_array_to_txt("DT.txt", DT, C->nobs, nbetas, 0);
    }
    /* With 150 days, relative errors are very small, except in the
       rare case when divisor is very close to zero, and relative
       error might then be even 30%, but absolute error is always very
       small. */

    /* Calculate -F@priormeanbetas + dv as needed in generalized
       Tikhonov regularization. */
    cblas_sgemv(CblasRowMajor, CblasNoTrans, C->nobs, nbetas, -1., F, nbetas, priormeanbetas, 1, 1, dv, 1);

    /* Calculate the last product between DT and (y - F@beta). Only
       the first nbetas entries of means are written; the non-beta
       entries keep their optimized values. */
    cblas_sgemv(CblasRowMajor, CblasTrans, C->nobs, nbetas, 1., DT, nbetas, dv, 1, 0, means, 1);

    /* Add prior means to get the means */
    if (DEBUG) {
      printf("Additions to mean, starting at %f, %f, %f, %f: \n",
	     means[0], means[1], means[2], means[3]);
    }

    for (i=0; i<nbetas; i++) {
      means[i] += priormeanbetas[i];
      if (DEBUG) {
	printf("%f, ", priormeanbetas[i]);
      }
    }
    if (DEBUG) {
      printf("\n");
      print_matrix(means, 1, nbetas, "new mean", 0);
    }

    /* --- 4. Optimize finally over the non-betas only --- */

    /* Create array containing indexes of non-beta parameters; they
       are the last nnonbetas parameters. */
    int *nonbetas = malloc(nnonbetas*sizeof(int));
    for (j=0; j<nnonbetas; j++) {
      nonbetas[j] = nbetas + j;
    }
    /* Parameter changes are restricted to only the nonbetas here, and
       no neighbor priors are used. */
    fit_beta_parameters(E->mfs, means, (uint) C->nobs, t2, dv_orig, 0, nbcoeffs, nnonbetas, nonbetas);
    free(nonbetas);

    /* Store the mean */
    for (j=0; j<E->mfs->ncoeff; j++) {
      arr_2d[j*E->ngp + r] = means[j];
    }

    /* --- 5. Clean up the arrays of this branch --- */
    free(t2);
    free(dv_orig);
    free(F);
    free(T);
    free(D);
    free(DT);
    free(dv);
    free(K);
  } else { /* We go with the prior values, as there was no data. */
    printf("No data, going with prior values!\n");
    for (j=0; j<nbetas; j++) {
      arr_2d[j*E->ngp + r] = priormeanbetas[j];
      for (k=0; k<nbetas; k++) {
	/* Symmetric, but we pack a nbetas x nbetas matrix into one of
	   size ncoeff x ncoeff */
	arr_2d_prec[(j*E->mfs->ncoeff + k)*E->ngp + r] = Prec[j*nbetas + k];
	arr_2d_prec[(k*E->mfs->ncoeff + j)*E->ngp + r] = Prec[j*nbetas + k]; /* transpose */
      }
    }
  }

  /* FIXME Add still optimization over the delta parameter for a
     missing corner case. */

  /* Arrays allocated on both paths are released here, so that neither
     branch leaks them. */
  free(means);
  free(constpriorprec);
  free(constpriormean);
  free(Prec);
  free(tmp);
  free(priormeanbetas);
  teardown_cholstruct(C);
}


void single_MRF_marginal(struct state *S, struct config *E, size_t gi, float dscale) {
  /* Calculates a single marginal for the beta parameters and an
     optimum for the delta parameters for the grid point gi.

     The function collects the (at most four) grid neighbors of gi,
     copies their current coefficient means and precision matrices
     out of mfs->arr_2d and mfs->arr_2d_prec, and hands everything to
     fit_beta_parameters_with_unc(), which does the actual fitting.

     S, E and dscale are passed through unchanged to
     fit_beta_parameters_with_unc().

     The grid is indexed row by row with E->nlon points per row, so
     gi%E->nlon is the position within the row and gi/E->nlon the row
     (the latter is also used to index E->lat).

     A neighbor is skipped when
       - it would fall outside the grid (gi is on the border), or
       - the first element of its precision matrix is zero, assuming
	 that nothing has been calculated for that point yet and that
	 it should therefore not be used for regularization.
     That is why the number of neighbors actually used (nnbs) has to
     be tracked separately.

     On allocation failure the function prints a message to stderr
     and terminates the program.
  */

  struct mfstruct *mfs = E->mfs; /* shorthand */
  size_t k, k2, l;
  int nnbs = 0, nb;

  /* The scratch arrays below always have four slots (max 4
     neighbors), so they live on the stack instead of being
     malloc'ed for every grid point. */

  /* nbinds contains grid indexes of neighbors */
  size_t nbinds[4];

  /* Neighbor coefficient arrays for current point: these are mean
     values and precision matrices */
  float *nbcoeffs[4];
  float *nbcoeffs_prec[4];

  /* Grid points are closer to each other closer to poles. Prior
     weight is scaled as negative half exponent of distance. Therefore
     for neighbors with grid indexes gi + 1 and gi - 1 this number is
     scaled down by sin(90-lat). Unused slots stay at zero. */
  float nbdistanceweights[4] = {0};

  /* Same row, previous point. The border test comes first so that
     the precision array is never read at gi-1 when gi starts a row. */
  if ((gi%E->nlon != 0) && mfs->arr_2d_prec[gi-1]) {
    nbdistanceweights[nnbs] = sinf((90.-E->lat[gi/E->nlon])/180*M_PI);
    nbinds[nnbs++] = gi - 1;
  }
  /* Same row, next point */
  if ((gi%E->nlon != E->nlon - 1) && mfs->arr_2d_prec[gi+1]) {
    nbdistanceweights[nnbs] = sinf((90.-E->lat[gi/E->nlon])/180*M_PI);
    nbinds[nnbs++] = gi + 1;
  }
  /* Previous row, same position */
  if ((gi/E->nlon != 0) && mfs->arr_2d_prec[gi-E->nlon]) {
    nbdistanceweights[nnbs] = 1;
    nbinds[nnbs++] = gi - E->nlon;
  }
  /* Next row, same position */
  if ((gi/E->nlon != E->nlat - 1) && mfs->arr_2d_prec[gi+E->nlon]) {
    nbdistanceweights[nnbs] = 1;
    nbinds[nnbs++] = gi + E->nlon;
  }

  /* All four slots are allocated and zeroed even when fewer neighbors
     are in use, so that unused slots are valid, zero-filled arrays. */
  for (l=0; l<4; l++) { /* Max 4 neighbors */
    nbcoeffs[l] = calloc(mfs->ncoeff, sizeof *nbcoeffs[l]);
    nbcoeffs_prec[l] = calloc(mfs->ncoeff*mfs->ncoeff, sizeof *nbcoeffs_prec[l]);
    /* calloc may legitimately return NULL for a zero-sized request,
       hence the ncoeff test. */
    if (mfs->ncoeff && (!nbcoeffs[l] || !nbcoeffs_prec[l])) {
      fprintf(stderr, "single_MRF_marginal: out of memory\n");
      exit(EXIT_FAILURE);
    }
  }

  /* nbcoeffs[nb][k] has the means, and nbcoeffs_prec[nb][k*ncoeff + k2]
     has the [k,k2] element of the precision matrix of the nb'th neighbor */
  for (nb=0; nb<nnbs; nb++) {
    for (k=0; k<mfs->ncoeff; k++) {
      nbcoeffs[nb][k] = mfs->arr_2d[nbinds[nb] + k*mfs->refconfig->ngp];
      for (k2=0; k2<mfs->ncoeff; k2++) {
        nbcoeffs_prec[nb][k2+k*mfs->ncoeff] = mfs->arr_2d_prec[nbinds[nb] + (k2+k*mfs->ncoeff)*mfs->refconfig->ngp];
      }
    }
    if (DEBUG) {
      print_matrix(nbcoeffs[nb], 1, mfs->ncoeff,
                   "Neighbor before fit_beta_parameters_with_unc()", 1);
      print_matrix(nbcoeffs_prec[nb], mfs->ncoeff, mfs->ncoeff,
                   "Neighbor prec before fit_beta_parameters_with_unc()", 1);
    }
  }

  /* Fit the parameters here - this function does the heavy lifting */
  fit_beta_parameters_with_unc(S, E, gi, nnbs, nbcoeffs, nbcoeffs_prec,
                               nbdistanceweights, dscale);

  for (l=0; l<4; l++) {
    free(nbcoeffs[l]);
    free(nbcoeffs_prec[l]);
  }
}


void fit_all_beta_parameters(struct state *S, struct config *E, size_t gi0, size_t gi1, float dscale, uint opt_iters) {

  /* For each gridpoint, fits the beta parameters, which will be kept
     in mfs->arr_2d. The fitting is done several times, to get the edge
     potentials settled, since vertex expectations, in Markov Random
     Field parlance, depend on each other, as can be seen in the loss
     function above.

     S, E      - passed on to single_MRF_marginal(); E also supplies
		 the grid dimensions (nlon, nlat, ngp) and E->mfs.
     gi0, gi1  - only grid indexes gi with gi0 <= gi <= gi1 are fitted;
		 all others are skipped.
     dscale    - passed on unchanged to single_MRF_marginal().
     opt_iters - number of there-and-back sweeps over the diagonals;
		 2*opt_iters*(ndiags-1)+1 diagonals are visited in total.

     The grid is traversed one anti-diagonal at a time (all points
     whose row index plus column index equals j). The direct neighbors
     gi-1, gi+1, gi-nlon and gi+nlon of a point always lie on an
     adjacent diagonal, never on the same one, which is presumably why
     the points of one diagonal are fitted in parallel.

     After the sweeps the coefficient fields are written to
     beta0.txt ... beta3.txt and delta.txt.
     NOTE(review): the output section reads coefficient slots 0..4 of
     arr_2d, i.e. it assumes E->mfs->ncoeff >= 5 - confirm.
  */

  assert(!E->mode); /* Gridspec is not created if using a random sample. Set E->mode to zero instead. */

  size_t gi, i, j, ndiags, ndiags_tot;
  int m, inc;

  /* Calculating diagonal by diagonal */
  ndiags = E->nlon + E->nlat - 1;
  /* linear ordering: this many diagonals need to be calculated in
     total */
  ndiags_tot = 2*opt_iters*(ndiags-1)+1;

  /*
    Python version of iteration below is something like this:
    For getting the gi's given the order of the diagonal n

    def get_gi(n, nrows, ncols):
        for i in range(n+1):
            if n-i < nrows and i < ncols:
                print(n-i, i)

    To go to and fro between first and last diagonals (j here is the
    diagonal in the end, and I is the number of diagonals):

    def get_diag(d, I):
        j = 0
        inc = -1
        for i in range(2*I*(d-1)+1):
            print(j)
            if not i%(d-1):
                inc = -1*inc
            j = j + inc
  */

  /* i runs from corner to corner and back */
  inc = -1;
  j = 0;

  for (i=0; i<ndiags_tot; i++) {

    printf("Calculating diagonal %zu which is %zu/%zu\n", j, i+1, ndiags_tot);
#pragma omp parallel for private(m,gi) shared(i,S,E,gi0,gi1,dscale) num_threads(NTHREADS) schedule(runtime)
    for (m=0; m<j+1; m++) { //read j as n and m as i comparing to python code above
      if ((j - m < E->nlat) && (m < E->nlon)) {
        // get gi of latitude j-m, longitude m
        gi = (j-m)*E->nlon + m;
        if ((gi < gi0) || (gi > gi1)) {
          continue;
        }
        if (DEBUG) {
          printf("START: i,j,,m,gi, nlon: %zu, %zu, %d, %zu, %ld\n", i, j, m, gi, (long) E->nlon);
        }
      } else {
        continue;
      }

      /* Empty the precision matrix for current point before
         computing parameter estimates. Not doing this will make our
         precision to become infinitely good after a couple of
         sweeps of the graph. This way we don't end up using data
         many times for conditioning..

         The counter k2 is declared inside the parallel loop body so
         that every thread has its own copy. A counter declared at
         function scope would be shared between the threads and they
         would race on it. */
      for (size_t k2=0; k2<(E->mfs->ncoeff)*(E->mfs->ncoeff); k2++) {
        E->mfs->arr_2d_prec[gi+k2*E->ngp] = 0;
      }

      single_MRF_marginal(S, E, gi, dscale);
    }

    /* Turn around at the first and the last diagonal. With a single
       diagonal (1x1 grid) ndiags-1 is zero, so the modulo must not be
       evaluated; there is only one iteration in that case anyway. */
    if ((ndiags > 1) && !(i%(ndiags-1))) {
      inc = -inc;
    }
    /* inc is +1 or -1; adding -1 to the unsigned j relies on
       well-defined modular arithmetic and yields j-1. */
    j += inc;
  }

  write_1d_array_to_txt("beta0.txt", E->mfs->arr_2d, E->nlat, E->nlon, 0);
  write_1d_array_to_txt("beta1.txt", E->mfs->arr_2d + E->ngp, E->nlat, E->nlon, 0);
  write_1d_array_to_txt("beta2.txt", E->mfs->arr_2d + 2*E->ngp, E->nlat, E->nlon, 0);
  write_1d_array_to_txt("beta3.txt", E->mfs->arr_2d + 3*E->ngp, E->nlat, E->nlon, 0);
  write_1d_array_to_txt("delta.txt", E->mfs->arr_2d + 4*E->ngp, E->nlat, E->nlon, 0);

  /*
    print_matrix(E->mfs->arr_2d, E->nlat, E->nlon, "beta0", 1);
    print_matrix(E->mfs->arr_2d + E->ngp, E->nlat, E->nlon, "beta1", 1);
    print_matrix(E->mfs->arr_2d + 2*E->ngp, E->nlat, E->nlon, "beta2", 1);
    print_matrix(E->mfs->arr_2d + 3*E->ngp, E->nlat, E->nlon, "beta3", 1);
    print_matrix(E->mfs->arr_2d + 4*E->ngp, E->nlat, E->nlon, "delta", 1);
  */
}
