/* 
   Predicted wetland fraction (fw) for a model (usually paleo) climate
   based on K nearest neighbour (NN) search of reference data in 
   climate-vegetation space

   David Wilton
   The University of Sheffield
   
   
   This code was written to try out a number of different options, variations etc,
   most of which were not used in the final version. It was meant for finding a 
   version of the basic KNN method that works. Bits of code for other variations 
   have been left in, much of it redundant. They were never removed in case
   they were found to be of use later. 

   Similarly the data files in the format this expects to read in contain some 
   variables that were not found to be useful, but were left in rather than remake 
   those files.

   It expects reference and model data to already be in the required format.
   Fairly straightforward tab separated columns, see comments later where
   reference and model data are read in.

   It would probably not be useful for someone else to use this specific code. 
   The basic kNN algorithm is very straightforward and it would be much easier to 
   write your own version for your own data sets. 



   Basic algorithm;

   For each site-month in the model climate find its k NNs from the 
   reference data. Squared Euclidean distances are used, i.e. if the 
   climate-vegetation variables are TMP, PRC, LAI .. etc, I is a
   site-month in the model climate, J is one in the reference data

   D(IJ) = (TMP(I) - TMP(J)) * (TMP(I) - TMP(J)) +
           (PRC(I) - PRC(J)) * (PRC(I) - PRC(J)) +
	   (LAI(I) - LAI(J)) * (LAI(I) - LAI(J)) + 
	   ...... etc for all variables

   Euclidean distance ("every day" meaning of distance) is SQRT(D(IJ)) so 
   in terms of ranking candidate nearest neighbours it does not
   matter whether we used Euclidean or Squared Euclidean


   
   Climate and vegetation variables used here are

   Temperature
   Precipitation
   Leaf Area Index
   Net Primary Productivity
   Transpiration
   Evapotranspiration
   Soil Water Content
   Runoff

   
   This specific version uses a max3NN algorithm, as found to be suitable
   for an Early Eocene climate.
   For another era / model another variation of KNN may be better, e.g. to do a
   modern day NN (i.e. max1NN) prediction this can simply be changed to only
   consider 1 rather than 3 nearest neighbours.

   
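   Simply compile with a C++ compiler and run as ./a.out <number of model sites>
   (the number of site-months read from the model data is 12 times this argument)
   
   This will read the reference data ("reference.dat") and the model (veg / climate)
   data ("model.dat") from the current directory, so change paths to other
   locations and filenames if used for other data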

   
   This writes a brief summary of progress to stdout
   
   This creates a results file "maxkNN_pfw.dat"
    - the maxkNN prediction of FW is column 9 of this
    - columns 1 to 7 give basic details of the site being predicted (location, covind etc)
    - column 8 is an alternative meankNN prediction, very poor
    - columns 10 to 13 give details (lat, lon, month, covind) of the reference point from
      which the maxKNN prediction is derived (included in case they were of any interest)
    - columns 14 to 16 are the squared distances to the 3 nearest neighbours
    - sites that are not processed have -1 in columns 8 to 16

   

*/

#include <sys/time.h>

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <vector>
using namespace std;

// Program-wide settings. Everything except Nt is a fixed setting and is
// declared const, so that Nv and KNN are constant expressions and can be used
// as ordinary (non variable-length) array bounds.

// Total number of site-months (lines) in the reference data set.
const int Nr = 746280;

// Total number of site-months in the model data. Set in main() to
// 12 x the number of sites given on the command line.
int Nt;

// Number of climate / vegetation variables.
// These are expected to be TMP, PRC, LAI, NPP, TRN, EVT, SWC, ROF.
// The order of the variables is not important, so long as it is the same
// for reference and model data. Note that main() assumes exactly 8 variables
// (column positions and the scaling means / stdvs are written for 8).
const int Nv = 8;


// The next two options should both be 1 to ignore potential restrictions on the NN search,
// i.e. we found it best not to include these restrictions.
const int no_stdv_orog = 1; // if 1 then ignore stdv orog restriction on reference sites
const int any_type = 1; // if 1 then do not restrict reference sites to same type (wet or thaw) as target
// a wet site would be one where TMP > 0 deg C all year, a thaw site otherwise

// Critical stdv-orog values for wet (Cw) and thaw (Ct) sites: reference sites
// are only considered if their stdv orog is above these values. They are only
// consulted when no_stdv_orog is 0; -999.9 is the default for the unused option.
const float Cw = -999.9;
const float Ct = -999.9;

// Number of nearest neighbours found.
// This could be made variable but 3 was found to be best for the Early Eocene climate.
const int KNN = 3;

main (int argc, char *argv[]) {
  int i,j,k,l,m,n, nn;
  float fwC;
  float w,x,y,z;
  float dnn;
  float **X; // all reference data
  float **Xt; // all test data
  float fwNN[KNN+1], dNN[KNN+1]; // fw and distance (in clim-veg space) of site-month to it's KNN nearest neighbours
  // storing an extra one, just makes the code simpler when sorting through them
  // and store information about the NNs
  float latNN[KNN+1], lonNN[KNN+1];
  float latmax, lonmax;
  int monNN[KNN+1], covNN[KNN+1];
  int monmax, covmax;

  float meanfw, maxfw;
  int TMPw, PRCw, LAIw, NPPw, TRNw, EVTw, SWCw, ROFw; // use as logicals to control which are included, different
  int TMPt, PRCt, LAIt, NPPt, TRNt, EVTt, SWCt, ROFt; // different variables for wet and thaw sites

  float MN[Nv], SD[Nv];
  char name[100];
  FILE *f;

  Nt = atoi(argv[1]) * 12; // Nt is number of site-months, so number of sites x 12

  // optimum set of varibles from MOD climate analysis

  // If we wanted to limit the NN search to distances based on just some of the variables
  // change the following to 0 for the variables to be ignored
  // if required can have different sets of variables for wet and thaw sites
  TMPw=1;
  PRCw=1;
  LAIw=1;
  NPPw=1;
  TRNw=1;
  EVTw=1;
  SWCw=1;
  ROFw=1;

  TMPt=1;
  PRCt=1;
  LAIt=1;
  NPPt=1;
  TRNt=1;
  EVTt=1;
  SWCt=1;
  ROFt=1;

  // need to mean scale test data by mean and stdv of ref data 
  // as the latter has been mean scaled already
  // 
  // Xsc(I) = (X(I) - mean(X)) / (stdv(X)
  //
  // so that Xsc has mean of 0 and stdv of 1
  //
  // below values correspond to the specific reference data used
  // i.e. before scaling that data had the following means and stdvs for each variable
  // re-calculate these for other data
  MN[0] = 8.733553;
  SD[0] = 17.883664;
  MN[1] = 57.337879;
  SD[1] = 73.977076;
  MN[2] = 2.806854;
  SD[2] = 2.410137;
  MN[3] = 39.180324;
  SD[3] = 56.576510;
  MN[4]	= 17.411888;
  SD[4] = 23.135942;
  MN[5] = 36.504437;
  SD[5] = 45.063526;
  MN[6] = 262.645932;
  SD[6] = 71.797731;
  MN[7] = 19.5114364; 
  SD[7] = 47.296620; 


  printf("\nK-Nearest Neighbour wetland prediction on Paleo day climate, model grid");
  printf("\n\tK = %d", KNN);
      
  printf(" Wet sites, variables used for NN distances\n");
  if (TMPw)
    printf("\tTMP");
  if (PRCw)
    printf("\tPRC");
  if (LAIw)
    printf("\tLAI");
  if (NPPw)
    printf("\tNPP");
  if (TRNw)
    printf("\tTRN");
  if (EVTw)
    printf("\tEVT");
  if (SWCw)
    printf("\tSWC");
  if (ROFw)
    printf("\tROF");


  printf("\n\nThaw sites, variables used for NN distances\n");
  if (TMPt)
    printf("\tTMP");
  if (PRCt)
    printf("\tPRC");
  if (LAIt)
    printf("\tLAI");
  if (NPPt)
    printf("\tNPP");
  if (TRNt)
    printf("\tTRN");
  if (EVTt)
    printf("\tEVT");
  if (SWCt)
    printf("\tSWC");
  if (ROFt)
    printf("\tROF");
   
  X = new float*[Nr];
  for (i=0; i<Nr; i++)
    X[i] = new float[Nv+8]; // lat, lon, month, covOLD, covNEW, site_type, fw and each clim/veg variable, orog_stdv
  // last element can be used to apply thawiste or prc-evt filtering


  // READ IN REFERENCE DATA IN THIS FORMAT
  // 16 columns per line, as described below, some columns redundant - see comments at beggining
  // one line per site (location on Earth's surface) and month of the year (assuming monthly data)

  f=fopen("reference.dat","r"); 
  // tmp, prc ... rof should all be mean scaled already
  // all other variables should not be scaled
  for (i=0; i<Nr; i++) {
    fscanf(f, "%f\t%f\t%d\t%d\t%d\t%d%f", &x, &y, &m, &n, &k, &l, &z);
    X[i][0] = x; // lat
    X[i][1] = y; // lon
    X[i][2] = (float)m; // month
    
    // two different measures of majority PFT indication by cover
    // only 2nd was made use of
    X[i][3] = (float)n; // covind 1 
    X[i][4] = (float)k; // covind 2 
    
    // type of site, not used in final version
    X[i][5] = (float)l; // orignally was =1 if TMP > 0 all year (wet site), =2 otherwise (thaw site)
    
    X[i][6] = z; // fw - obersved wetland fraction
    fscanf(f, "\t%f", &x); X[i][7] = x; // tmp
    fscanf(f, "\t%f", &x); X[i][8] = x; // prc
    fscanf(f, "\t%f", &x); X[i][9] = x; // lai
    fscanf(f, "\t%f", &x); X[i][10] = x; // npp
    fscanf(f, "\t%f", &x); X[i][11] = x; // trn
    fscanf(f, "\t%f", &x); X[i][12] = x; // evt
    fscanf(f, "\t%f", &x); X[i][13] = x; // swc
    fscanf(f, "\t%f", &x); X[i][14] = x; // rof
    fscanf(f, "\t%f", &x); X[i][15] = x; // standard deviation of sub-grid scale orography or approximation of that, not used here

  }
  fclose(f);
  printf("\nread reference data");


  // MODEL DATA, read in Xt ("t" for target was used at first)
  // similar format to reference, just fewer columns
  // again columns unsused for reference data would also be unused for model data

  Xt = new float*[Nt];
  for (i=0; i<Nt; i++)
    Xt[i] = new float[Nv+7];  // lat, lon, month, covind, site_type, and each clim/veg variable, orog_stdv
  
  f=fopen("./model.dat","r"); 
  for (i=0; i<Nt; i++) {
    fscanf(f, "%f\t%f\t%d\t%d", &x, &y, &m, &k);
    Xt[i][0] = x; // lat
    Xt[i][1] = y; // lon
    Xt[i][2] = (float)m; // month
    Xt[i][3] = (float)k; // covind

    //Xt[i][4] // was site classification according to old FW model, calculated later
    // but was not used in final version 

    fscanf(f, "\t%f", &x); Xt[i][5] = x; // tmp
    fscanf(f, "\t%f", &x); Xt[i][6] = x; // prc
    fscanf(f, "\t%f", &x); Xt[i][7] = x; // lai
    fscanf(f, "\t%f", &x); Xt[i][8] = x; // npp
    fscanf(f, "\t%f", &x); Xt[i][9] = x; // trn
    fscanf(f, "\t%f", &x); Xt[i][10] = x; // evt
    fscanf(f, "\t%f", &x); Xt[i][11] = x; // swc
    fscanf(f, "\t%f", &x); Xt[i][12] = x; // rof
    fscanf(f, "\t%f", &x); Xt[i][13] = x; // orog_sdtv 
  }
  fclose(f);
  printf("\nread model %d data", i); fflush(stdout);


  
  // work out wet or thaw site classification
  // This classification was not made use of
  for (i=0; i<Nt-11; i++) {
    if (Xt[i][2] == 1.0) { 
	j=1;
	for (m=0; m<12; m++)
	  if (Xt[i+m][5] < 0.0)
	    j=0;
	if (j==1)
	  for (m=0; m<12; m++)
	    Xt[i+m][4] = 1.0;
	else
	  for (m=0; m<12; m++)
	    Xt[i+m][4] = 2.0;
      }
  }

  
  // mean scale test data clim and veg
  for (i=0; i<Nt; i++)
    for (j=0; j<Nv; j++)
      Xt[i][j+5] = (Xt[i][j+5] - MN[j]) / SD[j];

  // loop through target sites calculating predicted fw based on NN search of REF data 
  // and write results with other possibly useful variable to output file

  // the max kNN prediction of FW is column 9
  
  f=fopen("./maxkNN_pfw.dat","w");  
  for (i=0; i<Nt; i++) {    
    //                                  i   lat      lon        month         covind          type
    fprintf(f,"%d\t%f\t%f\t%d\t%d\t%d", i, Xt[i][0], Xt[i][1], (int)Xt[i][2], (int)Xt[i][3], (int)Xt[i][4]);
    fprintf(f,"\t%f", Xt[i][13]); // stdv orog
      
    for (k=0; k<KNN+1; k++) {
      fwNN[k] = 0.0;
      dNN[k] = 1.0e99;
      latNN[k] = 0.0;
      lonNN[k] = 0.0;
      monNN[k] = 0;
      covNN[k] = 0; 
    }
    
    // covind values based on SDGVM classification
    // Xt[i][3] > 2 means natural vegetation dominates (as opposed to crops or bare land)
    // Xt[i][3] = 1 is site classed as bare land
    // Xt[i][3] = 2 is site classed as crops, usually not possible for a model climate, but in principle
    // future models could have that in
    // also later in code same restriction on Reference data which being modern day does have some sites
    // where crops are the majority plant type
    if ( Xt[i][3] > 2.0 ) {  
      if ( (int)Xt[i][4] == 1 ) {      // WET sites
	// do NN search  	
	nn=-1;
	dnn=1.0e99;
	for (j=0; j<Nr; j++) {
	  // and to restrict to natural vegetation sites
	  // and to same type of site (wet for wet, thaw for thaw)
	  //      unless any_type=1 in which case no restrictions
	  if ((X[j][4] > 2.0) && ((X[j][5] == Xt[i][4]) || (any_type))) {
	    // restricted to stdv_orog above critical value, unless no_stdv_orog=1
	    if ((X[j][15] > Cw) || (no_stdv_orog) ){ 
	      y=0;
	      if (TMPw) {
		x = Xt[i][5] - X[j][7];
		y += x*x;
	      }
	      if (PRCw) {
		x = Xt[i][6] - X[j][8];
		y += x*x;
	      }
	      if (LAIw) {
		x = Xt[i][7] - X[j][9];
		y += x*x;
	      }
	      if (NPPw) {
		x = Xt[i][8] - X[j][10];
		y += x*x;
	      }
	      if (TRNw) {
		x = Xt[i][9] - X[j][11];
		y += x*x;
	      }
	      if (EVTw) {
		x = Xt[i][10] - X[j][12];
		y += x*x;
	      }
	      if (SWCw) {
		x = Xt[i][11] - X[j][13];
		y += x*x;
	      }
	      if (ROFw) {
		x = Xt[i][12] - X[j][14];
		y += x*x;
	      }
	      for (k=KNN-1; k>=0; k--) {
		if (y <= dNN[k]) {
		  dNN[k+1]=dNN[k];
		  fwNN[k+1]=fwNN[k];
		  dNN[k] = y;
		  fwNN[k] = X[j][6];
		  latNN[k] = X[j][0];
		  lonNN[k] = X[j][1];
		  monNN[k] = (int)X[j][2];
		  covNN[k] = (int)X[j][4];
		}
	      }
	    }
	  }	
	} 
	meanfw=0.0;
	maxfw=-1.0;
	for (k=0; k<KNN; k++) {
	  meanfw += fwNN[k];
	  if (fwNN[k] > maxfw) {
	    maxfw = fwNN[k];
	    latmax = latNN[k];
	    lonmax = lonNN[k];
	    monmax = monNN[k];
	    covmax = covNN[k];
	  }
	}
	fprintf(f,"\t%f\t%f\t%f\t%f\t%d\t%d", meanfw/KNN, maxfw, latmax, lonmax, monmax, covmax);
	for (k=0; k<KNN; k++)
	  fprintf(f,"\t%f", dNN[k]);
	fprintf(f,"\n");
      }
      else if ( (int)Xt[i][4] == 2 ) {      // THAW sites
	// do NN search  	
	nn=-1;
	dnn=1.0e99;
	for (j=0; j<Nr; j++) {
	  // and to restrict to natural vegetation sites
	  // and to same type of site (wet for wet, thaw for thaw)
	  //      unless any_type=1 in which case no restrictions
	  if ((X[j][4] > 2.0) && ((X[j][5] == Xt[i][4]) || (any_type))) {
	    // and retsrict to maybe-wetland class, as defined by fully evolved GA fw_max(stdv-orog)
	    
	    if ((X[j][15] > Ct) || (no_stdv_orog)) { // restricted to stdv_orog above critical value, unless no_stdv_orog=1
	      y=0;
	      if (TMPt) {
		x = Xt[i][5] - X[j][7];
		y += x*x;
	      }
	      if (PRCt) {
		x = Xt[i][6] - X[j][8];
		y += x*x;
	      }
	      if (LAIt) {
		x = Xt[i][7] - X[j][9];
		y += x*x;
	      }
	      if (NPPt) {
		x = Xt[i][8] - X[j][10];
		y += x*x;
	      }
	      if (TRNt) {
		x = Xt[i][9] - X[j][11];
		y += x*x;
	      }
	      if (EVTt) {
		x = Xt[i][10] - X[j][12];
		y += x*x;
	      }
	      if (SWCt) {
		x = Xt[i][11] - X[j][13];
		y += x*x;
	      }
	      if (ROFt) {
		x = Xt[i][12] - X[j][14];
		y += x*x;
	      }
	      for (k=KNN-1; k>=0; k--) {
		if (y <= dNN[k]) {
		  dNN[k+1]=dNN[k];
		  fwNN[k+1]=fwNN[k];
		  dNN[k] = y;
		  fwNN[k] = X[j][6];
		  latNN[k] = X[j][0];
		  lonNN[k] = X[j][1];
		  monNN[k] = (int)X[j][2];
		  covNN[k] = (int)X[j][4];
		}
	      }
	    }
	  }	
	}
	meanfw=0.0;
	maxfw=-1.0;
	for (k=0; k<KNN; k++) {
	  meanfw += fwNN[k];
	  if (fwNN[k] > maxfw) {
	    maxfw = fwNN[k];
	    latmax = latNN[k];
	    lonmax = lonNN[k];
	    monmax = monNN[k];
	    covmax = covNN[k];
	  }
	}
	fprintf(f,"\t%f\t%f\t%f\t%f\t%d\t%d", meanfw/KNN, maxfw, latmax, lonmax, monmax, covmax);
	for (k=0; k<KNN; k++)
	  fprintf(f,"\t%f", dNN[k]);
	fprintf(f,"\n");
      }
      // for any sites that don't get processed fill up line with -1 so all lines of output have same number of columns
      else {
	fprintf(f,"\t-1\t-1\t-1\t-1\t-1\t-1");
	for (k=0; k<KNN; k++)
	  fprintf(f,"\t-1");
	fprintf(f,"\n");
      }
    }
    else {
      fprintf(f,"\t-1\t-1\t-1\t-1\t-1\t-1");
      for (k=0; k<KNN; k++)
	fprintf(f,"\t-1");
      fprintf(f,"\n");
    }
    if ((i%500)==0)
      printf("\n%d sites processed", i);
  }
    
 END:;

  printf("\n\n");
}
  
