########################## Import and Prepare Data ###########################

library(ncdf4)
library(lubridate)
library(akima)
library(lattice)
library(RColorBrewer)
library(EnvStats)
library(pals)


data_path_obs = "C:/Users/Justus/Desktop/CRU_TS4.04/"
data_path_mod = "C:/Users/Justus/Desktop/AWI-ESM/"
start_year = 1930
end_year   = 2014

# import model data
ncin <- nc_open(paste0(data_path_mod,
  "pr_Amon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_",
  start_year,"01-",start_year,"12.nc"))
lon <- ncvar_get(ncin, "lon") # lon-grid
lat <- ncvar_get(ncin, "lat") # lat-grid
nullmatrix <- matrix(0,nrow=length(lon),ncol=length(lat))
t_m <- seq(start_year,end_year+0.999 , by=1/12)
t_y <- start_year:end_year
t_len_month <- length(t_m); t_len_year <- length(t_y)
pre_mod_monthly <- array(0, dim=c(length(lon),length(lat),t_len_month))
for(t in t_y) {
  ncin <- nc_open(paste0(data_path_mod,
        "pr_Amon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_",
        t,"01-",t,"12.nc"))
  pre_mod_monthly[,,((t-start_year)*12+1):((t-start_year+1)*12)] <- 
    ncvar_get(ncin, "pr")
}
nc_close(ncin)

# convert model data from mm/s to mm/month
monlengths <- rep(c(31,28,31,30,31,30,31,31,30,31,30,31),end_year-start_year+1)
monlengths[seq(2,length(monlengths),by=12)] <- 28 +
  1*leap_year(start_year:end_year)
for(i in 1:(length(lon))) for(j in 1:(length(lat))) {
  pre_mod_monthly[i,j,] <- pre_mod_monthly[i,j,]*3600*24*monlengths
}

# convert longitude from 0:360 to -180:180 and adjust data accordingly
lon_reorder <- c((length(lon)/2+1):length(lon), 1:(length(lon)/2))
pre_mod_monthly <- pre_mod_monthly[lon_reorder,,]
lon <- lon[lon_reorder]
lon <- lon - 1*(lon>=180)*360
rm(lon_reorder)

# import observational data
ncin <- nc_open(paste0(data_path_obs,"cru_ts4.04.1901.2019.pre.dat.nc"))
lon_obs <- ncvar_get(ncin, "lon")
lat_obs <- ncvar_get(ncin, "lat")
# data start in Jan. 1901 and are already given in mm/month
# grid points over water have NA values, missing values over land are not NA,
# but instead set to a default climatology value, in this case, the correspon-
# ding value in the array "stn" is equal to 0
pre_obs_monthly_orig <- (ncvar_get(ncin, "pre")[,,
            ((start_year-1901)*12+1):((end_year-1901)*12+12)])
stn <- (ncvar_get(ncin, "stn")[,,
            ((start_year-1901)*12+1):((end_year-1901)*12+12)])
nc_close(ncin)
# land-sea-mask: 1 = ocean, 0 = land
lsm_obs <- 1*(is.na(pre_obs_monthly_orig[,,1]))

# mark grid points with more than 5% climatology values or with at
# least 12 consecutive climatology entries as missing
missing_values_obs <- matrix(0,nrow=length(lon_obs),ncol=length(lat_obs))
for(i in 1:(length(lon_obs))) for(j in 1:(length(lat_obs))) {
  if(!is.na(stn[i,j,1])) { # grid point over land
    if(length(which(stn[i,j,]==0))>t_len_month*0.05) {
      # more than 5% missing values
      missing_values_obs[i,j] <- 1
    } else {
      for(k in 1:(t_len_month-12)) {
        if(max(stn[i,j,k:(k+12)])==0) {
          # at least 12 consecutive missing values
          missing_values_obs[i,j] <- 1
          break
        }
      }
    }
  }
}

# convert the reanalysis data set to the grid of the climate model using
# bilinear interpolation

# first convert land-sea-mask and missing-values-mask
grid <- expand.grid(lon=lon,lat=lat)
# add additional row lon=-180.25 (=179.25) for correct interpolation of margins
lon_obs_new <- c(lon_obs[length(lon_obs)]-360, lon_obs)
lsm_obs <- rbind(lsm_obs[length(lon_obs),],lsm_obs)
missing_values_obs <- rbind(missing_values_obs[length(lon_obs),],
                            missing_values_obs)

lsm <- matrix(bilinear(lon_obs_new,lat_obs,lsm_obs,grid[,1],grid[,2])$z,
              nrow=length(lon))
lsm[which(lsm>0)] <- 1 # grid cells not completely surrounded by land are
                       # treated as ocean
missing_values <- matrix(bilinear(lon_obs_new,lat_obs,missing_values_obs,
                                  grid[,1],grid[,2])$z, nrow=length(lon))
missing_values[which(missing_values>0)] <- 1

pre_obs_monthly <- array(0, dim(pre_mod_monthly))
for (i in 1:t_len_month) {
  # since the bilinear function cannot handle NA's, they are set to 0 first
  # and later in the interpolated data the sea values are set back to NA again
  tmp <- rbind(pre_obs_monthly_orig[length(lon_obs),,i],
               pre_obs_monthly_orig[,,i])
  tmp[which(is.na(tmp))] <- 0
  pre_obs_monthly[,,i] <- matrix(bilinear(lon_obs_new,lat_obs,matrix(tmp,
                    nrow=length(lon_obs_new)), grid[,1],grid[,2])$z,
                    nrow=length(lon))
}
# in both data sets: set ocean values and missing values to NA
for(i in 1:(length(lon))) for(j in 1:(length(lat))) {
  if((missing_values[i,j]==1) || (lsm[i,j]==1)) {
    pre_obs_monthly[i,j,] <- rep(NA, t_len_month)
    pre_mod_monthly[i,j,] <- rep(NA, t_len_month)
  }
}

# use stl-analysis to remove trends in the data sets
pre_obs_monthly_notrend <- pre_mod_monthly_notrend <- 
  array(NA, dim=dim(pre_mod_monthly))
for(i in 1:(length(lon))) { for(j in 1:(length(lat))) {
  if(!is.na(pre_obs_monthly[i,j,1])) {
    trend <- stl(ts(pre_obs_monthly[i,j,],frequency=12),
                 s.window=50, t.window=500)$time.series[,"trend"]
    pre_obs_monthly_notrend[i,j,] <- pre_obs_monthly[i,j,] - trend+mean(trend)
    trend <- stl(ts(pre_mod_monthly[i,j,],frequency=12),
                 s.window=50, t.window=500)$time.series[,"trend"]
    pre_mod_monthly_notrend[i,j,] <- pre_mod_monthly[i,j,] - trend+mean(trend)
  }
}}

rm(pre_obs_monthly_orig,lat_obs,lon_obs,missing_values_obs,lsm_obs,stn,tmp,
     trend)



########################## Univariate Analysis ###############################

# plotting functions
plot_map <- function(matrix, lo=min(matrix,na.rm=T),hi=max(matrix,na.rm=T),
                     main="",truncate=FALSE,exclude=missing_values,
                     colors=rev(brewer.pal(10,"RdBu")),legend=TRUE,hatching=F,
                     hatching_area=nullmatrix) {
  # this function plots the contents of "matrix" on a lat-lon grid. NA-values
  # are plotted in white, points having a value of 1 in the matrix "exlude"
  # are plotted in grey. if "hatching"==TRUE, the areas indicated by ones in
  # the matrix "hatching_area" are hatched with diagonal lines.
  # the scale limits are given by "lo" and "hi", the colors by "colors". if
  # "truncate"==TRUE, values exceeding "lo" or "hi" are truncated.
  if(truncate==TRUE) {
    matrix[which(matrix>hi)] <- hi
    matrix[which(matrix<lo)] <- lo
  }
  matrix[which((exclude==1))] <- lo-1 # these values are plotted in lightgrey
  att <- seq(lo, hi, length.out = length(colors)+1)
  if(legend==FALSE) {
    colorkey = FALSE
  }
  else {
    colorkey = list(at=c(lo,att),labels=list(at=att))
  }
  if(!hatching) {
    panel = panel.levelplot # default
  }
  else {
    panel="diag_pattern"
  }
  print(levelplot(matrix~lon*lat,grid, pretty=T,panel=panel,
                      scales=list(x=list(at=seq(-150,150,by=30)),
                                  y=list(at=seq(-60,60,by=30))),
                      xlab="lon",ylab="lat",main=main,
                      hatching_area=hatching_area,
                      aspect=length(lat)/length(lon),
                      at=c(lo-1, att), cuts=length(colors)+1,
                      col.regions=c("grey90",colors),
                      colorkey=colorkey))
}


diag_pattern <- function(hatching_area, ...) {
  # this function is used for hatching by the function plot_map
  panel.levelplot(...)
  nc <- ncol(hatching_area); nr <- nrow(hatching_area)
  # diagonal lines are drawn in the area where hatching_area==1
  # the matrix area is divided into diagonal stripes
  for(i in seq(1,(nr+nc-1),by=3)) {
    stripe_x <- (max(i-nc,0)+1) : (min(i,nr))
    stripe_y <- (nc-min(i,nc)+1) : (nc-max(i-nr,0))
    # the parts of the stripes consisting of ones are filled
    start_drawing <- 0
    for(j in 1:length(stripe_x)) {
      if(start_drawing==0) {if(hatching_area[stripe_x[j],stripe_y[j]]==1) {
        start_drawing <- 1
        start_xy <- c(stripe_x[j], stripe_y[j])
      }}
      if(start_drawing==1) {if(hatching_area[stripe_x[j],stripe_y[j]]==0 ||
                               j==length(stripe_x)) {
        start_drawing <- 0
        end_xy <- c(stripe_x[j], stripe_y[j])
        panel.linejoin(x = c(lon[start_xy[1]],lon[end_xy[1]]),
                       y = c(lat[start_xy[2]],lat[end_xy[2]]), col="black", 
                       alpha=0.5,lwd=2)
      }}
    }
  }
}

plot_map_pvalues <- function(matrix, main="",exclude=missing_values) {
  # like the function "plot_map", but with a fixed color scale adapted to
  # plotting p-values of tests
  matrix[which((exclude==1))] <- -1 # excluded values are plotted in grey
  pval <- 0.5 + 1*(matrix>0.005) + 1*(matrix>0.05) + 
    1*(matrix>0.5) + 1*(matrix>0.9) 
  pval[which(exclude==1)] <- -1
  att <- c(0, 0.005, 0.05, 0.5,0.9, 1)
  print(levelplot(pval~lon*lat, grid, pretty=T,at=(-1):5,
                      scales=list(x=list(at=seq(-150,150,by=30)),
                                  y=list(at=seq(-60,60,by=30))),
                      xlab="lon",ylab="lat",main=main,
                      aspect=length(lat)/length(lon),
                      col.regions = c("lightgrey",brewer.pal(10,
                                          "RdBu")[c(1,2,7,8,9)]),
                      colorkey=list(at=c(0,0:5),labels=list(
                        at=c(0,0:5),labels=c(0,att)))))
}

  


pre_mod <- pre_obs <- array(0,dim=c(length(lon),length(lat),t_len_year))
# calculate annual maxima
for(k in 1:t_len_year) {
  for(i in 1:(length(lon))) for(j in 1:(length(lat))) {  
    pre_mod[i,j,k] <- max(pre_mod_monthly_notrend[i,j,((k-1)*12+1):(k*12)])
    pre_obs[i,j,k] <- max(pre_obs_monthly_notrend[i,j,((k-1)*12+1):(k*12)])
  }
}

# calculate empirical mean and standard deviation
# fit data to GEV distribution and use KS-Test to determine goodness of fit
empmean_mod <- empsd_mod <- empmean_obs <- empsd_obs <- nullmatrix*NA
ks_test_mod <- ks_test_obs <- nullmatrix*NA # used for p-values of ks-tests

for(i in 1:(length(lon))) for(j in 1:(length(lat))) {
  if(!is.na(pre_mod[i,j,1])) {
    empmean_mod[i,j] <- mean(pre_mod[i,j,])
    empmean_obs[i,j] <- mean(pre_obs[i,j,])
    empsd_mod[i,j] <- sd(pre_mod[i,j,])
    empsd_obs[i,j] <- sd(pre_obs[i,j,])
    par <- egevd(pre_mod[i,j,],method="pwme")$parameters
    ks_test_mod[i,j] <- ks.test(pre_mod[i,j,],"pgevd",par[1],par[2],par[3])$p
    par <- egevd(pre_obs[i,j,],method="pwme")$parameters
    ks_test_obs[i,j] <- ks.test(pre_obs[i,j,],"pgevd",par[1],par[2],par[3])$p
  }
}
plot_map(empmean_mod, lo=0, hi=1000, truncate=T,
         main="Empirical Mean – Climate Model")
plot_map(empmean_obs, lo=0, hi=1000, truncate=T,
         main="Empirical Mean – Observations")
plot_map(empsd_mod, lo=0, hi=200, truncate=T,
         main="Empirical Standard Deviation – Climate Model")
plot_map(empsd_obs, lo=0, hi=200, truncate=T,
         main="Empirical Standard Deviation – Observations")
plot_map(empmean_mod-empmean_obs, lo=-500, hi=500, truncate=T,
         main="Empirical Mean – Anomaly")
plot_map(empsd_mod-empsd_obs, lo=-100, hi=100, truncate=T,
         main="Empirical Standard Deviation – Anomaly")
plot_map_pvalues(ks_test_mod, main="p-Values of KS-Test – Climate Model")
plot_map_pvalues(ks_test_obs, main="p-Values of KS-Test – Observations")

# qq-plots
qqplot(empmean_obs,empmean_mod, xlab="Observations", ylab="Climate Model",
       yaxt="n", main="QQ-Plot Empirical Mean");
axis(side=2, at=seq(0,1500,by=200), labels=TRUE); abline(0,1)
qqplot(empsd_obs,empsd_mod, xlab="Observations", ylab="Climate Model",
       yaxt="n", main="QQ-Plot Empirical Standard Deviation")
axis(side=2, at=seq(0,300,by=50),labels=TRUE); abline(0,1)
plot(empmean_mod-empmean_obs,  empsd_mod-empsd_obs,
     xlab="Anomaly Emp. Mean", ylab="Anomaly Emp. St. Dev.",
     main="Anomalies of Empirical Mean and Standard Deviation")
abline(h=0, col="grey"); abline(v=0, col="grey")

# estimate GEV-parameters for the locations where the distribution
# is not rejected by KS-Test

# since the clustering algorithms require the data to be given in vector
# format, not in matrix format, the GEV-parameters are saved as vectors
# the vector "sel" contains the matrix positions of the GEV-distributed data
# points and is used to transform vector data to matrix data (see function
# "plot_sel_map")

sel <- which(1*(ks_test_mod>0.05)*(ks_test_obs>0.05)==1)

non_gev_data <- 1-1*(ks_test_mod>0.05)*(ks_test_obs>0.05) # this is still a 
# matrix, it will be needed for the plotting of the clusters later
non_gev_data[is.na(non_gev_data)] <- 0

stat_par_obs <- stat_par_mod <- matrix(nrow=length(sel), ncol=3)

plot_sel_map <- function(vector_data, hatching_area=numeric(length(sel)),...) {
  # like function "plot_map", but with input data in vector form and given only
  # for the GEV-distributed data points  
  matrix <- nullmatrix*NA
  matrix[sel] <- vector_data
  hatching_area_matrix <- nullmatrix
  hatching_area_matrix[sel] <- hatching_area
  plot_map(matrix, hatching_area=hatching_area_matrix,...)
}

for(s in 1:length(sel)) {
  # matrix positions of element number s
  i <- (sel[s]-1)%%nrow(non_gev_data)+1
  j <- floor((sel[s]-1)/nrow(non_gev_data))+1
  # parameter estimation
  par <- egevd(pre_obs[i,j,],method="pwme")$parameters
  stat_par_obs[s,] <- par
  par <- egevd(pre_mod[i,j,],method="pwme")$parameters
  stat_par_mod[s,] <- par
}


plot_sel_map(stat_par_mod[,1], lo=0, hi=1000, truncate=T,
             main="Location Parameter – Climate Model")
plot_sel_map(stat_par_obs[,1], lo=0, hi=1000, truncate=T,
             main="Location Parameter – Observations")
plot_sel_map(stat_par_mod[,2], lo=0, hi=200, truncate=T,
             main="Scale Parameter – Climate Model")
plot_sel_map(stat_par_obs[,2], lo=0, hi=200, truncate=T,
             main="Scale Parameter – Observations")
# the definition of the shape parameter varies in the literature. in the
# envstats-package, it is the negative of the definition used in the paper 
plot_sel_map(-stat_par_mod[,3], lo=-0.8, hi=0.8, truncate=T,
             main="Shape Parameter – Climate Model")
plot_sel_map(-stat_par_obs[,3], lo=-0.8, hi=0.8, truncate=T,
             main="Shape Parameter – Observations")

# 
# # parametric bootstrap: confidence intervals for GEV parameters and quantiles
# # (may take some time to calculate)
# confint_obs <- confint_mod <- matrix(nrow=length(sel), ncol=6)
# colnames(confint_obs) <- colnames(confint_mod) <- c("location.lower",
#     "location.upper","scale.lower","scale.upper","shape.lower","shape.upper")
# r <- 2500 
# par_bs <- matrix(nrow=r, ncol=3)
# for(i in 1:length(sel)) {
#   for(r in 1:nrow(par_bs)) {
#     par_bs[r,] <- egevd(rgevd(90,location=stat_par_obs[i,1],scale=
#       stat_par_obs[i,2],shape=stat_par_obs[i,3]),method="pwme")$parameters
#   }
#   confint_obs[i,] <- c(quantile(par_bs[,1],probs=c(0.025,0.975)),
#                        quantile(par_bs[,2],probs=c(0.025,0.975)),
#                        quantile(par_bs[,3],probs=c(0.025,0.975)))
# }
# 
# for(i in 1:length(sel)) {
#   for(r in 1:nrow(par_bs)) {
#     par_bs[r,] <- egevd(rgevd(90,location=stat_par_mod[i,1],scale=
#       stat_par_mod[i,2],shape=stat_par_mod[i,3]),method="pwme")$parameters
#   }
#   confint_mod[i,] <- c(quantile(par_bs[,1],probs=c(0.025,0.975)),
#                        quantile(par_bs[,2],probs=c(0.025,0.975)),
#                        quantile(par_bs[,3],probs=c(0.025,0.975)))
# }
# 
# 
# plot_sel_map(stat_par_mod[,1]-stat_par_obs[,1],truncate=TRUE,
#              lo=-500, hi=500,hatching=TRUE,
#              hatching_area=1*(confint_obs[,1] <= stat_par_mod[,1])*
#                              (stat_par_mod[,1] <= confint_obs[,2])*
#                              (confint_mod[,1] <= stat_par_obs[,1])*
#                              (stat_par_obs[,1] <= confint_mod[,2]),
#              main="Location Parameter – Anomaly")
# plot_sel_map(stat_par_mod[,2]-stat_par_obs[,2],truncate=TRUE,
#              lo=-100,hi=100,hatching=TRUE,
#              hatching_area=1*(confint_obs[,3] <= stat_par_mod[,2])*
#                              (stat_par_mod[,2] <= confint_obs[,4])*
#                              (confint_mod[,3] <= stat_par_obs[,2])*
#                              (stat_par_obs[,2] <= confint_mod[,4]),
#              main="Scale Parameter – Anomaly")
# plot_sel_map(-stat_par_mod[,3]+stat_par_obs[,3],
#              lo=-1, hi=1,hatching=TRUE,
#              hatching_area=1*(confint_obs[,5] <= stat_par_mod[,3])*
#                              (stat_par_mod[,3] <= confint_obs[,6])*
#                              (confint_mod[,5] <= stat_par_obs[,3])*
#                              (stat_par_obs[,3] <= confint_mod[,6]),
#              main="Shape Parameter – Anomaly")
# 
# 
# # confidence intervals for the 95%-quantiles
# confint_quantile_obs <- confint_quantile_mod <- matrix(nrow=length(sel), ncol=2)
# 
# quantiles_bs <- numeric(r)
# for(i in 1:length(sel)) {
#   for(r in 1:nrow(par_bs)) {
#     param <- egevd(rgevd(90,location=stat_par_mod[i,1],scale=stat_par_mod[i,2],
#                          shape=stat_par_mod[i,3]),method="pwme")$parameters
#     quantiles_bs[r] <- qgevd(0.95, location=param["location"], 
#                              scale=param["scale"], shape=param["shape"])
#   }
#   confint_quantile_mod[i,] <- c(quantile(quantiles_bs,probs=c(0.025,0.975)))
# }
# for(i in 1:length(sel)) {
#   for(r in 1:nrow(par_bs)) {
#     param <- egevd(rgevd(90,location=stat_par_obs[i,1],scale=stat_par_obs[i,2],
#                          shape=stat_par_obs[i,3]),method="pwme")$parameters
#     quantiles_bs[r] <- qgevd(0.95, location=param["location"], 
#                              scale=param["scale"], shape=param["shape"])
#   }
#   confint_quantile_obs[i,] <- c(quantile(quantiles_bs,probs=c(0.025,0.975)))
# }
# 
# quantile_obs <- qgevd(0.95, location=stat_par_obs[,1], scale=stat_par_obs[,2],
#                       shape=stat_par_obs[,3])
# quantile_mod <- qgevd(0.95, location=stat_par_mod[,1], scale=stat_par_mod[,2],
#                       shape=stat_par_mod[,3])
# 
# plot_sel_map(quantile_mod-quantile_obs, lo=-500, hi=500, truncate=TRUE,
#              hatching = TRUE,
#              hatching_area =1*(confint_quantile_obs[,1] <= quantile_mod)*
#                               (quantile_mod <= confint_quantile_obs[,2])*
#                               (confint_quantile_mod[,1] <= quantile_obs)*
#                               (quantile_obs <= confint_quantile_mod[,2]),
#              main="95%-quantiles – Anomaly")


############################### Clustering ###################################

# calculate matrix of extremal coefficients based on the precipitation data
# the extremal coefficients are only calculated for gev-distributed data, and
# the vector "sel" is used to exclude all other data points
c_extrcoeff_matrix <- function(pre) {
  n_grid <- length(sel)
  n_obs <-  dim(pre)[3]
  # first calculate the ranks of each gev distributed time series
  rank_matrix <- matrix(nrow=n_grid, ncol=n_obs)
  for(s in 1:length(sel)) {
    # matrix positions of element number s
    i <- (sel[s]-1)%%nrow(pre)+1
    j <- floor((sel[s]-1)/nrow(pre))+1
    for (k in 1:n_obs) {
      rank_matrix[s,k] <- sum(pre[i,j,k]<=pre[i,j,1:n_obs])
    }
  }
  extrcoeff_matrix <- matrix(0, ncol=n_grid, nrow=n_grid)
  # calculate upper triangular matrix of extremal coefficients (minus 1)
  for(i in 1:(n_grid-1)) { for(j in (i+1):n_grid) {
    v = mean(abs(rank_matrix[i,1:n_obs]-rank_matrix[j,1:n_obs]))/(2*(n_obs+1))
    extrcoeff_matrix[i,j] = min(1, (1+2*v)/(1-2*v) -1 )
  }}
  return(extrcoeff_matrix+t(extrcoeff_matrix))
}

# calculate dispersion matrix based on extremal coefficients and GEV parameter
# differences
c_dispersion_matrix <- function(extrcoeff_matrix, stat_par, lambda=0.5) {
  dispersion_matrix <-  (1-lambda)*extrcoeff_matrix
  for(i in 1:3) {
    pardiff <- matrix(stat_par[,i],ncol=length(sel),nrow=length(sel))
    pardiff <- abs(pardiff-t(pardiff))
    pardiff <- pardiff/max(pardiff)
    dispersion_matrix <- dispersion_matrix + lambda*pardiff/3
  }
  return(dispersion_matrix)
}

clustering <- function(dist_matrix, method="average") {
  return(hclust(as.dist(t(dist_matrix), diag=TRUE),method=method))
}

# implementation of the L-Method to determine the optimal number of clusters
cluster_number_l_method <- function(hc, x_ax=20:500, title="") {
  rmse_min <- function(line=coph_kurz, axis=x_ax) {
    rmse_c <- function(c, line=line, axis=axis,plot=FALSE) {
      b <- length(line)
      l1 <- line[1:c] ; l2 <- line[(c+1):b] ; p1 <- 1:c ; p2 <- (c+1):b
      lm1 <- lm(l1 ~ p1)
      lm2 <- lm(l2 ~ p2)
      if(plot==TRUE) {
        plot(axis, line, type="l", xlab="Number of Clusters",
             ylab="Distance between Clusters")
        abline(a=lm1$coefficients[1]-lm1$coefficients[2]*(min(axis)-1),
               b=lm1$coefficients[2], col="red")
        abline(a=lm2$coefficients[1]-lm2$coefficients[2]*(min(axis)-1),
               b=lm2$coefficients[2], col="blue")
      }
      return( ((c-1)*sqrt(mean(lm1$residuals^2)) + 
                 (b-c)*sqrt(mean(lm2$residuals^2)))/(b-1) )
    }
    rme_vector <- numeric(length(line)-2)
    for ( i in 1:(length(line)-2)) {
      rme_vector[i] <- rmse_c(i+1, line=line, axis=axis)
    }
    min <- which.min(rme_vector)+min(axis)
    plot(x_ax[2:(length(x_ax)-1)], xlab="Number of Clusters", ylab="RMSE", 
         rme_vector, type="l")
    rmse_c(which.min(rme_vector)+1, line=line, axis=axis, plot=TRUE)
    return(min)
  }
  k_clust <- rmse_min(line=rev(hc$height)[x_ax], axis=x_ax)
  return(k_clust)
}

cluster_number_threshold_method <- function(hc, threshold) {
  return(length(which(hc$height>threshold)))
}


plot_cluster_map <- function(cluster_data, seed=1,main="", 
                             k_clust=max(cluster_data, na.rm=T)) {
  # as "plot_sel_map", but with a different colour for each cluster and without
  # a legend
  set.seed(seed) # to determine color ordering
  # colors are randomised to prevent adjacent clusters from having
  # similar colors
  plot_sel_map(cluster_data, lo=0, hi=k_clust, legend=FALSE,
           colors = sample(colorRampPalette(kelly(22))(k_clust)),
           exclude=non_gev_data+missing_values, main=main)
}

# compute dissimilarity matrices
d0_matrix_obs <- c_extrcoeff_matrix(pre_obs)
d0_matrix_mod <- c_extrcoeff_matrix(pre_mod)
d025_matrix_obs <- c_dispersion_matrix(d0_matrix_obs,stat_par_obs,lambda=0.25)
d025_matrix_mod <- c_dispersion_matrix(d0_matrix_mod,stat_par_mod,lambda=0.25)


# different dissimilarity matrices and methods to determine the cluster number
# can be chosen here

hc_mod   <- clustering(d0_matrix_mod)
hc_obs   <- clustering(d0_matrix_obs)
#hc_mod   <- clustering(d025_matrix_mod)
#hc_obs   <- clustering(d025_matrix_obs)

# k_clust_mod = cluster_number_l_method(hc_mod,x_ax=10:250)
k_clust_mod = cluster_number_threshold_method(hc_mod,threshold=0.825)
k_clust_obs = cluster_number_threshold_method(hc_obs,threshold=0.825)
# k_clust_obs = cluster_number_threshold_method(hc_obs,threshold=0.65)

# using the matrix and the number of clusters, the clusterings are computed
clusters_mod <- cutree(hc_mod, k_clust_mod)
#plot_cluster_map(clusters_mod)
clusters_obs <- cutree(hc_obs, k_clust_obs)
#plot_cluster_map(clusters_obs)


################# Match the colors of the two clusterings ######################

# the colors of the two clusterings (for model and observational data) are
# matched in order to better see the similarities and differences

# the color matching is based on the percentage of the common area of clusters
# first, create a matrix with the area of each grid cell (in km^2)
# lon-values are evenly distributed
# lat_values: assume that the boundaries of a grid cell with lat-value lat[i]
# are (lat[i]+lat[i+1])/2 and (lat[i]+lat[i-1])/2 
area_lat <- numeric(length(lat))
lat_ext <- c(-90,lat,90)
for(i in 1:length(lat)) {
  area_lat[i] <- 2*pi*6371^2*abs(sin(pi/180*(lat_ext[i+2]+lat_ext[i+1])/2)-
                  sin(pi/180*(lat_ext[i+1]+lat_ext[i])/2))
}
area <- t(matrix(area_lat/length(lon),ncol=length(lon),nrow=length(lat)))


match_corresponding_clusters <- function(clusters_1, clusters_2) {
  # this function takes the two clusterings to be matched and returns
  # a vector with three rows. the first row contains the cluster numbers of the
  # first clustering, the second row the ones of the second matrix, and the
  # third row contains a new cluster number. clusters at a similar position in
  # both clusterings are assigned the same new value
  
  kcl_1 <- max(clusters_1) ; kcl_2 <- max(clusters_2)  # total cluster amount
  cluster_map_1 <- cluster_map_2 <- nullmatrix
  cluster_map_1[sel] <- clusters_1; cluster_map_2[sel] <- clusters_2
  
  # calculate for all pairs of clusters the percentage of their common area
  # first row: cluster number in "clusters_1", second row: cluster number in
  # "clusters_2", third row: common area in percent
  cluster_common_area <- matrix(nrow=kcl_1*kcl_2,ncol=3)
  cnt <- 0
  for(i in 1:kcl_1) for(j in 1:kcl_2) {
    cnt <- cnt + 1
    cluster_common_area[cnt,] <- c(i,j,
      sum((cluster_map_1==i)*(cluster_map_2==j)*area)/
      (sum((cluster_map_1==i)*area)+sum((cluster_map_2==j)*area)-
        sum((cluster_map_1==i)*(cluster_map_2==j)*area))*100)
  }
  # exclude cluster pairs with common area less than 30%, sort by common area 
  cluster_common_area <- cluster_common_area[
    which(cluster_common_area[,3]>30),]
  cluster_common_area <- cluster_common_area[order(-cluster_common_area[,3]),]
  
  # retain only pairs for which both clusters occur the first time in the list
  drop <- c()
  for(i in 2:dim(cluster_common_area)[1]) {
    if ((cluster_common_area[i,1] %in% cluster_common_area[1:(i-1),1])) {
      drop <- c(drop,i)
    }
    else if ((cluster_common_area[i,2] %in% cluster_common_area[1:(i-1),2])) {
      drop <- c(drop,i)
    }
  }
  cluster_pairs <- cluster_common_area[-drop, 1:2]
  
  # extend list by all still missing clusters (corresponding to NA)
  clusters_left_1 <- (1:kcl_1)[-cluster_pairs[,1]]
  clusters_left_2 <- (1:kcl_2)[-cluster_pairs[,2]]
  
  cluster_pairs <- rbind(cluster_pairs, 
                      cbind(clusters_left_1,rep(NA,length(clusters_left_1))),
                      cbind(rep(NA,length(clusters_left_2)),clusters_left_2))
  # add new cluster number in a third column
  cluster_pairs <- cbind(cluster_pairs, 1:(nrow(cluster_pairs)))
  return(cluster_pairs)
}

change_cluster_colors <- function(cluster_matrix, cluster_pairs) {
  for(i in 1:length(cluster_matrix)) {
    cluster_matrix[i] <- cluster_pairs[which(
          cluster_pairs[,1]==cluster_matrix[i]),2]
  }
  return(cluster_matrix)
}

cluster_pairs <- match_corresponding_clusters(clusters_mod, clusters_obs)
clusters_mod_match <- change_cluster_colors(clusters_mod, cluster_pairs[,-2])
clusters_obs_match <- change_cluster_colors(clusters_obs, cluster_pairs[,-1])

plot_cluster_map(clusters_mod_match, k_clust=dim(cluster_pairs)[1],
                 main="Clusters – Climate Model")
plot_cluster_map(clusters_obs_match, k_clust=dim(cluster_pairs)[1],
                 main="Clusters – Observations")
