# load libraries
library(plyr)
library(lme4)
## Loading required package: Matrix
library(ggplot2)
library(raster)
## Loading required package: sp

The measurement data

Airbase query retrieving validated, verified annual average PM2.5 values: http://aidef.apps.eea.europa.eu/?source=%7B%22query%22%3A%7B%22bool%22%3A%7B%22must%22%3A%7B%22bool%22%3A%7B%22must%22%3A%5B%7B%22term%22%3A%7B%22Validity%22%3A%22Valid%22%7D%7D%2C%7B%22term%22%3A%7B%22AggregationType%22%3A%22Annual%20mean%20%2F%201%20calendar%20year%22%7D%7D%2C%7B%22term%22%3A%7B%22Verification%22%3A%22Verified%22%7D%7D%5D%7D%7D%2C%22filter%22%3A%7B%22bool%22%3A%7B%22should%22%3A%5B%7B%22term%22%3A%7B%22Pollutant%22%3A%22Particulate%20matter%20%3C%202.5%20%C2%B5m%20(aerosol)%22%7D%7D%5D%7D%7D%7D%7D%2C%22display_type%22%3A%22tabular%22%7D

Airbase query retrieving information about the stations where PM2.5 is measured: http://aided.apps.eea.europa.eu/?source=%7B%22query%22%3A%7B%22bool%22%3A%7B%22must%22%3A%7B%22match_all%22%3A%7B%7D%7D%2C%22filter%22%3A%7B%22bool%22%3A%7B%22should%22%3A%5B%7B%22term%22%3A%7B%22Pollutant%22%3A%22Particulate%20matter%20%3C%202.5%20%C2%B5m%20(aerosol)%22%7D%7D%5D%7D%7D%7D%7D%2C%22display_type%22%3A%22tabular%22%7D

Data are loaded and station data are coupled to measurements.

rm(list = ls())
wd <- "D:/SHERPA/Chimere_Emep_Comparison/CTM_validation"
setwd(wd)

# read Airbase PM2.5 measurements
ab.pm25.csv <- "Airbase PM25 measurements.csv"
ab.pm25.df <- read.table(ab.pm25.csv, sep = ",", quote = '"', header = TRUE)
names(ab.pm25.df)[1] <- "Country" # replace  "ï..CountryOrTerritory"
names(ab.pm25.df)[which(names(ab.pm25.df) == "SamplingPointLocalId")] <- "SamplingPointId"
# NROW(ab.pm25.df)

# remove 'not valid' data. (In the query only valid was selected, though!)
ab.pm25.df <- ab.pm25.df[ab.pm25.df$Validity == "Valid",]

# read Airbase PM2.5 stations
ab.pm25.stations.csv <- "Airbase PM25 stations.csv"
stations.df <- read.table(ab.pm25.stations.csv, sep = ",", quote = '"', header = TRUE)
names(stations.df)[1] <- "Country" # replace  "ï..CountryOrTerritory"
stations.df <- unique(stations.df[, c("Country", "SamplingPointId", "Latitude", "Longitude", "StationType", "StationArea")])
# NROW(stations.df)

# couple the data on "SamplingPointId" (called "SamplingPointLocalId" in the AQ dataset)
ab.pm25.df <- merge(ab.pm25.df, stations.df, by = c("SamplingPointId", "Country"))
# NROW(ab.pm25.df)
ab.pm25.all.df <- ab.pm25.df

# Also older data are available on the EEA site
# ----------------------------------------------
Airbase_v7_statistics.file <- "AirBase_v7_statistics/AirBase_v7_statistics.csv"
ab.v7.df <- read.table(Airbase_v7_statistics.file, header = T, sep = "\t")
# select only PM2.5
ab.v7.df <- ab.v7.df[ab.v7.df$component_name == "Particulate matter < 2.5 µm (aerosol)" & ab.v7.df$statistic_name == "annual mean",]
NROW(ab.v7.df)
## [1] 0
# range(ab.v7.df$statistics_year)
# table(ab.v7.df$statistics_year)
# Many data are available for 2009 to 2011 but the station codes do not correspond to the ones in the e-reporting.

# read stations for old data
Airbase_v7_statistics.file <- "AirBase_v7_stations/AirBase_v7_stations.csv"
ab.v7.stations.df <- read.table(Airbase_v7_statistics.file, header = T, sep = "\t", quote = '"')

# add the station info to the measurements
ab.v7.df <- merge(ab.v7.df, ab.v7.stations.df)

Check the number of stations per year. Only from 2013 onwards a lot of stations are available in the e-reporting. The years 2011 and 2012 are excluded from the trend analysis. Also stations for which not all 6 years are available are excluded.

# data per year
table(ab.pm25.df$ReportingYear)
## 
## 2011 2012 2013 2014 2015 2016 2017 2018 
##   10    7  862 1087 1151 1324 1442 1474
# exclude 2011 and 2012 for trend analysis
ab.pm25.df <- ab.pm25.df[ab.pm25.df$ReportingYear > 2012,]

# count measurements per station
meas.per.station <- ddply(ab.pm25.df, c("SamplingPointId"), summarize, n.meas = NROW(AQValue))

There are enough stations for which 6 years are available. Data are normalized with the average concentration. We want to see the trends, not absolute values.

# number of stations with 1, 2, ...6 measurements
table(meas.per.station$n.meas)
## 
##   1   2   3   4   5   6 
## 225 197 223 257 310 579
# take only complete time series
ab.pm25.df <- ab.pm25.df[ab.pm25.df$SamplingPointId %in% meas.per.station$SamplingPointId[meas.per.station$n.meas == 6],]

# normalize with first year (2013)

# remove stations with zero annual average (24)
zero.stations <- as.vector(unique(ab.pm25.df$StationLocalId[ab.pm25.df$AQValue == 0]))
ab.pm25.df <- ab.pm25.df[!(ab.pm25.df$StationLocalId %in% zero.stations),]

# calculate mean concentration
ab.pm25.mean.df <- ddply(ab.pm25.df, c("SamplingPointId"), summarise, AQValueMean = mean(AQValue))

# attach them to the original dataset
ab.pm25.df <- merge(ab.pm25.df, ab.pm25.mean.df, by = "SamplingPointId")

# normalize concentrations with the average value
ab.pm25.df$AQValueNorm <- ab.pm25.df$AQValue / ab.pm25.df$AQValueMean

# substract 2013 from all the reporting years
ab.pm25.df$ReportingYear <- ab.pm25.df$ReportingYear - 2013

Trend analysis

There are 567 Sample points left with valid data for 6 years. First a simple linear model is fitted for the normalized AQValue as a function of ReportingYear. For this model the residuals are checked against fitted values and other covariates.

Fit a linear model

# fit a linear model
pm25.lm <- lm(AQValueNorm ~ ReportingYear, data = ab.pm25.df)
summary(pm25.lm)
## 
## Call:
## lm(formula = AQValueNorm ~ ReportingYear, data = ab.pm25.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.98836 -0.06437 -0.00717  0.06049  0.63831 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.054482   0.003194  330.17   <2e-16 ***
## ReportingYear -0.021793   0.001055  -20.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1062 on 3472 degrees of freedom
## Multiple R-squared:  0.1095, Adjusted R-squared:  0.1092 
## F-statistic: 426.8 on 1 and 3472 DF,  p-value: < 2.2e-16
pm25.lm.res <- data.frame(ab.pm25.df[, c("ReportingYear", "AQValueNorm", "SamplingPointId", "StationType", "Country")],
                          fitted = fitted.values(pm25.lm), resid = resid(pm25.lm))

p <- ggplot(aes(x=fitted, y=resid), data = pm25.lm.res)
p <- p + geom_point() + geom_smooth(method = 'lm')
print(p)
## `geom_smooth()` using formula 'y ~ x'

p <- ggplot(aes(x=factor(ReportingYear), y=resid), data = pm25.lm.res)
p <- p + geom_boxplot() + labs(x = "Year since 2013", y = "Residuals", title = "Residuals vs Reporting year")
print(p)

# A model with StationType was tested but it was not significant
p <- ggplot(aes(x=StationType, y=resid), data = pm25.lm.res)
p <- p + geom_boxplot() + labs(x = "Year since 2013", y = "Residuals", title = "Residuals vs StationType")
print(p)

# plot residuals versus Station per country
p <- ggplot(aes(x=SamplingPointId, y=resid), data = pm25.lm.res[pm25.lm.res$Country == "Italy",])
p <- p + geom_boxplot() + labs(x = "Sampling point", y = "Residuals", title = "Residuals vs Sampling point") + geom_abline(slope = 0, intercept = 0, col = "red")
print(p)

There is a trend of -2.2% per year. StationType has no significant effect on this trend. Residuals per Sampling Point are all quite well centred around zero.

Fit a linear mixed model

Though there are no strong trends in residuals per SamplingPoint it is interesting to know the spread around the average trend, i.e. the spread around the average slope. A mixed model is fitted with random effects for intercept and slope around the genereal trend.

# fit mixed model
pm25.lme1 <- lmer(AQValueNorm ~ ReportingYear + (ReportingYear | SamplingPointId), 
                 data = ab.pm25.df)
## boundary (singular) fit: see ?isSingular
summary(pm25.lme1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: AQValueNorm ~ ReportingYear + (ReportingYear | SamplingPointId)
##    Data: ab.pm25.df
## 
## REML criterion at convergence: -5930.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -10.1426  -0.5942  -0.0768   0.5669   4.9876 
## 
## Random effects:
##  Groups          Name          Variance  Std.Dev. Corr 
##  SamplingPointId (Intercept)   0.0046543 0.06822       
##                  ReportingYear 0.0007447 0.02729  -1.00
##  Residual                      0.0091049 0.09542       
## Number of obs: 3474, groups:  SamplingPointId, 579
## 
## Fixed effects:
##                Estimate Std. Error t value
## (Intercept)    1.054482   0.004034  261.38
## ReportingYear -0.021793   0.001478  -14.74
## 
## Correlation of Fixed Effects:
##             (Intr)
## ReportingYr -0.916
## convergence code: 0
## boundary (singular) fit: see ?isSingular

The standard deviation of the slope is 2.7%. But there is a warning that the model matrix is close to singular. Therefor a simpler analysis is done below. A linear model is fitted per station and than the distribution of the slopes per station is visualized to get an idea of the spread.

pm25.list <- lmList(AQValueNorm ~ ReportingYear | SamplingPointId, data = ab.pm25.df)
coefs.pm25.list <- coefficients(pm25.list)

quantile(coefs.pm25.list$`(Intercept)`, probs = c(0.025, 0.25, 0.5, 0.75, 0.975))
##      2.5%       25%       50%       75%     97.5% 
## 0.8538356 1.0132439 1.0644658 1.1053497 1.2235878
quantile(coefs.pm25.list$ReportingYear, probs = c(0.025, 1/6, 0.25, 0.5, 0.75, 5/6, 0.975))
##         2.5%    16.66667%          25%          50%          75%    83.33333% 
## -0.089435130 -0.048771620 -0.042139896 -0.025786334 -0.005297546  0.006726362 
##        97.5% 
##  0.058465757

Two thirds of the slopes lie between -4.9% and +0.7%. This corresponds with the standard deviation on the slope of 2.7% found in the mixed model.

Conclusion

There is a considerable trend in the PM2.5 concentration data of -2.2% per year. There are 5 years between the model years (2009, 2014). This period does not correspond to the data analysed here.

Validation of the base years

CHIMERE validation

# # This is CRAZY run this first and than read the result from the file in a second run
# # the column 'PM25.chimere' is not added or an older cashed variable is read
# chimere.2009.nc <- "D:/handover/SHERPA_PM25_Atlas/20170322_v18_SrrResults_PotencyBased/2_base_concentrations/BC_conc_PM25_Y.nc"
# chimere.2009.r <- raster(chimere.2009.nc, varname = 'conc')
# 
# # select data for 2009 for Background, Industrial and Traffic stations. (Remove Unknown)
# ab.2009.df <- ab.v7.df[ab.v7.df$statistics_year == 2009 & ab.v7.df$type_of_station != "Unknown",
#                        c("country_iso_code", "type_of_station", "statistic_value", "station_longitude_deg", "station_latitude_deg")]
# 
# # look up CHIMERE prediction for each station
# station.coords <- as.matrix(ab.2009.df[,c("station_longitude_deg", "station_latitude_deg")])
# ab.2009.df$PM25.chimere <- extract(chimere.2009.r, station.coords)
# # remove stations for which no model value exists (overseas areas)
# ab.2009.df <- ab.2009.df[!(is.na(ab.2009.df$PM25.chimere)),]
# 
# write.table(ab.2009.df, "airbase2009_chimere.csv", sep = ",", row.names = F)
chimere.data <- read.table("airbase2009_chimere.csv", header=T, sep=",")
# chimere.data <- ab.2009.df 

# fit a linear model
lm.chimere <- lm(formula = formula("statistic_value ~ 0 + PM25.chimere + type_of_station"),
                 data = chimere.data)
summary(lm.chimere)
## 
## Call:
## lm(formula = formula("statistic_value ~ 0 + PM25.chimere + type_of_station"), 
##     data = chimere.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.761  -2.962  -0.904   1.548  60.498 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## PM25.chimere               1.04600    0.04047  25.849  < 2e-16 ***
## type_of_stationBackground  1.18770    0.60972   1.948  0.05167 .  
## type_of_stationIndustrial  2.25949    0.72832   3.102  0.00197 ** 
## type_of_stationTraffic     2.70468    0.67397   4.013 6.38e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.681 on 1152 degrees of freedom
## Multiple R-squared:  0.8998, Adjusted R-squared:  0.8995 
## F-statistic:  2588 on 4 and 1152 DF,  p-value: < 2.2e-16
# the intercept is not significant

# add residuals to the data
chimere.data$residuals <-  residuals(lm.chimere)
chimere.data$fitted.values <-  fitted.values(lm.chimere)

# residuals versus fitted values
p <- ggplot(aes(x=fitted.values, y=residuals), data = chimere.data) + geom_point()
print(p)

# residuals versus station type
p <- ggplot(aes(x=type_of_station, y=residuals), data = chimere.data) + geom_boxplot()
print(p)

# residuals versus country
p <- ggplot(aes(x=country_iso_code, y=residuals), data = chimere.data) + geom_boxplot()
print(p)

Residual plots look good. Only for a few small periferal countries the residuals are not centred around zero. The CHIMERE model underestimates the concentration by 4.4% and the standard error is 5.7 g/m^3. The residual error is rather constant over the range of concentrations.

sim.df <- data.frame(PM25.chimere = 1:74, type_of_station = "Background")
PM25pred <- predict.lm(lm.chimere, newdata = sim.df, se.fit = T, type = "response")
sim.df$fit <- PM25pred$fit
sim.df$se.fit <- PM25pred$se.fit

p <- ggplot(aes(x=PM25.chimere, y=fit), data = sim.df) + geom_line()
p <- p + geom_abline(intercept = 0, slope = 1, col = "red")
p <- p + geom_point(data = chimere.data[chimere.data$type_of_station == "Background",], aes(x=PM25.chimere, y=statistic_value))
p <- p + lims(x = c(0, 30), y = c(0, 30)) + theme(text = element_text(size=15))
p <- p + labs(x = "Modelled PM2.5 w/ CHIMERE (ug/m3)", y = "Measured PM2.5", 
              title = "Modeled (CHIMERE)\nvs. Measured PM2.5 for background stations")
p 
## Warning: Removed 47 row(s) containing missing values (geom_path).
## Warning: Removed 40 rows containing missing values (geom_point).

png("CHIMERE_validation.png")
print(p)
## Warning: Removed 47 row(s) containing missing values (geom_path).

## Warning: Removed 40 rows containing missing values (geom_point).
dev.off()
## png 
##   2

EMEP

Fit a linear model

# read the EMEP data
emep.2014.nc <- "D:/handover/SHERPA GUI models/emep_10km_fua/base_concentrations/BC_conc_PM25_Y.nc"
emep.2014.r <- raster(emep.2014.nc, varname = 'conc')
## Loading required namespace: ncdf4
# select data for 2014 for Background, Industrial and Traffic stations. 
ab.2014 <- ab.pm25.all.df[ab.pm25.all.df$ReportingYear == 2014,]

# look up CHIMERE prediction for each station
station.coords <- as.matrix(ab.2014[, c("SamplingPoint_Longitude", "SamplingPoint_Latitude")])
ab.2014$PM25.emep <- extract(emep.2014.r, station.coords)
# remove stations for which no model value exists (overseas areas)
ab.2014 <- ab.2014[!is.na(ab.2014$PM25.emep),]

# fit a linear model
lm.emep <- lm(AQValue ~ PM25.emep + StationType, data = ab.2014)
summary(lm.emep)
## 
## Call:
## lm(formula = AQValue ~ PM25.emep + StationType, data = ab.2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.4066  -2.6559  -0.7713   1.6441  23.2328 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            5.57588    0.39609  14.077  < 2e-16 ***
## PM25.emep              0.76612    0.03068  24.969  < 2e-16 ***
## StationTypeIndustrial -1.32050    0.50573  -2.611  0.00916 ** 
## StationTypeTraffic     0.90530    0.32593   2.778  0.00557 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.446 on 1035 degrees of freedom
## Multiple R-squared:  0.3829, Adjusted R-squared:  0.3811 
## F-statistic:   214 on 3 and 1035 DF,  p-value: < 2.2e-16
# add residuals to the data
ab.2014$residuals <-  residuals(lm.emep)
ab.2014$fitted.values <-  fitted.values(lm.emep)

# residuals versus fitted values
p <- ggplot(aes(x=fitted.values, y=residuals), data = ab.2014) + geom_point() + geom_smooth()
print(p)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

There is a strong trend in the residuals versus fitted values. A straight line between modelled and measured concentration doesn’t work here.

Fit a log-log model

# fit a log-log linear model
lm.log.emep <- lm(log(AQValue) ~ log(PM25.emep) + StationType, data = ab.2014)
summary(lm.log.emep)
## 
## Call:
## lm(formula = log(AQValue) ~ log(PM25.emep) + StationType, data = ab.2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6223 -0.1633 -0.0145  0.1487  1.1314 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.03128    0.05221  19.754  < 2e-16 ***
## log(PM25.emep)         0.65664    0.02145  30.614  < 2e-16 ***
## StationTypeIndustrial -0.07163    0.03179  -2.253   0.0245 *  
## StationTypeTraffic     0.09057    0.02049   4.420 1.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2795 on 1035 degrees of freedom
## Multiple R-squared:  0.4822, Adjusted R-squared:  0.4807 
## F-statistic: 321.3 on 3 and 1035 DF,  p-value: < 2.2e-16
# add residuals to the data
ab.2014$residuals.loglm <-  residuals(lm.log.emep)
ab.2014$fitted.values.loglm <-  fitted.values(lm.log.emep)

# residuals versus fitted values
p <- ggplot(aes(x=fitted.values.loglm, y=residuals.loglm), data = ab.2014) + geom_point() + geom_smooth()
print(p)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# This is much better

# residuals versus station type
p <- ggplot(aes(x=StationType, y=residuals.loglm), data = ab.2014) + geom_boxplot() + lims(y = c(-2.5, 2.5))
print(p)

# residuals versus country
p <- ggplot(aes(x=Country, y=residuals.loglm), data = ab.2014) + geom_boxplot() 
print(p)

# quite some spatial correlation too.

The log-log model shows no trends in the residuals versus fitted values. Also the residuals per StationType are ok. There are some countries with residuals that are not centred around zero (some spatial component should be added).

This means that to use the model to predict an annual average PM2.5 concentration this is best done by appling the following power law for a background location:

PM2.5_measured = 2.8 * (PM2.5_EMEP) ^ 0.66

For Traffic stations the formula above has to be multiplied with 1.09. At traffic stations the model underestimates the measurements with about 9.5%. For Traffic stations the formula above has to be multiplied with 0.93 but this model parameter is not significant. At industrial stations the model overestimates the measurements with about 6.9%.

The residual standard error of the log model is 0.28. This means that the predictions have a 95% confidence interval between 0.57 and 1.75 times the predicted value.

sim.df <- data.frame(PM25.emep = 1:74, StationType = "Background")
PM25pred <- predict.lm(lm.log.emep, newdata = sim.df, se.fit = T, type = "response")
sim.df$fit <- exp(PM25pred$fit)
sim.df$se.fit <- PM25pred$se.fit

p <- ggplot(aes(x=PM25.emep, y=fit), data = sim.df) + geom_line()
p <- p + geom_abline(intercept = 0, slope = 1, col = "red")
p <- p + geom_point(data = ab.2014[ab.2014$StationType == "Background",], aes(x=PM25.emep, y=AQValue))
p <- p + lims(x = c(0, 30), y = c(0, 30)) + theme(text = element_text(size=15))
p <- p + labs(x = "Modelled PM2.5 w/ EMEP (ug/m3)", y = "Measured PM2.5", 
              title = "Modeled (EMEP) vs.\nMeasured PM2.5 for background stations")
p
## Warning: Removed 44 row(s) containing missing values (geom_path).
## Warning: Removed 17 rows containing missing values (geom_point).

png("EMEP_validation.png")
print(p)
## Warning: Removed 44 row(s) containing missing values (geom_path).

## Warning: Removed 17 rows containing missing values (geom_point).
dev.off()
## png 
##   2

The power law describing the relation between model and measurements does not differ so much from a straight line but heteroskedasticity is captured much better. The EMEP model has a relative standard error between -24 and 32. Industrial stations are overpredicted by 7%. Traffic stations are underpredicted by 9%.