###
###  PURPOSE:  Example of model building towards linear model
###            in a situation in which the main task is to determine
###            which factors (out of a larger group of possible explanatory variables)
###            are associated the most with the outcome.
###            More or less the same strategy would be recommended if
###            the prediction model is the primary aim.
###
###            The script below has been motivated by the analyzis
###            which results are presented in the manuscript:
###        
###            Ondřej Hradský and Arnošt Komárek (2020).
###            Demographic and public health characteristics explain large part
###            of variability in COVID-19 mortality across countries.
###            To appear in: European Journal of Public Health.
###
###            The final model found here is a bit different from that presented in the paper
###            due to the fact that for the paper, the model was built using data that include Singapore.
###            Here, Singapore is removed from data before the model building stage for reasons
###            mentioned below. Nevertheless, main conclusions based on model found here
###            do not considerably differ from those presented in the paper.
###
###  OUTCOME VARIABLE: Reported mortality (number of deaths per million of inhabitants) on/with COVID-19
###                    per country as of 28 May 2020 (~ after the "first" COVID-19 wave).
###
###  EXPLANATORY VARIABLES: various demographic, geographic and public health characteristics of countries
###                         - see an extra document for their list as well as for the list of analyzed countries
###
###
###  PROGRAMMER:  Arnošt Komárek
###  LOG:         created on 22/11/2020 for the purpose of the Linear Regression lectures at MFF UK             
###
### ============================================================================================================

rm(list = ls())
ROOT  <- "/home/komarek/teach/mff_2020/nmsa407_LinRegr/RkoModelBuild/"
setwd(ROOT)


##### Load a "clean" dataset (was already pre-processed)
##### ================================================================
(load(paste(ROOT, "/Data/COVID4mff.RData", sep = "")))

dim(Data)
head(Data)
summary(Data)

    ### Numeric (or 0/1) explanatory variables
xVar  <- c("Male80", "Female80", "Popul80", "Popul65", "PopulMid", "PopulData", "PopulDens",
           "Female", "Urban", "LifeExpect", "MortNeontl", "MortDis",
           "CauseInjry", "CauseNonCm", "CauseCommn", "GDP", "TBC", "Diabet",
           "Obesity", "HIV", "Smoking", "Physicns", "Beds", "ImmunMeas",
           "TempMarch", "HT.men", "HT.women", "HT", "BCG", "Time")

    ### Categorical explanatory variables
zVar  <- c("fBCG", "Haplo")       ### fBCG is a factor that coincides with 0/1 BCG 'numeric' variable

    ### Outcome variables
yVar  <- c("deaths.permil", "l10deaths.permil")     ### l10deaths.permil = log10(1 + deaths.permil)


##### Very basic exploration (much more is needed to get insight to data)
##### Here only steps related to model building are shown
##### =====================================================================

    ### For several reasons, it is useful if the numeric explanatory variables have a similar scale
    ### (std. deviations of a similar magnitude)
    ### ---------------------------------------------------------------------------------------------
sapply(subset(Data, select = xVar), sd, na.rm = TRUE)
         ### Seems to be OK here -> well, this problem has already been treated in a "cleaning" stage...

    ### How many missing values do we have on each variable?
    ### ---------------------------------------------------------------------------------------------
sapply(lapply(subset(Data, select = c(yVar, xVar, zVar)), is.na), sum)

        ### Highest counts of missing values out of 210 (= sample size)
        ###   Haplo:     155
        ###   Obesity:   102
        ###   HIV:        72
        ###   Smoking:    70
        ### ----------------------
        ###   BCG:        50
        ###
        ### Haplo, Obesity, HIV, Smoking clearly problematic (1/3 or more missing).
        ### Even though they might be associated with the outcome, it is not possible
        ### to evaluate their effect using this dataset. In the following,
        ### Haplo, Obesity, HIV, Smoking are no more considered as explanatory variables.
(xVar  <- xVar[!(xVar %in% c("Haplo", "Obesity", "HIV", "Smoking"))])
zVar  <- "fBCG"

    ### How many coutries (do not) have all variables of interest available?
    ### -----------------------------------------------------------------------
(nMISS  <- apply(as.data.frame(lapply(subset(Data, select = c(yVar, xVar, zVar)), is.na)), 1, sum))
sum(nMISS == 0)      ## 139
sum(nMISS > 0)       ##  71

      ### For model building, we need all considered variables available for all countries.
      ### Which countries must be dropped from data?
table(nMISS)
subset(Data, nMISS > 0, select = c("geoId", "CntrName"))
           ### If those countries not included, not that much is lost.
           ### Most of them are either very small or with a bit questionable COVID reporting.

    ### Dataset for the analyzis
    ### ---------------------------------
DataAna  <- subset(Data, nMISS == 0)
dim(DataAna)
apply(as.data.frame(lapply(subset(DataAna, select = c(yVar, xVar, zVar)), is.na)), 1, sum)     ### just to check...


##### Marginal associations of covariates to outcome
##### --> more just as exploratory
##### =================================================

    ### Categorical covariate
par(mfrow = c(1, 2))
plot(deaths.permil ~ fBCG, data = DataAna, col = "chocolate1")
plot(l10deaths.permil ~ fBCG, data = DataAna, col = "chocolate1")
        ### For the following, remember that we have only one categorical variable fBCG
        ### which is included also among the numeric variables as 0/1 BCG.

    ### Numeric covariates
    ### - here only against log10(deaths.permil) which appears later
    ###   as more suitable as outcome rather than deaths.permil itself
length(xVar)

par(mfrow = c(3, 3))
for (i in 1:9){
  FORM  <- formula(paste("l10deaths.permil ~", xVar[i]))  
  plot(FORM, data = DataAna, pch = 23, col = "red3", bg = "gold")
  abline(lm(FORM, data = DataAna), col = "red2", lwd = 1)
}    

        ### Leverages by just one variable?
subset(DataAna, PopulDens > 20)      ## Singapore
subset(DataAna, PopulData > 4)       ## China, India
subset(DataAna, Female < 45)         ## Equatorial_Guinea, Kuwait, Oman, Qatar, Saudi Arabia, United Arab Emirates

      ### +++++++++++++++++++++++++++++++

par(mfrow = c(3, 3))
for (i in 10:18){
  FORM  <- formula(paste("l10deaths.permil ~", xVar[i]))  
  plot(FORM, data = DataAna, pch = 23, col = "red3", bg = "gold")
  abline(lm(FORM, data = DataAna), col = "red2", lwd = 1)
}    

par(mfrow = c(3, 3))
for (i in 19:27){
  FORM  <- formula(paste("l10deaths.permil ~", xVar[i]))  
  plot(FORM, data = DataAna, pch = 23, col = "red3", bg = "gold")
  abline(lm(FORM, data = DataAna), col = "red2", lwd = 1)
}    


##### Leverages
##### =================================================
m0  <- lm(formula(paste("l10deaths.permil ~", paste(xVar, collapse = "+"))), data = DataAna)
hh  <- hatvalues(m0)
hh[order(hh)]
      ## --> really extreme for Singapore (h = 0.96)
      ## --> from previous, we already know why,
      ##     it has quite extreme population density as compared to the rest of data

popDens  <- DataAna[, "PopulDens"]
names(popDens)  <- DataAna[, "geoId"]
popDens[order(popDens)]

      ## To avoid possibly dangerous influence of Singapore on results,
      ## we remove it from the analyzis dataset. Here, an explanation
      ## might be that we want to restrict our attention on just countries
      ## with a "reasonable" population density (of at most 13*100 people/km2).

DataAna  <- subset(DataAna, geoId != "SG")
dim(DataAna)

     ## How about the leverages now?
     ## (centroid of the covariate values has changed by removal of Singapore)
m1  <- lm(formula(paste("l10deaths.permil ~", paste(xVar, collapse = "+"))), data = DataAna)
hh1  <- hatvalues(m1)
hh1[order(hh1)]
          ### pretty OK


##### Multicollinearity
##### =================================================
car::vif(m1)
summary(m1)
        ### Hmmm, we forgot that Popul80 = 0.5*(Male80 + Female80)
        ###                      HT = 0.5*(HT.men + HT.women)

xVar2  <- xVar[!(xVar %in% c("Popul80", "HT"))]
m2  <- lm(formula(paste("l10deaths.permil ~", paste(xVar2, collapse = "+"))), data = DataAna)
car::vif(m2)
summary(car::vif(m2))
        ### We seem to have BIG problems (as expected if we think about the variables).


     ### Pairwise associations to reveal the most profound problems  
     ### -------------------------------------------------------------     
psych::pairs.panels(subset(DataAna, select = xVar2),
         bg = "palegoldenrod", col = "red3", pch = 21,
         ellipses = FALSE, smooth = FALSE, lm = TRUE, hist.col = "lightblue", rug = FALSE)
           ### a bit too much...

RR  <- cor(subset(DataAna, select = xVar2))
print(RR)

highR1  <- data.frame(x = character(0), y = character(0), r = numeric(0))
highR2  <- data.frame(x = character(0), y = character(0), r = numeric(0))
highR3  <- data.frame(x = character(0), y = character(0), r = numeric(0))
for (j in 1:(ncol(RR) - 1)){
  for (i in (j+1):nrow(RR)){
    if (abs(RR[i, j]) > 0.9) highR1  <- rbind(highR1, data.frame(x = rownames(RR)[i], y = colnames(RR)[j], r = RR[i, j]))
    if (abs(RR[i, j]) <= 0.9 & abs(RR[i, j]) > 0.8) highR2  <- rbind(highR2, data.frame(x = rownames(RR)[i], y = colnames(RR)[j], r = RR[i, j]))
    if (abs(RR[i, j]) <= 0.8 & abs(RR[i, j]) > 0.7) highR3  <- rbind(highR3, data.frame(x = rownames(RR)[i], y = colnames(RR)[j], r = RR[i, j]))    
  }      
}    

print(highR1)
print(highR2)
print(highR3)

par(mfrow = c(2, 2))
for (i in 1:nrow(highR1)){
  plot(DataAna[, highR1[i, "x"]], DataAna[, highR1[i, "y"]], xlab = highR1[i, "x"], ylab = highR1[i, "y"], pch = 23, col = "red3", bg = "gold")
  title(main = paste("r = ", format(round(highR1[i, "r"], 2), nsmall = 2)))  
}    

     ### Actions:
     ###    (Male80, Female80, Popul65) --> Popul80 = 0.5*(Male80 + Female80)
     ###                  which represents age of the population
     ###
     ###    cor(CauseCommn, CauseNonCm) = -0.98
     ###       at the same time, both CauseCommn and CauseNonCm highly correlated with MortNeontl and LifeExpect
     ###       -> drop both CauseCommn and CauseNonCm (they have somehow inclear interpretation anyway)
     ###       -> keep MortNeontl and LifeExpect even though they are also quite correlated (-0.867)
     ###   
     ###    (HT.women, HT.men) --> HT = 0.5*(HT.women + HT.men)
     ###
     ###    Do not care about cor(Physicns, Female80) = 0.81 now, also do not care about other <0.80 correlations.

xVar3  <- c("Popul80", "PopulMid", "PopulData", "PopulDens", "Female", "Urban",
            "LifeExpect", "MortNeontl", "MortDis", "CauseInjry",
            "GDP",  "TBC", "Diabet", "Physicns", "Beds", "ImmunMeas",
            "TempMarch", "HT", "BCG", "Time")

m3  <- lm(formula(paste("l10deaths.permil ~", paste(xVar3, collapse = "+"))), data = DataAna)
car::vif(m3)
          ### Pretty reasonable VIFs.
          ### Perhaps, either LifeExpect or MortNeontl could still be dropped to decrease VIF,
          ### but let's the future model building steps decide. In any case, both LifeExpect and MortNeontl
          ### should not appear jointly in a final model.


##### Transformation of response?
##### =================================================

   ### Initial models that include all two-way interactions.
   ### For purpose of deciding on possible response transformation,
   ### "safely correct" regression space is useful, possible multicollinearity 
   ### is not an issue (fitted values are not influenced by it)
m4  <- lm(formula(paste("deaths.permil ~", paste(xVar3, collapse = "+"))), data = DataAna)
m5  <- lm(formula(paste("l10deaths.permil ~", paste(xVar3, collapse = "+"))), data = DataAna)

mffSM::plotLM(m4)      ### Not really nice...
mffSM::plotLM(m5)      ### It could hardly be nicer!

       ### log10(deaths.permil) will be used as outcome


##### Short sidenote
##### ==================

### At some point, we will also consider two-way interactions.
### But not with BCG. The reason is that it is in fact categorical
### and one of categories is pretty small:
table(DataAna[, "BCG"])
xVar3wthBCG  <- xVar3[xVar3 != "BCG"]


##### Model building by an automatic stepwise selection
##### =================================================
help("step", package = "stats")

    ## the simplest possible model
mSmall  <- lm(l10deaths.permil ~ 1, data = DataAna)           
summary(mSmall)

    ## the most complicated model
    ## = all two-way interactions except those with BCG
mBig    <- lm(formula(paste("l10deaths.permil ~ (", paste(xVar3wthBCG, collapse = "+"), ")^2 + BCG")), data = DataAna)
summary(mBig)
      ### -> overfitted but we do not care now, mBig is just description of the most complicated model

    ## initial model (everything additively)
mInit    <- lm(formula(paste("l10deaths.permil ~ ", paste(xVar3, collapse = "+"))), data = DataAna)
summary(mInit)
         ## R2 = 0.65, not that bad

stp  <- step(mInit, scope = list(lower = formula(mSmall), upper = formula(mBig)), direction = "both")
summary(stp)
         ## = final model

    ## But forget about it...
    ## With larger datasets, it usually leads to too complicated models (having a low external validity),
    ## with small datasets, it usually gives too simple models.


##### Non-automatic model building by a sequence of F-tests
##### ======================================================
dim(DataAna)
print(xVar3)

    ### Here, it is not possible to start with all two-way interactions (see above).
    ### Hence first, we will try to select the most important covariates by starting
    ### with the additive model.
    ### --------------------------------------------------------------------------------
m1 <- lm(formula(paste("l10deaths.permil ~", paste(xVar3, collapse = "+"))), data = DataAna)
summary(m1)             ### not that interesting now, only note R2 = 0.653
mffSM::plotLM(m1)       ### just to quickly confirm that the starting model is not completely wrong
car::vif(m1)

    ### What could be removed?
(D1  <- drop1(m1, test = "F"))
(Drop11  <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.8])
(Drop12  <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.7])   ## = Drop11
(Drop13  <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.6])
(Drop14  <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.5])
(Drop15  <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.4])
(Drop16  <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.3])   ## = Drop15

     ### Let's try to remove all terms being not significant on alpha = 0.80, 0.70, ...
m11  <- update(m1, paste(". ~ . -", paste(Drop11, collapse = "-")))
summary(m11)[["r.squared"]]
anova(m11, m1, test = "F")

#m12  <- update(m1, paste(". ~ . -", paste(Drop12, collapse = "-")))
#summary(m12)[["r.squared"]]
#anova(m12, m1, test = "F")

m13  <- update(m1, paste(". ~ . -", paste(Drop13, collapse = "-")))
summary(m13)[["r.squared"]]
anova(m13, m1, test = "F")

m14  <- update(m1, paste(". ~ . -", paste(Drop14, collapse = "-")))
summary(m14)[["r.squared"]]
anova(m14, m1, test = "F")

  ### --------------
  ### Drop15 = Drop16
m16  <- update(m1, paste(". ~ . -", paste(Drop16, collapse = "-")))
summary(m16)[["r.squared"]]
anova(m16, m1, test = "F")      ## still very high 0.996

drop1(m16, test = "F")
car::vif(m16)                   ## now almost perfect

  ### Try to add terms back to m16
  ### -------------------------------------
print(Drop16)
add1(m16, scope = m1, test = "F")    ## smallest P = 0.300 for MortDis
                                     ## next smallest P = 0.484 for LifeExpect

  ### Put MortDis back
  ### -------------------------------------
m2  <- update(m16, ". ~ . + MortDis")
anova(m2, m1, test = "F")        ## 0.9997

add1(m2, scope = m1, test = "F")    ## smallest P = 0.631 for HT
drop1(m2, test = "F")               ## all P-values <0.32
car::vif(m2)                        ## still almost perfect

summary(m2)             ### not that interesting now, only note R2 = 0.651 (negligible difference as compared to model m1)
mffSM::plotLM(m2)       ### just to quickly confirm that the also this model is not completely wrong


  ### Time to add interactions (not with BCG)
  ### --------------------------------------
xVarAddit  <- c("Popul80", "PopulMid", "Female", "Urban", "MortNeontl", "GDP", "TBC", "Beds", "ImmunMeas", "TempMarch", "MortDis") ## + BCG
m3  <- lm(formula(paste("l10deaths.permil ~ (", paste(xVarAddit, collapse = "+"), ")^2 + BCG")), data = DataAna)

summary(m3)               ## R2 = 0.791
mffSM::plotLM(m3)

anova(m1, m3)
anova(m2, m3)

    ### --------------------------
(D3  <- drop1(m3, test = "F"))

(Drop31  <- attr(D3, "row.names")[-1][D3[["Pr(>F)"]][-1] > 0.4])
m31  <- update(m3, paste(". ~ . -", paste(Drop31, collapse = "-")))
summary(m31)[["r.squared"]]           ## R2 = 0.734
anova(m31, m3, test = "F")            ## 0.995
add1(m31, scope = m3, test = "F")     ## smallest P = 0.079 --> pretty small, too much was removed

(Drop32  <- attr(D3, "row.names")[-1][D3[["Pr(>F)"]][-1] > 0.5])
m32  <- update(m3, paste(". ~ . -", paste(Drop32, collapse = "-")))
summary(m32)[["r.squared"]]           ## R2 = 0.752
anova(m32, m3, test = "F")            ## 0.997
add1(m32, scope = m3, test = "F")     ## smallest P = 0.056 --> pretty small, too much was removed

(Drop33  <- attr(D3, "row.names")[-1][D3[["Pr(>F)"]][-1] > 0.6])
m33  <- update(m3, paste(". ~ . -", paste(Drop33, collapse = "-")))
summary(m33)[["r.squared"]]           ## R2 = 0.777
anova(m33, m3, test = "F")            ## 0.999
add1(m33, scope = m3, test = "F")     ## smallest P = 0.232 --> OK

    ### --------------------------
m4  <- m33

(D4  <- drop1(m4, test = "F"))

(Drop41  <- attr(D4, "row.names")[-1][D4[["Pr(>F)"]][-1] > 0.4])
m41  <- update(m4, paste(". ~ . -", paste(Drop41, collapse = "-")))
summary(m41)[["r.squared"]]           ## R2 = 0.759
anova(m41, m4, test = "F")            ## 0.869
anova(m41, m3)                        ## 0.999
add1(m41, scope = m4, test = "F")     ## smallest P = 0.115 --> pretty small, too much was removed

(Drop42  <- attr(D4, "row.names")[-1][D4[["Pr(>F)"]][-1] > 0.5])
m42  <- update(m4, paste(". ~ . -", paste(Drop42, collapse = "-")))
summary(m42)[["r.squared"]]           ## R2 = 0.774
anova(m42, m4, test = "F")            ## 0.976
anova(m42, m3, test = "F")            ## 1.000
add1(m42, scope = m4, test = "F")     ## smallest P = 0.445 --> OK

    ### --------------------------
m5  <- m42

(D5  <- drop1(m5, test = "F"))

(Drop51  <- attr(D5, "row.names")[-1][D5[["Pr(>F)"]][-1] > 0.4])
m51  <- update(m5, paste(". ~ . -", paste(Drop51, collapse = "-")))
summary(m51)[["r.squared"]]           ## R2 = 0.766
anova(m51, m5, test = "F")            ## 0.506
anova(m51, m3, test = "F")            ## 0.999
add1(m51, scope = m5, test = "F")     ## smallest P = 0.170 --> OK

    ### --------------------------
m6  <- m51

(D6  <- drop1(m6, test = "F"))

(Drop61  <- attr(D6, "row.names")[-1][D6[["Pr(>F)"]][-1] > 0.3])
m61  <- update(m6, paste(". ~ . -", paste(Drop61, collapse = "-")))
summary(m61)[["r.squared"]]           ## R2 = 0.763
anova(m61, m6, test = "F")            ## 0.466
anova(m61, m3, test = "F")            ## 0.999
add1(m61, scope = m6, test = "F")     ## smallest P = 0.380

    ### --------------------------
m7  <- m61

(D7  <- drop1(m7, test = "F"))

(Drop71  <- attr(D7, "row.names")[-1][D7[["Pr(>F)"]][-1] > 0.4])
m71  <- update(m7, paste(". ~ . -", paste(Drop71, collapse = "-")))
summary(m71)[["r.squared"]]           ## R2 = 0.761
anova(m71, m7, test = "F")            ## 0.685
anova(m71, m3, test = "F")            ## 0.999
add1(m71, scope = m7, test = "F")     ## smallest P = 0.602

    ### --------------------------
m8  <- m71

(D8  <- drop1(m8, test = "F"))

(Drop81  <- attr(D8, "row.names")[-1][D8[["Pr(>F)"]][-1] > 0.15])
m81  <- update(m8, paste(". ~ . -", paste(Drop81, collapse = "-")))
summary(m81)[["r.squared"]]           ## R2 = 0.752
anova(m81, m8, test = "F")            ## 0.279
anova(m81, m3, test = "F")            ## 0.999
add1(m81, scope = m8, test = "F")     ## smallest P = 0.226

    ### --------------------------
m9  <- m81

(D9  <- drop1(m9, test = "F"))

(Drop91  <- attr(D9, "row.names")[-1][D9[["Pr(>F)"]][-1] > 0.15])
m91  <- update(m9, paste(". ~ . -", paste(Drop91, collapse = "-")))
summary(m91)[["r.squared"]]           ## R2 = 0.746
anova(m91, m9, test = "F")            ## 0.431
anova(m91, m3, test = "F")            ## 0.999
add1(m91, scope = m9, test = "F")     ## smallest P = 0.312

    ### --------------------------
m10  <- m91

(D10  <- drop1(m10, test = "F"))

(Drop101  <- attr(D10, "row.names")[-1][D10[["Pr(>F)"]][-1] > 0.15])
m101  <- update(m10, paste(". ~ . -", paste(Drop101, collapse = "-")))
summary(m101)[["r.squared"]]           ## R2 = 0.741
anova(m101, m10, test = "F")           ## 0.162
anova(m101, m3, test = "F")            ## 0.999
add1(m101, scope = m10, test = "F")     ## smallest P = 0.162 (now OK)

    ### --------------------------
m11  <- m101

(D11  <- drop1(m11, test = "F"))
                    ## everything significant on roughly 10%


    ### Final model for the moment being
    ### --------------------------------------------------------------------------------
mFin  <- m11

summary(mFin)              ## R2 = 0.741
drop1(mFin, test = "F")

(xFin  <- c(xVarAddit, "BCG"))               ### included in the final model
(xNotFin  <- xVar3[!(xVar3 %in% xFin)])      ### not included in the final model
(xInter  <- c("PopulMid:Urban", "PopulMid:MortDis", "Female:Urban", "Female:TempMarch", "Urban:MortNeontl", "Urban:TBC", "Urban:MortDis",
     "MortNeontl:TBC", "MortNeontl:MortDis", "GDP:TBC", "GDP:MortDis", "Beds:ImmunMeas", "Beds:MortDis"))


    ### mFin with added all original covariates
mFin0  <- lm(formula(paste("l10deaths.permil ~ ", paste(c(xFin, xInter, xNotFin), collapse = "+"))), data = DataAna)
summary(mFin0)[["r.squared"]]     ## R2 = 0.748
#
anova(mFin, mFin0, test = "F")           ## 0.939
add1(mFin, scope = mFin0, test = "F")    ## all original not included covariates not significant,
                                         ## smallest P-value = 0.262 (Diabet)

    ### mFin with added all originally included interactions
mFinAllInt  <- lm(formula(paste("l10deaths.permil ~ (", paste(xVarAddit, collapse = "+"), ")^2 + BCG")), data = DataAna)
summary(mFinAllInt)[["r.squared"]]     ## R2 = 0.791
anova(mFin, mFinAllInt, test = "F")           ## 0.999
add1(mFin, scope = mFinAllInt, test = "F")    ## all not included interactions not significant,
                                              ## smallest P-value = 0.1155 (Popul80:MortNeontl, could perhaps be returned back to the model),
                                              ## the second smallest P-value = 0.1618 (ImmunMeas:TempMarch),
                                              ## the third smallest P-value = 0.1899 (TBC:TempMarch),
                                              ## all other are >0.20


    ### Basic regression diagnostics, more should be done!
    ### --------------------------------------------------------------------------------
mffSM::plotLM(mFin)

shapiro.test(residuals(mFin))
lmtest::bptest(mFin)


    ### Any influential observations?
    ### ------------------------------------
par(mfrow = c(2, 2))
plot(mFin, which = c(1, 4:6))

summary(influence.measures(mFin))
     ### For prediction, DFFITS and Cook distance are the most important
     ### --> no country is influential
     ### Also for all DFBETAS, no country is influential


   ### Comparison of observed mortalities and CROSS-VALIDATED predictions (= \hat{Y}_{[t]}),
   ### Section 11.4.2: \hat{Y}_{[t]} = \hat{Y}_t - U_t*(h_{t,t} / m_{t,t})
   ### ---------------------------------------------------------------------------------------
predCV  <- fitted(mFin) - residuals(mFin) * (hatvalues(mFin) / (1 - hatvalues(mFin)))

sqrt(sum((DataAna[, "l10deaths.permil"] - fitted(mFin))^2)/nrow(DataAna))     ## sqrt(AMSEP)
sqrt(sum((DataAna[, "l10deaths.permil"] - predCV)^2)/nrow(DataAna))           ## sqrt(CV-AMSEP)


    ### Observed vs. predicted (cross-validated) mortalities
    ### ------------------------------------------------------
XLAB  <- "Observed mortality [#deaths/mil. people]"
YLAB0  <- "Predicted mortality [#deaths/mil. people]"
YLAB  <- "Predicted cross-valid. mortality [#deaths/mil. people]"
YVAL  <- c(0, 10, 100, 500)
YAT  <- log10(1 + YVAL)
SHOW  <-  c("IT", "SE", "UK", "ES", "NL", "BE", "FR", "DE", "IR", "CN", "KR", "SG", "JP", "US", "CZ", "SK", "PL", "AT")
indSHOW  <- (1:nrow(DataAna))[DataAna[, "geoId"] %in% SHOW]
#
par(mfrow = c(1, 2), mar = c(4, 4, 1, 1) + 0.1)
#
plot(DataAna[, "l10deaths.permil"], fitted(mFin), pch = 23, col = "grey80", bg = "palegoldenrod", xlab = XLAB, ylab = YLAB0, xaxt = "n", yaxt = "n", main = "Fitted vs. Observed", xlim = c(-0.3, 3), ylim = c(-0.3, 3))
abline(a = 0, b = 1, col = "red3", lwd = 1)
axis(1, at = YAT, labels = YVAL)
axis(2, at = YAT, labels = YVAL)
text(DataAna[, "l10deaths.permil"], fitted(mFin), labels = DataAna[, "geoId"])
text(DataAna[indSHOW, "l10deaths.permil"], fitted(mFin)[indSHOW], labels = DataAna[indSHOW, "geoId"], font = 2, col = "red2")  
#
plot(DataAna[, "l10deaths.permil"], predCV, pch = 23, col = "grey80", bg = "palegoldenrod", xlab = XLAB, ylab = YLAB, xaxt = "n", yaxt = "n", main = "CV Predictions vs. Observed", xlim = c(-0.3, 3), ylim = c(-0.3, 3))
abline(a = 0, b = 1, col = "red3", lwd = 1)
axis(1, at = YAT, labels = YVAL)
axis(2, at = YAT, labels = YVAL)
text(DataAna[, "l10deaths.permil"], predCV, labels = DataAna[, "geoId"])
text(DataAna[indSHOW, "l10deaths.permil"], predCV[indSHOW], labels = DataAna[indSHOW, "geoId"], font = 2, col = "red2")


    ### Cross-validated 95% prediction intervals added to the plot
    ### -----------------------------------------------------------
pp  <- numeric(nrow(DataAna))
ci  <- matrix(NA, ncol = 2, nrow = nrow(DataAna))
colnames(ci)  <- c("lwr", "upr")
for (i in 1:nrow(DataAna)){
  fiti  <- lm(formula(mFin), data = DataAna[-i,])   
  pp[i]  <- predict(fiti, newdata = DataAna[i,])
  ci[i,]  <- predict(fiti, newdata = DataAna[i,], interval = "prediction")[1, c("lwr", "upr")]
}      
names(pp) <- DataAna[, "geoId"]
rownames(ci) <- DataAna[, "geoId"]

RES  <- 1000
#jpeg(paste(ROOT, "/plotCOVID.jpg", sep = ""), width = 8*RES, height = 6.5*RES, res = RES)
par(mfrow = c(1, 1), mar = c(4, 4, 1, 1) + 0.1)
plot(DataAna[, "l10deaths.permil"], predCV, pch = 23, col = "grey80", bg = "palegoldenrod", xlab = XLAB, ylab = YLAB, xaxt = "n", yaxt = "n", xlim = c(-0.3, 3.2), ylim = c(-0.3, 3.2))
segments(x0 = ci[, "lwr"], x1 = ci[, "upr"], y0 = predCV, lwd = 3, col = "palegoldenrod")          
abline(a = 0, b = 1, col = "red3", lwd = 1)
axis(1, at = YAT, labels = YVAL)
axis(2, at = YAT, labels = YVAL)
text(DataAna[, "l10deaths.permil"], predCV, labels = DataAna[, "geoId"])
text(DataAna[indSHOW, "l10deaths.permil"], predCV[indSHOW], labels = DataAna[indSHOW, "geoId"], font = 2, col = "red2")  
text(-0.2, 3, labels = "COVID-19", pos = 4, font = 2, cex = 2.5, col = "red3")
text(-0.2, 2.75, labels = "by 28/05/2020", pos = 4, font = 3, cex = 1.5, col = "red3")
#dev.off()


#####
#####  Given the country specific risk factors,
#####    (1) Was Ing. AB right when he called in May/June the Czech Republic as "best in COVID"?
#####    (2) How did Prof. LD predict (around April/May) tens of thousands of deaths and mass graves?
#####    (3) Was it really justifiable to call in May/June Sweden or Italy as countries that
#####        did not perform well in terms of managing the COVID-19 pandemic?
#####

    ### CV prediction interval for the absolute number of deaths in the Czech Republic (by the end of May)
10^ci["CZ",] * 10

    ### Which risk factors differ between CZ, SE and IT?
subset(DataAna, select = c("geoId", "deaths.permil", "l10deaths.permil", xFin))[c("CZ", "SE", "IT"),]
coef(mFin)