### ============================================================================================================
### PURPOSE: Example of model building towards a linear model
###          in a situation in which the main task is to determine
###          which factors (out of a larger group of possible explanatory variables)
###          are associated the most with the outcome.
###          More or less the same strategy would be recommended if
###          a prediction model is the primary aim.
###
### The script below has been motivated by the analysis whose results are presented
### in the manuscript:
###
###    Ondřej Hradský and Arnošt Komárek (2020).
###    Demographic and public health characteristics explain large part
###    of variability in COVID-19 mortality across countries.
###    To appear in: European Journal of Public Health.
###
### The final model found here is a bit different from that presented in the paper
### because, for the paper, the model was built using data that include Singapore.
### Here, Singapore is removed from the data before the model building stage for reasons
### mentioned below. Nevertheless, the main conclusions based on the model found here
### do not differ considerably from those presented in the paper.
###
### OUTCOME VARIABLE: Reported mortality (number of deaths per million inhabitants) on/with COVID-19
###                   per country as of 28 May 2020 (~ after the "first" COVID-19 wave).
###
### EXPLANATORY VARIABLES: various demographic, geographic and public health characteristics of countries
###                        - see an extra document for their list as well as for the list of analyzed countries
###
### PROGRAMMER: Arnošt Komárek
### LOG: created on 22/11/2020 for the purpose of the Linear Regression lectures at MFF UK
###
### ============================================================================================================

rm(list = ls())

ROOT <- "/home/komarek/teach/mff_2020/nmsa407_LinRegr/RkoModelBuild/"
setwd(ROOT)


##### Load a "clean" dataset (already pre-processed)
##### ================================================================
(load(paste(ROOT, "/Data/COVID4mff.RData", sep = "")))

dim(Data)
head(Data)
summary(Data)

### Numeric (or 0/1) explanatory variables
xVar <- c("Male80", "Female80", "Popul80", "Popul65", "PopulMid", "PopulData", "PopulDens",
          "Female", "Urban", "LifeExpect", "MortNeontl", "MortDis",
          "CauseInjry", "CauseNonCm", "CauseCommn",
          "GDP", "TBC", "Diabet", "Obesity", "HIV", "Smoking",
          "Physicns", "Beds", "ImmunMeas", "TempMarch",
          "HT.men", "HT.women", "HT", "BCG", "Time")

### Categorical explanatory variables
zVar <- c("fBCG", "Haplo")       ### fBCG is a factor that coincides with the 0/1 'numeric' variable BCG

### Outcome variables
yVar <- c("deaths.permil", "l10deaths.permil")       ### l10deaths.permil = log10(1 + deaths.permil)


##### Very basic exploration (much more is needed to get insight into the data).
##### Here, only steps related to model building are shown.
##### =====================================================================

### For several reasons, it is useful if the numeric explanatory variables have a similar scale
### (std. deviations of a similar magnitude).
### ---------------------------------------------------------------------------------------------
sapply(subset(Data, select = xVar), sd, na.rm = TRUE)
### Seems to be OK here -> well, this problem has already been treated in the "cleaning" stage...
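### Sketch (illustration only, not needed here): if some covariate had a much larger
### standard deviation than the others, it could be put on a comparable scale,
### e.g. by standardizing it (this changes the interpretation of its coefficient);
### the choice of GDP below is purely hypothetical and the line is left un-run:
# Data[, "GDP"] <- as.numeric(scale(Data[, "GDP"]))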
### How many missing values do we have on each variable?
### ---------------------------------------------------------------------------------------------
sapply(lapply(subset(Data, select = c(yVar, xVar, zVar)), is.na), sum)

### Highest counts of missing values out of 210 (= sample size):
###   Haplo:   155
###   Obesity: 102
###   HIV:      72
###   Smoking:  70
###   ----------------------
###   BCG:      50
###
### Haplo, Obesity, HIV, Smoking are clearly problematic (1/3 or more missing).
### Even though they might be associated with the outcome, it is not possible
### to evaluate their effect using this dataset. In the following,
### Haplo, Obesity, HIV, Smoking are no longer considered as explanatory variables.
(xVar <- xVar[!(xVar %in% c("Haplo", "Obesity", "HIV", "Smoking"))])
zVar <- "fBCG"

### How many countries (do not) have all variables of interest available?
### -----------------------------------------------------------------------
(nMISS <- apply(as.data.frame(lapply(subset(Data, select = c(yVar, xVar, zVar)), is.na)), 1, sum))
sum(nMISS == 0)      ## 139
sum(nMISS > 0)       ## 71

### For model building, we need all considered variables available for all countries.
### Which countries must be dropped from the data?
table(nMISS)
subset(Data, nMISS > 0, select = c("geoId", "CntrName"))
### If those countries are not included, not that much is lost.
### Most of them are either very small or have somewhat questionable COVID reporting.

### Dataset for the analysis
### ---------------------------------
DataAna <- subset(Data, nMISS == 0)
dim(DataAna)
apply(as.data.frame(lapply(subset(DataAna, select = c(yVar, xVar, zVar)), is.na)), 1, sum)   ### just to check...


##### Marginal associations of covariates with the outcome
#####   --> more as exploration
##### =================================================

### Categorical covariate
par(mfrow = c(1, 2))
plot(deaths.permil ~ fBCG, data = DataAna, col = "chocolate1")
plot(l10deaths.permil ~ fBCG, data = DataAna, col = "chocolate1")

### For the following, remember that we have only one categorical variable, fBCG,
### which is also included among the numeric variables as the 0/1 variable BCG.

### Numeric covariates
###   - here only against log10(deaths.permil), which later appears
###     to be more suitable as the outcome than deaths.permil itself
length(xVar)

par(mfrow = c(3, 3))
for (i in 1:9){
  FORM <- formula(paste("l10deaths.permil ~", xVar[i]))
  plot(FORM, data = DataAna, pch = 23, col = "red3", bg = "gold")
  abline(lm(FORM, data = DataAna), col = "red2", lwd = 1)
}
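### Sketch (exploration only, illustration): a compact numeric complement to these
### scatterplots (those for the remaining covariates follow below), namely the correlation
### of each numeric covariate with the log10 outcome, sorted by absolute value:
marCor <- sapply(subset(DataAna, select = xVar), cor, y = DataAna[, "l10deaths.permil"])
round(marCor[order(abs(marCor), decreasing = TRUE)], 2)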
### Leverages induced by just one variable?
subset(DataAna, PopulDens > 20)     ## Singapore
subset(DataAna, PopulData > 4)      ## China, India
subset(DataAna, Female < 45)        ## Equatorial_Guinea, Kuwait, Oman, Qatar, Saudi Arabia, United Arab Emirates

### +++++++++++++++++++++++++++++++
par(mfrow = c(3, 3))
for (i in 10:18){
  FORM <- formula(paste("l10deaths.permil ~", xVar[i]))
  plot(FORM, data = DataAna, pch = 23, col = "red3", bg = "gold")
  abline(lm(FORM, data = DataAna), col = "red2", lwd = 1)
}

par(mfrow = c(3, 3))
for (i in 19:27){
  FORM <- formula(paste("l10deaths.permil ~", xVar[i]))
  plot(FORM, data = DataAna, pch = 23, col = "red3", bg = "gold")
  abline(lm(FORM, data = DataAna), col = "red2", lwd = 1)
}


##### Leverages
##### =================================================
m0 <- lm(formula(paste("l10deaths.permil ~", paste(xVar, collapse = "+"))), data = DataAna)
hh <- hatvalues(m0)
hh[order(hh)]
## --> really extreme for Singapore (h = 0.96)
## --> from the previous exploration, we already know why:
##     its population density is quite extreme compared to the rest of the data
popDens <- DataAna[, "PopulDens"]
names(popDens) <- DataAna[, "geoId"]
popDens[order(popDens)]

## To avoid a possibly dangerous influence of Singapore on the results,
## we remove it from the analysis dataset. Here, an explanation
## might be that we want to restrict our attention to countries
## with a "reasonable" population density (of at most 13*100 people/km2).
DataAna <- subset(DataAna, geoId != "SG")
dim(DataAna)

## How about the leverages now?
## (the centroid of the covariate values has changed by the removal of Singapore)
m1 <- lm(formula(paste("l10deaths.permil ~", paste(xVar, collapse = "+"))), data = DataAna)
hh1 <- hatvalues(m1)
hh1[order(hh1)]      ### pretty OK


##### Multicollinearity
##### =================================================
car::vif(m1)
summary(m1)

### Hmmm, we forgot that Popul80 = 0.5*(Male80 + Female80)
###                  and HT      = 0.5*(HT.men + HT.women)
xVar2 <- xVar[!(xVar %in% c("Popul80", "HT"))]
m2 <- lm(formula(paste("l10deaths.permil ~", paste(xVar2, collapse = "+"))), data = DataAna)
car::vif(m2)
summary(car::vif(m2))
### We seem to have BIG problems (as expected if we think about the variables).

### Pairwise associations to reveal the most profound problems
### -------------------------------------------------------------
psych::pairs.panels(subset(DataAna, select = xVar2),
                    bg = "palegoldenrod", col = "red3", pch = 21,
                    ellipses = FALSE, smooth = FALSE, lm = TRUE,
                    hist.col = "lightblue", rug = FALSE)
### a bit too much...
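### Sketch (illustration of what the VIF measures, using model m2 from above): the VIF of a
### covariate is 1 / (1 - R^2) from regressing that covariate on all remaining covariates,
### e.g. for LifeExpect:
auxR2 <- summary(lm(formula(paste("LifeExpect ~", paste(xVar2[xVar2 != "LifeExpect"], collapse = "+"))),
                    data = DataAna))[["r.squared"]]
1 / (1 - auxR2)      ## should reproduce car::vif(m2)["LifeExpect"]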
RR <- cor(subset(DataAna, select = xVar2))
print(RR)

highR1 <- data.frame(x = character(0), y = character(0), r = numeric(0))
highR2 <- data.frame(x = character(0), y = character(0), r = numeric(0))
highR3 <- data.frame(x = character(0), y = character(0), r = numeric(0))
for (j in 1:(ncol(RR) - 1)){
  for (i in (j+1):nrow(RR)){
    if (abs(RR[i, j]) > 0.9)
      highR1 <- rbind(highR1, data.frame(x = rownames(RR)[i], y = colnames(RR)[j], r = RR[i, j]))
    if (abs(RR[i, j]) <= 0.9 & abs(RR[i, j]) > 0.8)
      highR2 <- rbind(highR2, data.frame(x = rownames(RR)[i], y = colnames(RR)[j], r = RR[i, j]))
    if (abs(RR[i, j]) <= 0.8 & abs(RR[i, j]) > 0.7)
      highR3 <- rbind(highR3, data.frame(x = rownames(RR)[i], y = colnames(RR)[j], r = RR[i, j]))
  }
}
print(highR1)
print(highR2)
print(highR3)

par(mfrow = c(2, 2))
for (i in 1:nrow(highR1)){
  plot(DataAna[, highR1[i, "x"]], DataAna[, highR1[i, "y"]],
       xlab = highR1[i, "x"], ylab = highR1[i, "y"],
       pch = 23, col = "red3", bg = "gold")
  title(main = paste("r = ", format(round(highR1[i, "r"], 2), nsmall = 2)))
}

### Actions:
###   (Male80, Female80, Popul65) --> replaced by Popul80 = 0.5*(Male80 + Female80),
###   which represents the age structure of the population.
###
###   cor(CauseCommn, CauseNonCm) = -0.98;
###   at the same time, both CauseCommn and CauseNonCm are highly correlated with MortNeontl and LifeExpect
###     -> drop both CauseCommn and CauseNonCm (they have a somewhat unclear interpretation anyway)
###     -> keep MortNeontl and LifeExpect even though they are also quite correlated (-0.867)
###
###   (HT.women, HT.men) --> replaced by HT = 0.5*(HT.women + HT.men)
###
###   Do not worry about cor(Physicns, Female80) = 0.81 now, nor about the other correlations below 0.80.
xVar3 <- c("Popul80", "PopulMid", "PopulData", "PopulDens", "Female", "Urban",
           "LifeExpect", "MortNeontl", "MortDis", "CauseInjry",
           "GDP", "TBC", "Diabet", "Physicns", "Beds", "ImmunMeas",
           "TempMarch", "HT", "BCG", "Time")
m3 <- lm(formula(paste("l10deaths.permil ~", paste(xVar3, collapse = "+"))), data = DataAna)
car::vif(m3)
### Pretty reasonable VIFs.
### Perhaps either LifeExpect or MortNeontl could still be dropped to decrease the VIF,
### but let the subsequent model building steps decide. In any case, LifeExpect and MortNeontl
### should not both appear in a final model.


##### Transformation of the response?
##### =================================================

### Initial models that include all considered covariates (additively).
### For the purpose of deciding on a possible response transformation,
### a "safely correct" regression space is useful; possible multicollinearity
### is not an issue (fitted values are not influenced by it).
m4 <- lm(formula(paste("deaths.permil ~", paste(xVar3, collapse = "+"))), data = DataAna)
m5 <- lm(formula(paste("l10deaths.permil ~", paste(xVar3, collapse = "+"))), data = DataAna)

mffSM::plotLM(m4)    ### Not really nice...
mffSM::plotLM(m5)    ### It could hardly be nicer!

### l10deaths.permil = log10(1 + deaths.permil) will be used as the outcome.
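### Sketch (supporting check, not part of the original analysis): a Box-Cox profile for the
### shifted response 1 + deaths.permil can support the choice of a log-type transformation;
### the shift by 1 is only needed because deaths.permil may be zero.
mBC <- lm(formula(paste("I(1 + deaths.permil) ~", paste(xVar3, collapse = "+"))), data = DataAna)
MASS::boxcox(mBC, lambda = seq(-1, 1, by = 0.05))
### a log-likelihood maximized near lambda = 0 would be consistent with using
### log10(1 + deaths.permil) as the outcome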
##### Short sidenote
##### ==================
### At some point, we will also consider two-way interactions,
### but not those with BCG. The reason is that BCG is in fact categorical
### and one of its categories is pretty small:
table(DataAna[, "BCG"])
xVar3wthBCG <- xVar3[xVar3 != "BCG"]


##### Model building by an automatic stepwise selection
##### =================================================
help("step", package = "stats")

## the simplest possible model
mSmall <- lm(l10deaths.permil ~ 1, data = DataAna)
summary(mSmall)

## the most complicated model
## = all two-way interactions except those with BCG
mBig <- lm(formula(paste("l10deaths.permil ~ (", paste(xVar3wthBCG, collapse = "+"), ")^2 + BCG")), data = DataAna)
summary(mBig)
### -> overfitted, but we do not care now; mBig is just a description of the most complicated model

## initial model (everything additively)
mInit <- lm(formula(paste("l10deaths.permil ~ ", paste(xVar3, collapse = "+"))), data = DataAna)
summary(mInit)       ## R2 = 0.65, not that bad

stp <- step(mInit, scope = list(lower = formula(mSmall), upper = formula(mBig)), direction = "both")
summary(stp)         ## = final model

## But forget about it...
## With larger datasets, automatic stepwise selection usually leads to overly complicated models
## (having a low external validity); with small datasets, it usually gives overly simple models.


##### Non-automatic model building by a sequence of F-tests
##### ======================================================
dim(DataAna)
print(xVar3)

### Here, it is not possible to start with all two-way interactions (see above).
### Hence, we first try to select the most important covariates by starting
### with the additive model.
### --------------------------------------------------------------------------------
m1 <- lm(formula(paste("l10deaths.permil ~", paste(xVar3, collapse = "+"))), data = DataAna)
summary(m1)          ### not that interesting now, only note R2 = 0.653
mffSM::plotLM(m1)    ### just to quickly confirm that the starting model is not completely wrong
car::vif(m1)

### What could be removed?
(D1 <- drop1(m1, test = "F"))
(Drop11 <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.8])
(Drop12 <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.7])     ## = Drop11
(Drop13 <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.6])
(Drop14 <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.5])
(Drop15 <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.4])
(Drop16 <- attr(D1, "row.names")[-1][D1[["Pr(>F)"]][-1] > 0.3])     ## = Drop15

### Let's try to remove all terms that are not significant at alpha = 0.80, 0.70, ...
m11 <- update(m1, paste(". ~ . -", paste(Drop11, collapse = "-")))
summary(m11)[["r.squared"]]
anova(m11, m1, test = "F")

#m12 <- update(m1, paste(". ~ . -", paste(Drop12, collapse = "-")))
#summary(m12)[["r.squared"]]
#anova(m12, m1, test = "F")

m13 <- update(m1, paste(". ~ . -", paste(Drop13, collapse = "-")))
summary(m13)[["r.squared"]]
anova(m13, m1, test = "F")

m14 <- update(m1, paste(". ~ . -", paste(Drop14, collapse = "-")))
summary(m14)[["r.squared"]]
anova(m14, m1, test = "F")

### --------------
### Drop15 = Drop16
m16 <- update(m1, paste(". ~ . -", paste(Drop16, collapse = "-")))
summary(m16)[["r.squared"]]
anova(m16, m1, test = "F")     ## still very high 0.996
drop1(m16, test = "F")
car::vif(m16)                  ## now almost perfect

### Try to add terms back to m16
### -------------------------------------
print(Drop16)
add1(m16, scope = m1, test = "F")
## smallest P      = 0.300 for MortDis
## next smallest P = 0.484 for LifeExpect
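### Sketch (illustration): the partial F-test reported by drop1() (and add1()) for a single
### term is the same test as comparing the corresponding nested models with anova();
### e.g., for the Diabet row of drop1(m1, test = "F"):
anova(update(m1, . ~ . - Diabet), m1, test = "F")     ## should show the same F statistic and P-value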
+ MortDis") anova(m2, m1, test = "F") ## 0.9997 add1(m2, scope = m1, test = "F") ## smallest P = 0.631 for HT drop1(m2, test = "F") ## all P-values <0.32 car::vif(m2) ## still almost perfect summary(m2) ### not that interesting now, only note R2 = 0.651 (negligible difference as compared to model m1) mffSM::plotLM(m2) ### just to quickly confirm that the also this model is not completely wrong ### Time to add interactions (not with BCG) ### -------------------------------------- xVarAddit <- c("Popul80", "PopulMid", "Female", "Urban", "MortNeontl", "GDP", "TBC", "Beds", "ImmunMeas", "TempMarch", "MortDis") ## + BCG m3 <- lm(formula(paste("l10deaths.permil ~ (", paste(xVarAddit, collapse = "+"), ")^2 + BCG")), data = DataAna) summary(m3) ## R2 = 0.791 mffSM::plotLM(m3) anova(m1, m3) anova(m2, m3) ### -------------------------- (D3 <- drop1(m3, test = "F")) (Drop31 <- attr(D3, "row.names")[-1][D3[["Pr(>F)"]][-1] > 0.4]) m31 <- update(m3, paste(". ~ . -", paste(Drop31, collapse = "-"))) summary(m31)[["r.squared"]] ## R2 = 0.734 anova(m31, m3, test = "F") ## 0.995 add1(m31, scope = m3, test = "F") ## smallest P = 0.079 --> pretty small, too much was removed (Drop32 <- attr(D3, "row.names")[-1][D3[["Pr(>F)"]][-1] > 0.5]) m32 <- update(m3, paste(". ~ . -", paste(Drop32, collapse = "-"))) summary(m32)[["r.squared"]] ## R2 = 0.752 anova(m32, m3, test = "F") ## 0.997 add1(m32, scope = m3, test = "F") ## smallest P = 0.056 --> pretty small, too much was removed (Drop33 <- attr(D3, "row.names")[-1][D3[["Pr(>F)"]][-1] > 0.6]) m33 <- update(m3, paste(". ~ . -", paste(Drop33, collapse = "-"))) summary(m33)[["r.squared"]] ## R2 = 0.777 anova(m33, m3, test = "F") ## 0.999 add1(m33, scope = m3, test = "F") ## smallest P = 0.232 --> OK ### -------------------------- m4 <- m33 (D4 <- drop1(m4, test = "F")) (Drop41 <- attr(D4, "row.names")[-1][D4[["Pr(>F)"]][-1] > 0.4]) m41 <- update(m4, paste(". ~ . -", paste(Drop41, collapse = "-"))) summary(m41)[["r.squared"]] ## R2 = 0.759 anova(m41, m4, test = "F") ## 0.869 anova(m41, m3) ## 0.999 add1(m41, scope = m4, test = "F") ## smallest P = 0.115 --> pretty small, too much was removed (Drop42 <- attr(D4, "row.names")[-1][D4[["Pr(>F)"]][-1] > 0.5]) m42 <- update(m4, paste(". ~ . -", paste(Drop42, collapse = "-"))) summary(m42)[["r.squared"]] ## R2 = 0.774 anova(m42, m4, test = "F") ## 0.976 anova(m42, m3, test = "F") ## 1.000 add1(m42, scope = m4, test = "F") ## smallest P = 0.445 --> OK ### -------------------------- m5 <- m42 (D5 <- drop1(m5, test = "F")) (Drop51 <- attr(D5, "row.names")[-1][D5[["Pr(>F)"]][-1] > 0.4]) m51 <- update(m5, paste(". ~ . -", paste(Drop51, collapse = "-"))) summary(m51)[["r.squared"]] ## R2 = 0.766 anova(m51, m5, test = "F") ## 0.506 anova(m51, m3, test = "F") ## 0.999 add1(m51, scope = m5, test = "F") ## smallest P = 0.170 --> OK ### -------------------------- m6 <- m51 (D6 <- drop1(m6, test = "F")) (Drop61 <- attr(D6, "row.names")[-1][D6[["Pr(>F)"]][-1] > 0.3]) m61 <- update(m6, paste(". ~ . -", paste(Drop61, collapse = "-"))) summary(m61)[["r.squared"]] ## R2 = 0.763 anova(m61, m6, test = "F") ## 0.466 anova(m61, m3, test = "F") ## 0.999 add1(m61, scope = m6, test = "F") ## smallest P = 0.380 ### -------------------------- m7 <- m61 (D7 <- drop1(m7, test = "F")) (Drop71 <- attr(D7, "row.names")[-1][D7[["Pr(>F)"]][-1] > 0.4]) m71 <- update(m7, paste(". ~ . 
-", paste(Drop71, collapse = "-"))) summary(m71)[["r.squared"]] ## R2 = 0.761 anova(m71, m7, test = "F") ## 0.685 anova(m71, m3, test = "F") ## 0.999 add1(m71, scope = m7, test = "F") ## smallest P = 0.602 ### -------------------------- m8 <- m71 (D8 <- drop1(m8, test = "F")) (Drop81 <- attr(D8, "row.names")[-1][D8[["Pr(>F)"]][-1] > 0.15]) m81 <- update(m8, paste(". ~ . -", paste(Drop81, collapse = "-"))) summary(m81)[["r.squared"]] ## R2 = 0.752 anova(m81, m8, test = "F") ## 0.279 anova(m81, m3, test = "F") ## 0.999 add1(m81, scope = m8, test = "F") ## smallest P = 0.226 ### -------------------------- m9 <- m81 (D9 <- drop1(m9, test = "F")) (Drop91 <- attr(D9, "row.names")[-1][D9[["Pr(>F)"]][-1] > 0.15]) m91 <- update(m9, paste(". ~ . -", paste(Drop91, collapse = "-"))) summary(m91)[["r.squared"]] ## R2 = 0.746 anova(m91, m9, test = "F") ## 0.431 anova(m91, m3, test = "F") ## 0.999 add1(m91, scope = m9, test = "F") ## smallest P = 0.312 ### -------------------------- m10 <- m91 (D10 <- drop1(m10, test = "F")) (Drop101 <- attr(D10, "row.names")[-1][D10[["Pr(>F)"]][-1] > 0.15]) m101 <- update(m10, paste(". ~ . -", paste(Drop101, collapse = "-"))) summary(m101)[["r.squared"]] ## R2 = 0.741 anova(m101, m10, test = "F") ## 0.162 anova(m101, m3, test = "F") ## 0.999 add1(m101, scope = m10, test = "F") ## smallest P = 0.162 (now OK) ### -------------------------- m11 <- m101 (D11 <- drop1(m11, test = "F")) ## everything significant on roughly 10% ### Final model for the moment being ### -------------------------------------------------------------------------------- mFin <- m11 summary(mFin) ## R2 = 0.741 drop1(mFin, test = "F") (xFin <- c(xVarAddit, "BCG")) ### included in the final model (xNotFin <- xVar3[!(xVar3 %in% xFin)]) ### not included in the final model (xInter <- c("PopulMid:Urban", "PopulMid:MortDis", "Female:Urban", "Female:TempMarch", "Urban:MortNeontl", "Urban:TBC", "Urban:MortDis", "MortNeontl:TBC", "MortNeontl:MortDis", "GDP:TBC", "GDP:MortDis", "Beds:ImmunMeas", "Beds:MortDis")) ### mFin with added all original covariates mFin0 <- lm(formula(paste("l10deaths.permil ~ ", paste(c(xFin, xInter, xNotFin), collapse = "+"))), data = DataAna) summary(mFin0)[["r.squared"]] ## R2 = 0.748 # anova(mFin, mFin0, test = "F") ## 0.939 add1(mFin, scope = mFin0, test = "F") ## all original not included covariates not significant, ## smallest P-value = 0.262 (Diabet) ### mFin with added all originally included interactions mFinAllInt <- lm(formula(paste("l10deaths.permil ~ (", paste(xVarAddit, collapse = "+"), ")^2 + BCG")), data = DataAna) summary(mFinAllInt)[["r.squared"]] ## R2 = 0.791 anova(mFin, mFinAllInt, test = "F") ## 0.999 add1(mFin, scope = mFinAllInt, test = "F") ## all not included interactions not significant, ## smallest P-value = 0.1155 (Popul80:MortNeontl, could perhaps be returned back to the model), ## the second smallest P-value = 0.1618 (ImmunMeas:TempMarch), ## the third smallest P-value = 0.1899 (TBC:TempMarch), ## all other are >0.20 ### Basic regression diagnostics, more should be done! ### -------------------------------------------------------------------------------- mffSM::plotLM(mFin) shapiro.test(residuals(mFin)) lmtest::bptest(mFin) ### Any influential observations? 
### Any influential observations?
### ------------------------------------
par(mfrow = c(2, 2))
plot(mFin, which = c(1, 4:6))

summary(influence.measures(mFin))
### For prediction, DFFITS and Cook's distance are the most important
###   --> no country is influential.
### Also according to all DFBETAS, no country is influential.


### Comparison of observed mortalities and CROSS-VALIDATED predictions (= \hat{Y}_{[t]}),
### Section 11.4.2: \hat{Y}_{[t]} = \hat{Y}_t - U_t * (h_{t,t} / m_{t,t}), where m_{t,t} = 1 - h_{t,t}
### ---------------------------------------------------------------------------------------
predCV <- fitted(mFin) - residuals(mFin) * (hatvalues(mFin) / (1 - hatvalues(mFin)))

sqrt(sum((DataAna[, "l10deaths.permil"] - fitted(mFin))^2) / nrow(DataAna))   ## sqrt(AMSEP)
sqrt(sum((DataAna[, "l10deaths.permil"] - predCV)^2) / nrow(DataAna))         ## sqrt(CV-AMSEP)
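### Sketch (check, illustration only): the closed-form leave-one-out predictions above can be
### verified by explicitly refitting the model without a given country, e.g. the first one:
ppCheck <- predict(lm(formula(mFin), data = DataAna[-1, ]), newdata = DataAna[1, ])
c(explicit = unname(ppCheck), shortcut = unname(predCV[1]))   ## the two values should agree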
Observed", xlim = c(-0.3, 3), ylim = c(-0.3, 3)) abline(a = 0, b = 1, col = "red3", lwd = 1) axis(1, at = YAT, labels = YVAL) axis(2, at = YAT, labels = YVAL) text(DataAna[, "l10deaths.permil"], predCV, labels = DataAna[, "geoId"]) text(DataAna[indSHOW, "l10deaths.permil"], predCV[indSHOW], labels = DataAna[indSHOW, "geoId"], font = 2, col = "red2") ### Cross-validated 95% prediction intervals added to the plot ### ----------------------------------------------------------- pp <- numeric(nrow(DataAna)) ci <- matrix(NA, ncol = 2, nrow = nrow(DataAna)) colnames(ci) <- c("lwr", "upr") for (i in 1:nrow(DataAna)){ fiti <- lm(formula(mFin), data = DataAna[-i,]) pp[i] <- predict(fiti, newdata = DataAna[i,]) ci[i,] <- predict(fiti, newdata = DataAna[i,], interval = "prediction")[1, c("lwr", "upr")] } names(pp) <- DataAna[, "geoId"] rownames(ci) <- DataAna[, "geoId"] RES <- 1000 #jpeg(paste(ROOT, "/plotCOVID.jpg", sep = ""), width = 8*RES, height = 6.5*RES, res = RES) par(mfrow = c(1, 1), mar = c(4, 4, 1, 1) + 0.1) plot(DataAna[, "l10deaths.permil"], predCV, pch = 23, col = "grey80", bg = "palegoldenrod", xlab = XLAB, ylab = YLAB, xaxt = "n", yaxt = "n", xlim = c(-0.3, 3.2), ylim = c(-0.3, 3.2)) segments(x0 = ci[, "lwr"], x1 = ci[, "upr"], y0 = predCV, lwd = 3, col = "palegoldenrod") abline(a = 0, b = 1, col = "red3", lwd = 1) axis(1, at = YAT, labels = YVAL) axis(2, at = YAT, labels = YVAL) text(DataAna[, "l10deaths.permil"], predCV, labels = DataAna[, "geoId"]) text(DataAna[indSHOW, "l10deaths.permil"], predCV[indSHOW], labels = DataAna[indSHOW, "geoId"], font = 2, col = "red2") text(-0.2, 3, labels = "COVID-19", pos = 4, font = 2, cex = 2.5, col = "red3") text(-0.2, 2.75, labels = "by 28/05/2020", pos = 4, font = 3, cex = 1.5, col = "red3") #dev.off() ##### ##### Given the country specific risk factors, ##### (1) Was Ing. AB right when he called in May/June the Czech Republic as "best in COVID"? ##### (2) How did Prof. LD predict (around April/May) tens of thousands of deaths and mass graves? ##### (3) Was it really justifiable to call in May/June Sweden or Italy as countries that ##### did not perform well in terms of managing the COVID-19 pandemic? ##### ### CV prediction interval for the absolute number of deaths in the Czech Republic (by the end of May) 10^ci["CZ",] * 10 ### Which risk factors differ between CZ, SE and IT? subset(DataAna, select = c("geoId", "deaths.permil", "l10deaths.permil", xFin))[c("CZ", "SE", "IT"),] coef(mFin)