#Data library(readr) Life_Expectancy_Data <- read_csv("~/Documents/Life Expectancy Data.csv") library(tidyverse) library(sjPlot) WHO_2010 <- Life_Expectancy_Data %>% filter(Year == 2010) %>% filter(BMI>15) %>% drop_na() #correlation cor(WHO_2010$`Life expectancy`, WHO_2010$BMI, method="pearson") cor(WHO_2010$`Life expectancy`, WHO_2010$BMI, method="spearman") cor(WHO_2010$`Life expectancy`, WHO_2010$BMI, method="kendall") #simple regression WHO_2010 %>% ggplot(aes(y=`Life expectancy`, x=BMI))+ geom_point() WHO_2010 %>% ggplot(aes(y=`Life expectancy`, x=BMI))+ geom_point()+ geom_smooth(method="lm", se=T) mod1 <- lm(`Life expectancy`~`BMI`, data=WHO_2010) summary(mod1) #distribution WHO_2010 %>% ggplot(aes(x=`Life expectancy`))+ geom_histogram(binwidth = 3) library(e1071) #raw values skewness(WHO_2010$`Life expectancy`) #z-scores (The z score tells you how many standard deviations from the mean your score is.) skewness(scale(WHO_2010$`Life expectancy`)) #log transformation skewness(log(WHO_2010$`Life expectancy`)) #sq skewness(WHO_2010$`Life expectancy`^2) #Shapiro-Wilk normality test, p>0,5 = normal dist. shapiro.test(WHO_2010$`Life expectancy`) shapiro.test(scale(WHO_2010$`Life expectancy`)) shapiro.test(log(WHO_2010$`Life expectancy`)) shapiro.test(WHO_2010$`Life expectancy`^2) WHO_2010 %>% ggplot(aes(x=log(`Life expectancy`)))+ geom_histogram(binwidth = 0.06) WHO_2010 %>% ggplot(aes(x=scale(`Life expectancy`)))+ geom_histogram(binwidth = 0.7) WHO_2010 %>% ggplot(aes(x=`Life expectancy`^2))+ geom_histogram(binwidth = 900) WHO_2010 %>% ggplot(aes(x=BMI))+ geom_histogram() shapiro.test(WHO_2010$BMI) shapiro.test(scale(WHO_2010$BMI)) shapiro.test(log(WHO_2010$BMI)) shapiro.test(WHO_2010$BMI^2) transformation <- abs(WHO_2010$BMI - mean(WHO_2010$BMI, na.rm=T)) shapiro.test(transformation) WHO_2010 %>% ggplot(aes(x=transformation))+ geom_histogram() WHO_2010 %>% ggplot(aes(x=log(BMI)))+ geom_histogram() WHO_2010 %>% ggplot(aes(x=scale(BMI)))+ geom_histogram() WHO_2010 %>% ggplot(aes(x=BMI^2))+ geom_histogram() WHO_2010 %>% ggplot(aes(y=`Life expectancy`, x=BMI))+ geom_point()+ geom_smooth(method = "lm") WHO_2010 %>% ggplot(aes(y=`Life expectancy`^2, x=BMI))+ geom_point()+ geom_smooth(method = "lm") WHO_2010 %>% ggplot(aes(y=`Life expectancy`^2, x=transformation))+ geom_point()+ geom_smooth(method = "lm") plot_model(mod1, type="diag") mod2 <- lm(`Life expectancy`^2~BMI, data=WHO_2010) summary(mod2) plot_model(mod2, type="diag") #outliers library(broom) tidy(mod2) mod2_fit <- augment(mod2) mod3 <- lm(`Life expectancy`~BMI+Status, data=WHO_2010) summary(mod3) plot_model(mod3, type="diag") plot_model(mod3) #interaction terms mod4 <- lm(`Life expectancy`~BMI:Status, data=WHO_2010) summary(mod4) plot_model(mod4, type="diag") plot_model(mod4, type="int") plot_model(mod4) #save transformations #WHO_2010 <- WHO_2010 %>% # mutate(logGDP=log(GDP), # logInfant=log(1+`infant deaths`), # transformedBMI=abs(WHO_2010$BMI - mean(WHO_2010$BMI, na.rm=T)), # cubeAlcohol = WHO_2010$Alcohol^(1/3), # logHIV = log(`HIV/AIDS`), # sqLife = `Life expectancy`^2) #hist(WHO_2010$logGDP) #hist(WHO_2010$logInfant) #mod5 <- lm(sqLife~transformedBMI+Status+`Adult Mortality`+cubeAlcohol+logGDP+logInfant+logHIV, data=WHO_2010) #summary(mod5) #plot_model(mod5, type="diag") #plot_model(mod5, type = "pred") #plot_model(mod5, type="int") WHO_2010 %>% ggplot(aes(x=log(Alcohol)))+ geom_histogram() WHO_2010 %>% ggplot(aes(x=Alcohol))+ geom_histogram() shapiro.test(WHO_2010$Alcohol) hist(WHO_2010$Alcohol) shapiro.test(WHO_2010$Alcohol^(1/3)) hist(WHO_2010$Alcohol^(1/3)) shapiro.test(log(WHO_2010$Alcohol)) hist(log(WHO_2010$Alcohol)) shapiro.test(sqrt(WHO_2010$Alcohol)) hist(sqrt(WHO_2010$Alcohol)) #model development reg_WHO_2010 <- WHO_2010 %>% select(-Country, -Year) mod_complete <- lm(`Life expectancy`~. , data = reg_WHO_2010) summary(mod_complete) plot_model(mod_complete, type="diag") plot_model(mod_complete, type = "pred") reg_WHO_2010 <- reg_WHO_2010 %>% select(-`under-five deaths`) mod_complete <- lm(`Life expectancy`~. , data = reg_WHO_2010) summary(mod_complete) plot_model(mod_complete, type="diag") plot_model(mod_complete, type = "pred") reg_WHO_2010 <- reg_WHO_2010 %>% select(-`percentage expenditure`) mod_complete <- lm(`Life expectancy`~. , data = reg_WHO_2010) summary(mod_complete) plot_model(mod_complete, type="diag") plot_model(mod_complete, type = "pred") reg_WHO_2010 <- reg_WHO_2010 %>% select(-`thinness 5-9 years`) mod_complete <- lm(`Life expectancy`~. , data = reg_WHO_2010) summary(mod_complete) plot_model(mod_complete, type="diag") plot_model(mod_complete, type = "pred") #heteroskedasticity library(lmtest) library(sandwich) coeftest(mod_complete, vcov = vcovHC(mod_complete, type="HC1")) coeftest(mod_complete, vcov = vcovHC(mod_complete, type="HC3")) library(sjmisc) descr(reg_WHO_2010) reg_WHO_2010 %>% select(-Status) %>% sjp.corr() WHO_2010 %>% ggplot(aes(x=`Alcohol`, y=`Adult Mortality`))+ geom_point()+ geom_smooth(method = "lm") WHO_2010 %>% ggplot(aes(x=`Alcohol`, y=`Schooling`))+ geom_point()+ geom_smooth(method = "lm") mod6 <- lm(`Life expectancy`~ Alcohol*Schooling , data = reg_WHO_2010) summary(mod6) plot_model(mod6, type="diag") plot_model(mod6, type = "pred") plot_model(mod6, type="int") mod7 <- lm(`Life expectancy`~ Alcohol*Schooling+BMI, data = reg_WHO_2010) summary(mod7) plot_model(mod7, type="diag") plot_model(mod7, type = "pred") plot_model(mod7, type="int") library(jtools) summ(mod_complete, robust="HC1") export_summs(mod_complete, model.names = c("Complete"), robust="HC1") export_summs(mod_complete, model.names = c("Complete"), robust="HC3", scale = T, digits = 5) export_summs(mod_complete, mod6, model.names = c("Complete", "Model 6"), robust="HC3", digits = 2, to.file = "html", file.name = "tabulka_model.html") effect_plot(mod_complete, pred = Status, interval = T) ####### Fixed-effects models Life_Expectancy_Data %>% ggplot(aes(y=`Life expectancy`, x=BMI))+ geom_point()+ geom_smooth(method = "lm") Life_Expectancy_Data %>% filter(Country %in% c("Albania", "Austria", "Belgium", "Nigeria", "Chad", "Congo", "Germany")) %>% ggplot(aes(y=`Life expectancy`, x=BMI, col=Country))+ geom_point()+ geom_smooth(method = "lm") Life_Expectancy_Data %>% filter(Country %in% c("Albania", "Austria", "Belgium", "Nigeria", "Chad", "Congo", "Germany")) %>% filter(BMI>10) %>% ggplot(aes(y=`Life expectancy`, x=BMI, col=Country))+ geom_point()+ geom_smooth(method = "lm") Life_Expectancy_Data %>% filter(Country %in% c("Albania", "Austria", "Belgium", "Nigeria", "Chad", "Congo", "Germany")) %>% filter(BMI>10) %>% ggplot(aes(y=`Life expectancy`, x=BMI, col=Country, label=Year))+ geom_point()+ geom_smooth(method = "lm")+ geom_text() library(ggrepel) Life_Expectancy_Data %>% filter(Country %in% c("Albania", "Austria", "Belgium", "Nigeria", "Chad", "Congo", "Germany")) %>% filter(BMI>10) %>% ggplot(aes(y=`Life expectancy`, x=BMI, col=Country, label=Year))+ geom_point()+ geom_smooth(method = "lm")+ #theme(legend.position="none")+ geom_text_repel() Life_Expectancy_Data %>% #filter(Country %in% c("Albania", "Austria", "Belgium", "Nigeria", "Chad", "Congo", "Germany")) %>% filter(BMI>10) %>% ggplot(aes(y=`Life expectancy`, x=BMI, col=Country, label=Year))+ geom_point()+ geom_smooth(method = "lm")+ theme(legend.position="none") #geom_text_repel() library(plm) Life_Expectancy_Data <- Life_Expectancy_Data %>% mutate(LifeExp = `Life expectancy`) pool.mod <- lm(LifeExp ~ BMI, data=Life_Expectancy_Data) summary(pool.mod) mod2.1 <- lm(LifeExp ~ BMI + factor(Country) -1, data=Life_Expectancy_Data) summary(mod2.1) # BMI 0.0183675 0.0041064 4.4729 8.032e-06 *** p.mod2 <- plm(LifeExp ~ BMI, data=Life_Expectancy_Data, index=c("Country", "Year"), model="within") summary(p.mod2) fixef(p.mod2) pFtest(p.mod2, pool.mod) p.mod2.time <- plm(LifeExp ~ BMI + factor(Year), data=Life_Expectancy_Data, index=c("Country", "Year"), model="within") summary(p.mod2.time) p.mod2.time2 <- plm(LifeExp ~ BMI, data=Life_Expectancy_Data, index=c("Country", "Year"), model="within", effect="twoway") summary(p.mod2.time2) pFtest(p.mod2.time, p.mod2) #p-value < 0.05 = je lepší přidat time fixed-effects plmtest(p.mod2, c("time"), type=("bp")) #p-value < 0.05 = fixed-effects model je lepsi nez ols p.mod3 <- plm(LifeExp ~ BMI, data=Life_Expectancy_Data, index=c("Country", "Year"), model = "random") summary(p.mod3) phtest(p.mod2, p.mod3) #p-value < 0.05 = fixed-effects model je lepsi nez random p.mod4 <- plm(LifeExp ~ BMI, data=Life_Expectancy_Data, index=c("Country", "Year"), model = "pooling") summary(p.mod4) plmtest(p.mod4, type=("bp")) #p-value < 0.05 = random model je lepsi nez OLS ##### logit_data <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv") logit_data %>% ggplot(aes(y=admit, x=gre))+ geom_point() log.1 <- glm(admit ~ gre + gpa +factor(rank), data=logit_data, family = "binomial") summary(log.1) exp(coef(log.1)) plot_model(log.1) plot_model(log.1, type = "pred") export_summs(log.1, exp = F) ordered(logit_data$rank) log.2 <- glm(admit ~ gre + gpa + ordered(rank), data=logit_data, family = "binomial") summary(log.2) exp(coef(log.2)) plot_model(log.2) plot_model(log.2, type = "pred") export_summs(log.2, exp = TRUE)