# read data:
file <- "C:/Users/krizp/Documents/vyuka MFF/NMFM402 - Matematika NZP 2/Cviceni/CV7/moped.txt"
moped <- read.table(file, header = TRUE)
head(moped)

##### Data description ###################
# class     ... 1 = Weight over 60 kg and more than 2 gears; 2 = Other
# age       ... 1 = At most 1 year; 2 = 2 years or more
# zone      ... 1 = Central and semi-central parts of Sweden's three largest cities;
#               2 = Suburbs and middle-sized towns; 3 = Lesser towns, except those in 5 or 7;
#               4 = Small towns and countryside, except 5-7; 5 = Northern towns;
#               6 = Northern countryside; 7 = Gotland (Sweden's largest island)
# duration  ... Policy duration [in years]
# severity  ... Claim severity [in SEK]
# number    ... Number of claims
# pure      ... Pure premium [in SEK]
# actual    ... Actual premium [in SEK; the premium for one year according to the tariff in force in 1999]
# frequency ... Claim frequency [per year] = number/duration
###########################################

moped <- within(moped, {
  class <- factor(class)
  age   <- factor(age)
  zone  <- factor(zone)
})
summary(moped)

rel <- data.frame(rating.factor = c(rep("Vehicle class", nlevels(moped$class)),
                                    rep("Vehicle age",   nlevels(moped$age)),
                                    rep("Zone",          nlevels(moped$zone))),
                  class = c(levels(moped$class), levels(moped$age), levels(moped$zone)),
                  stringsAsFactors = FALSE)

## Calculate the total duration and number of claims per rating-factor level and
## set the contrasts so that the level with the largest duration becomes the
## base level. We use foreach here to execute the loop both for its side effect
## (setting the contrasts) and to accumulate the sums.
library("foreach")
new.cols <- foreach(rating.factor = c("class", "age", "zone"), .combine = rbind) %do% {
  nclaims  <- tapply(moped$number,   moped[[rating.factor]], sum)
  sums     <- tapply(moped$duration, moped[[rating.factor]], sum)
  n.levels <- nlevels(moped[[rating.factor]])
  contrasts(moped[[rating.factor]]) <-
    contr.treatment(n.levels)[rank(-sums, ties.method = "first"), ]
  data.frame(dur = sums, n.claims = nclaims)
}
rel <- cbind(rel, new.cols)
rm(new.cols)
print(rel)

###########################################################################
###########################################################################
# Number of claims (Frequency)
###########################################################################
###########################################################################

### exploratory analysis
library(ggplot2)
ggplot(moped, aes(number, fill = class)) + geom_histogram(binwidth = 1) +
  facet_grid(class ~ ., margins = TRUE, scales = "free")
ggplot(moped, aes(number, fill = age)) + geom_histogram(binwidth = 1) +
  facet_grid(age ~ ., margins = TRUE, scales = "free")
ggplot(moped, aes(frequency, fill = age)) + geom_histogram() +
  facet_grid(age ~ ., margins = TRUE)
ggplot(moped, aes(number, fill = zone)) + geom_histogram(binwidth = 1) +
  facet_grid(zone ~ ., margins = TRUE, scales = "free")
ggplot(moped, aes(frequency, fill = zone)) + geom_histogram() +
  facet_grid(zone ~ ., margins = TRUE, scales = "free")

#########################################
### modeling: Poisson with log link   ###
#########################################
summary(model.frequency <- glm(number ~ class + age + zone + offset(log(duration)),
                               data = moped, family = poisson))

# Equivalent fit on the frequency scale: use the quasipoisson family to stop
# glm() from complaining about the non-integral response number/duration.
summary(glm(number/duration ~ class + age + zone,
            family = quasipoisson, data = moped, weights = duration))

# chi-square goodness-of-fit test based on the residual deviance
with(model.frequency, cbind(res.deviance = deviance, df = df.residual,
                            p = pchisq(deviance, df.residual, lower.tail = FALSE)))
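## A quick informal overdispersion check (a sketch, not part of the original
## exercise): under the Poisson assumption the Pearson statistic divided by the
## residual degrees of freedom should be close to 1; values well above 1 would
## hint at overdispersion. The name `phi.hat` is illustrative.
phi.hat <- sum(residuals(model.frequency, type = "pearson")^2) /
  model.frequency$df.residual
phi.hat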
# test of the significance of the zone factor
model.f2 <- update(model.frequency, . ~ . - zone)  # refit model.frequency without zone
anova(model.f2, model.frequency, test = "Chisq")   # compare the nested models by a chi-square (LR) test

### Graphical prediction ###
newdata1 <- expand.grid(1:7, factor(1:2), 1:2)
colnames(newdata1) <- c("zone", "class", "age")
newdata1$duration <- median(moped$duration)
newdata1 <- within(newdata1, {
  age  <- factor(age)
  zone <- factor(zone)
})
# predictions on the response scale, i.e. expected claim counts at the median duration
newdata1$phat <- predict(model.frequency, newdata1, type = "response")
ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) +
  geom_point(size = 6) + facet_wrap(~class) +
  labs(x = "Zone", y = "Predicted number of claims")

### Prediction of the (relative) frequency for each rating-factor level.
rels <- coef(model.frequency)
rels <- exp(rels[1] + rels[-1]) / exp(rels[1])
rel$rels.frequency <-
  c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")],
    c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")],
    c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])

#################################################
### modeling: Negative binomial with log link ###
#################################################
library(MASS)
summary(model.NB <- glm.nb(number ~ class + age + zone + offset(log(duration)),
                           data = moped,
                           control = glm.control(maxit = 25, trace = TRUE)))

## The negative binomial model allows the conditional variance to exceed the
## conditional mean; the extra spread is captured by a dispersion parameter that
## is implicitly held constant in a Poisson model. The Poisson model is therefore
## nested in the negative binomial model, and a likelihood ratio test of the two
## fits tests the Poisson equidispersion assumption.
(diff2loglik <- as.numeric(2 * (logLik(model.NB) - logLik(model.frequency))))
pchisq(diff2loglik, df = 1, lower.tail = FALSE)
# The large p-value (0.675) gives no evidence of overdispersion, so the simpler
# Poisson model, without the extra dispersion parameter, is preferred (blindly
# ignoring the iteration-limit warning from glm.nb). Strictly speaking, the
# dispersion parameter lies on the boundary of the parameter space under the
# null hypothesis, so this p-value is conservative.
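## Two further comparisons (a sketch, not part of the original exercise): the
## information criteria of the two fits, and the estimated NB dispersion
## parameter theta with its standard error as stored by MASS::glm.nb.
AIC(model.frequency, model.NB)
c(theta = model.NB$theta, SE.theta = model.NB$SE.theta)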
###########################################################################
###########################################################################
# Claim amount (Severity)
###########################################################################
###########################################################################

### exploratory analysis
ggplot(moped, aes(severity)) + geom_histogram() + scale_x_log10() +
  facet_grid(class ~ age, margins = TRUE, scales = "free_y")
ggplot(moped, aes(class, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10()
ggplot(moped, aes(age, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10()
ggplot(moped, aes(zone, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10()

#########################################
### modeling: Gamma with log link     ###
#########################################
# There are a couple of points to note here:
# 1. We model the claim severity with a Gamma distribution for the errors. Note
#    that this is only one of several plausible candidate distributions.
# 2. Because the Gamma distribution has positive support, we need to remove the
#    zero values from the data; we do this with the moped[moped$severity > 0, ] construct.
# 3. We use the non-canonical log link even though the canonical (inverse) link
#    gives a slightly better fit (residual deviance 5.9 versus 8.0 on 16 degrees
#    of freedom). The reason is interpretability: with a log link the rating
#    factors act multiplicatively on the expected severity.
summary(model.severity <- glm(severity ~ class + age + zone,
                              data = moped[moped$severity > 0, ],
                              family = Gamma("log"), weights = number))
(sev.resid <- as.vector(residuals(model.severity)))  # deviance residuals by default

### Some diagnostics ###
diagnostics <- data.frame(residuals = sev.resid,
                          observed  = moped$severity[moped$severity > 0])
qplot(observed, residuals, size = I(4), data = diagnostics)
qplot(observed, residuals^2, size = I(4), data = diagnostics)
res_pearson <- residuals(model.severity, "pearson")
qplot(diagnostics$observed, res_pearson, size = I(4), xlab = "Y", ylab = "Pearson residuals")
res_deviance <- residuals(model.severity, "deviance")
qplot(diagnostics$observed, res_deviance, size = I(4), xlab = "Y", ylab = "Deviance residuals")

### Prediction ###
newdata1 <- expand.grid(1:7, factor(1:2), 1:2)
colnames(newdata1) <- c("zone", "class", "age")
# (no duration column is needed here: the severity model has no offset)
newdata1 <- within(newdata1, {
  age  <- factor(age)
  zone <- factor(zone)
})
# predictions on the response scale, i.e. expected claim severities in SEK
newdata1$phat <- predict(model.severity, newdata1, type = "response")
ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) +
  geom_point(size = 6) + facet_wrap(~class) +
  labs(x = "Zone", y = "Predicted claim amount")

### Prediction of the (relative) severity for each rating-factor level.
rels <- coef(model.severity)
rels <- exp(rels[1] + rels[-1]) / exp(rels[1])
rel$rels.severity <-
  c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")],
    c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")],
    c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])

### GoF test ###
with(model.severity, cbind(res.deviance = deviance, df = df.residual,
                           p = pchisq(deviance, df.residual, lower.tail = FALSE)))

### Submodel testing ###
model.s2 <- update(model.severity, . ~ . - zone)
anova(model.s2, model.severity, test = "Chisq")
# (with the Gamma dispersion estimated, test = "F" would also be a natural choice)

## simplified severity model
summary(model.s2)

### Other GLM models might be tested ###

###########################################################################
###########################################################################
# Premium: Combining the models
###########################################################################
###########################################################################
rel$rels.pure.premium <- with(rel, rels.frequency * rels.severity)
print(rel, digits = 2)
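## A minimal closing sketch (not part of the original exercise): compare the
## modeled pure premium per tariff cell with the recorded pure premium and the
## actual 1999 tariff. The names `fitted.freq`, `fitted.sev` and `fitted.pure`
## are illustrative.
fitted.freq <- predict(model.frequency, newdata = moped, type = "response") / moped$duration
fitted.sev  <- predict(model.severity,  newdata = moped, type = "response")
fitted.pure <- fitted.freq * fitted.sev  # expected cost per policy-year, in SEK
head(cbind(moped[, c("class", "age", "zone")],
           fitted.pure   = fitted.pure,
           observed.pure = moped$pure,
           tariff        = moped$actual))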