# read data:
file <- "C:/Users/krizp/Documents/vyuka MFF/NMFM402 - Matematika NZP 2/Cviceni/CV7/moped.txt"
moped <- read.table(file, header = TRUE)
head(moped)

##### Data description ###################
# class     ... 1 = Weight over 60 kg and more than 2 gears; 2 = Other
# age       ... 1 = At most 1 year; 2 = 2 years or more
# zone      ... 1 = Central and semi-central parts of Sweden's three largest cities;
#               2 = Suburbs and middle-sized towns; 3 = Lesser towns, except those in 5 or 7;
#               4 = Small towns and countryside, except 5-7; 5 = Northern towns;
#               6 = Northern countryside; 7 = Gotland (Sweden's largest island)
# duration  ... Policy duration [in years]
# severity  ... Claim severity [in SEK]
# number    ... Number of claims
# pure      ... Pure premium [in SEK]
# actual    ... Actual premium [in SEK; the premium for one year according to the tariff in force in 1999]
# frequency ... Claim frequency [per year] = number/duration
###########################################

moped <- within(moped, {
  class <- factor(class)
  age   <- factor(age)
  zone  <- factor(zone)
})
summary(moped)

rel <- data.frame(rating.factor = c(rep("Vehicle class", nlevels(moped$class)),
                                    rep("Vehicle age",   nlevels(moped$age)),
                                    rep("Zone",          nlevels(moped$zone))),
                  class = c(levels(moped$class), levels(moped$age), levels(moped$zone)),
                  stringsAsFactors = FALSE)

## Calculate the total duration and number of claims per rating-factor level and
## set the contrasts so that the level with the largest duration becomes the
## base level. We use foreach here to execute the loop both for its side effect
## (setting the contrasts) and to accumulate the sums.
library("foreach")
new.cols <- foreach(rating.factor = c("class", "age", "zone"), .combine = rbind) %do% {
  nclaims  <- tapply(moped$number,   moped[[rating.factor]], sum)
  sums     <- tapply(moped$duration, moped[[rating.factor]], sum)
  n.levels <- nlevels(moped[[rating.factor]])
  contrasts(moped[[rating.factor]]) <-
    contr.treatment(n.levels)[rank(-sums, ties.method = "first"), ]
  data.frame(dur = sums, n.claims = nclaims)
}
rel <- cbind(rel, new.cols)
rm(new.cols)
print(rel)

###########################################################################
###########################################################################
# Number of claims (Frequency)
###########################################################################
###########################################################################

### exploratory analysis
library(ggplot2)
ggplot(moped, aes(number, fill = class)) + geom_histogram(binwidth = 1) +
  facet_grid(class ~ ., margins = TRUE, scales = "free")
ggplot(moped, aes(number, fill = age)) + geom_histogram(binwidth = 1) +
  facet_grid(age ~ ., margins = TRUE, scales = "free")
ggplot(moped, aes(frequency, fill = age)) + geom_histogram() +
  facet_grid(age ~ ., margins = TRUE)
ggplot(moped, aes(number, fill = zone)) + geom_histogram(binwidth = 1) +
  facet_grid(zone ~ ., margins = TRUE, scales = "free")
ggplot(moped, aes(frequency, fill = zone)) + geom_histogram() +
  facet_grid(zone ~ ., margins = TRUE, scales = "free")

#########################################
### modeling: Poisson with log link   ###
#########################################
summary(model.frequency <- glm(number ~ class + age + zone + offset(log(duration)),
                               data = moped, family = poisson))

# Equivalent fit on the frequency scale: use the quasipoisson family to stop
# glm() from complaining about the non-integral response number/duration.
summary(glm(number/duration ~ class + age + zone,
            family = quasipoisson, data = moped, weights = duration))

# chi-square goodness-of-fit test based on the residual deviance
with(model.frequency, cbind(res.deviance = deviance, df = df.residual,
                            p = pchisq(deviance, df.residual, lower.tail = FALSE)))
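## A quick informal overdispersion check (a sketch, not part of the original
## exercise): under the Poisson assumption the Pearson statistic divided by the
## residual degrees of freedom should be close to 1; values well above 1 would
## hint at overdispersion. The name `phi.hat` is illustrative.
phi.hat <- sum(residuals(model.frequency, type = "pearson")^2) /
  model.frequency$df.residual
phi.hat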
# test of the significance of the zone factor
model.f2 <- update(model.frequency, . ~ . - zone)  # refit model.frequency without zone
anova(model.f2, model.frequency, test = "Chisq")   # compare the nested models by a chi-square (LR) test

### Graphical prediction ###
newdata1 <- expand.grid(1:7, factor(1:2), 1:2)
colnames(newdata1) <- c("zone", "class", "age")
newdata1$duration <- median(moped$duration)
newdata1 <- within(newdata1, {
  age  <- factor(age)
  zone <- factor(zone)
})
# predictions on the response scale, i.e. expected claim counts at the median duration
newdata1$phat <- predict(model.frequency, newdata1, type = "response")
ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) +
  geom_point(size = 6) + facet_wrap(~class) +
  labs(x = "Zone", y = "Predicted number of claims")

### Prediction of the (relative) frequency for each rating-factor level.
rels <- coef(model.frequency)
rels <- exp(rels[1] + rels[-1]) / exp(rels[1])
rel$rels.frequency <-
  c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")],
    c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")],
    c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])

#################################################
### modeling: Negative binomial with log link ###
#################################################
library(MASS)
summary(model.NB <- glm.nb(number ~ class + age + zone + offset(log(duration)),
                           data = moped,
                           control = glm.control(maxit = 25, trace = TRUE)))

## The negative binomial model allows the conditional variance to exceed the
## conditional mean; the extra spread is captured by a dispersion parameter that
## is implicitly held constant in a Poisson model. The Poisson model is therefore
## nested in the negative binomial model, and a likelihood ratio test of the two
## fits tests the Poisson equidispersion assumption.
(diff2loglik <- as.numeric(2 * (logLik(model.NB) - logLik(model.frequency))))
pchisq(diff2loglik, df = 1, lower.tail = FALSE)
# The large p-value (0.675) gives no evidence of overdispersion, so the simpler
# Poisson model, without the extra dispersion parameter, is preferred (blindly
# ignoring the iteration-limit warning from glm.nb). Strictly speaking, the
# dispersion parameter lies on the boundary of the parameter space under the
# null hypothesis, so this p-value is conservative.
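## Two further comparisons (a sketch, not part of the original exercise): the
## information criteria of the two fits, and the estimated NB dispersion
## parameter theta with its standard error as stored by MASS::glm.nb.
AIC(model.frequency, model.NB)
c(theta = model.NB$theta, SE.theta = model.NB$SE.theta)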
###########################################################################
###########################################################################
# Claim amount (Severity)
###########################################################################
###########################################################################

### exploratory analysis
ggplot(moped, aes(severity)) + geom_histogram() + scale_x_log10() +
  facet_grid(class ~ age, margins = TRUE, scales = "free_y")
ggplot(moped, aes(class, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10()
ggplot(moped, aes(age, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10()
ggplot(moped, aes(zone, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10()

#########################################
### modeling: Gamma with log link     ###
#########################################
# There are a couple of points to note here:
# 1. We model the claim severity with a Gamma distribution for the errors. Note
#    that this is only one of several plausible candidate distributions.
# 2. Because the Gamma distribution has positive support, we need to remove the
#    zero values from the data; we do this with the moped[moped$severity > 0, ] construct.
# 3. We use the non-canonical log link even though the canonical (inverse) link
#    gives a slightly better fit (residual deviance 5.9 versus 8.0 on 16 degrees
#    of freedom). The reason is interpretability: with a log link the rating
#    factors act multiplicatively on the expected severity.
summary(model.severity <- glm(severity ~ class + age + zone,
                              data = moped[moped$severity > 0, ],
                              family = Gamma("log"), weights = number))
(sev.resid <- as.vector(residuals(model.severity)))  # deviance residuals by default

### Some diagnostics ###
diagnostics <- data.frame(residuals = sev.resid,
                          observed  = moped$severity[moped$severity > 0])
qplot(observed, residuals, size = I(4), data = diagnostics)
qplot(observed, residuals^2, size = I(4), data = diagnostics)
res_pearson <- residuals(model.severity, "pearson")
qplot(diagnostics$observed, res_pearson, size = I(4), xlab = "Y", ylab = "Pearson residuals")
res_deviance <- residuals(model.severity, "deviance")
qplot(diagnostics$observed, res_deviance, size = I(4), xlab = "Y", ylab = "Deviance residuals")

### Prediction ###
newdata1 <- expand.grid(1:7, factor(1:2), 1:2)
colnames(newdata1) <- c("zone", "class", "age")
# (no duration column is needed here: the severity model has no offset)
newdata1 <- within(newdata1, {
  age  <- factor(age)
  zone <- factor(zone)
})
# predictions on the response scale, i.e. expected claim severities in SEK
newdata1$phat <- predict(model.severity, newdata1, type = "response")
ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) +
  geom_point(size = 6) + facet_wrap(~class) +
  labs(x = "Zone", y = "Predicted claim amount")

### Prediction of the (relative) severity for each rating-factor level.
rels <- coef(model.severity)
rels <- exp(rels[1] + rels[-1]) / exp(rels[1])
rel$rels.severity <-
  c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")],
    c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")],
    c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])

### GoF test ###
with(model.severity, cbind(res.deviance = deviance, df = df.residual,
                           p = pchisq(deviance, df.residual, lower.tail = FALSE)))

### Submodel testing ###
model.s2 <- update(model.severity, . ~ . - zone)
anova(model.s2, model.severity, test = "Chisq")
# (with the Gamma dispersion estimated, test = "F" would also be a natural choice)

## simplified severity model
summary(model.s2)

### Other GLM models might be tested ###

###########################################################################
###########################################################################
# Premium: Combining the models
###########################################################################
###########################################################################
rel$rels.pure.premium <- with(rel, rels.frequency * rels.severity)
print(rel, digits = 2)
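## A minimal closing sketch (not part of the original exercise): compare the
## modeled pure premium per tariff cell with the recorded pure premium and the
## actual 1999 tariff. The names `fitted.freq`, `fitted.sev` and `fitted.pure`
## are illustrative.
fitted.freq <- predict(model.frequency, newdata = moped, type = "response") / moped$duration
fitted.sev  <- predict(model.severity,  newdata = moped, type = "response")
fitted.pure <- fitted.freq * fitted.sev  # expected cost per policy-year, in SEK
head(cbind(moped[, c("class", "age", "zone")],
           fitted.pure   = fitted.pure,
           observed.pure = moped$pure,
           tariff        = moped$actual))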