###
###  MODERN STATISTICAL METHODS (NMST434)
###  Exercise class 7 
###

rm(list=ls());
 

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
# # 1. Identification of parameters by LS and quantile regression 
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

library(quantreg); # Package for quantile regression 
# install.packages("quantreg")

n <- 1000; # sample size 

x0 <- rep(1, n);        # the first column in regression matrix for intercept 
x1 <- runif(n);         # regressor
X <- cbind(x0, x1);     # regression matrix 

beta <- c(0,1);         # True value of the parameter beta 


# # Model A - homoscedastic normal errors 
eps <- rnorm(n); # N(0,1)

Y <- X%*%beta + eps;

# Find the limiting values of the estimates (what is being estimated)
plot(x1, Y, cex=0.2, main="Homoscedastic normal errors")
(fit.lm <- lm(Y~x1));
# (fit.lad <- rq(Y~x1));
tau.cons <- c(0.1, 0.25, 0.5, 0.75, 0.9); # considered quantiles 
n.tau <- length(tau.cons);
fit.rq <- rq(Y~x1,tau=tau.cons);
fit.rq;

for(i in 1:length(tau.cons)){
  abline(a=coef(fit.rq)[1,i],b=coef(fit.rq)[2,i], col=i, lwd=2) # fitted conditional quantile
  abline(a = beta[1] + qnorm(tau.cons[i]), b = beta[2], col=i,lwd=2, lty="dotted") # true conditional quantile
}

abline(fit.lm$coef, col="blue", lwd=3, lty="dashed");

legend("topleft", lty = c("dashed", rep("solid", n.tau), "dotted"), col=c("blue", 1:n.tau, "black"), lwd=c(3,rep(2, n.tau),2), legend=c("LS", paste("Estim. regress. quantiles, tau=", tau.cons), "True cond. quantiles"))

# # Signs of the residuals 
apply(sign(residuals(fit.rq)), 2, table)
apply(sign(round(residuals(fit.rq),12)), 2, table)

# Approximately  tau * 100% of the data points lie below the estimated tau-conditional 
# quantile function. 


# # Model B - homoscedastic errors that are exponentially distributed 
eps <- rexp(n);

Y <- X%*%beta + eps;

plot(x1, Y, cex=0.2, main = "Homosced. errors that are exponetially distributed")
(fit.lm <- lm(Y~x1));
# (fit.lad <- rq(Y~x1));
tau.cons <- c(0.1, 0.25, 0.5, 0.75, 0.9);
fit.rq <- rq(Y~x1,tau=tau.cons);
fit.rq;


for(i in 1:length(tau.cons)){
  abline(a=coef(fit.rq)[1,i],b=coef(fit.rq)[2,i], col=i, lwd=2) # fitted conditional quantile
  abline(a = beta[1] + qexp(tau.cons[i]), b = beta[2], col=i,lwd=2, lty="dotted") # true conditional quantile
}

abline(fit.lm$coef, col="blue", lwd=3, lty="dashed");
abline(beta[1] + 1, beta[2], col="black",lwd=2, lty="dashed") # true conditional expectation

legend("topleft", lty = c("dashed", "dashed",  rep("solid", n.tau), "dotted"), col=c("blue", "black", 1:n.tau, "black"), lwd=c(3,2,rep(2, n.tau),2), legend=c("LS", "True condit. expectation", paste("Estim. regress. quantiles, tau=", tau.cons), "True cond. quantiles"))



# # Model C - heteroscedastic normal errors 
sigma <- function(x1) exp(x1)
eps <- rnorm(n)*sigma(x1);

Y <- X%*%beta + eps;
plot(x1, Y, cex=0.2, main = "Heterosced. normal errors")

# Find the limiting values of the estimates (what is being estimated)
(fit.lm <- lm(Y~x1));
# (fit.lad <- rq(Y~x1));

tau.cons <- c(0.1, 0.25, 0.5, 0.75, 0.9);
fit.rq <- rq(Y~x1,tau=tau.cons);
fit.rq;

xgrid <- seq(min(x1), max(x1), length=10000)

for(i in 1:length(tau.cons)){
  abline(a=coef(fit.rq)[1,i],b=coef(fit.rq)[2,i], col=i, lwd=2) # fitted conditional quantile
  lines(xgrid, beta[1] + beta[2]*xgrid + qnorm(tau.cons[i])*exp(xgrid), col=i,lwd=2, lty="dotted") # true conditional quantile
}

abline(fit.lm$coef, col="blue", lwd=3, lty="dashed");
legend("topleft", lty = c("dashed", rep("solid", n.tau), "dotted"), col=c("blue", 1:n.tau, "black"), lwd=c(3,rep(2, n.tau),2), legend=c("LS", paste("Estim. regress. quantiles, tau=", tau.cons), "True cond. quantiles"))



# # Model D - heteroscedastic errors that are exponentially distributed 
sigma <- function(x1) exp(x1)
eps <- rexp(n)*sigma(x1);

Y <- X%*%beta + eps;

# Find the limiting values of the estimates (what is being estimated)
plot(x1, Y, cex=0.2, main = "Heterosced. errors that are exp. distributed")
(fit.lm <- lm(Y~x1));

tau.cons <- c(0.1, 0.25, 0.5, 0.75, 0.9);
fit.rq <- rq(Y~x1,tau=tau.cons);
fit.rq;

xgrid <- seq(min(x1), max(x1), length=10000)

for(i in 1:length(tau.cons)){
  abline(a=coef(fit.rq)[1,i],b=coef(fit.rq)[2,i], col=i, lwd=2) # fitted conditional quantile
  lines(xgrid, beta[1] + beta[2]*xgrid + qexp(tau.cons[i])*exp(xgrid), col=i,lwd=2, lty="dotted") # true conditional quantile
}

abline(fit.lm$coef, col="blue", lwd=3, lty="dashed");
 lines(xgrid, beta[1] + beta[2]*xgrid + exp(xgrid), col="black",lwd=2, lty="dashed") # true conditional expectation
 
legend("topleft", lty = c("dashed", "dashed",  rep("solid", n.tau), "dotted"), col=c("blue", "black", 1:n.tau, "black"), lwd=c(3,2,rep(2, n.tau),2), legend=c("LS", "True condit. expectation", paste("Estim. regress. quantiles, tau=", tau.cons), "True cond. quantiles"))



# Compare the LS fit 
fit.lm
# wit the imiting values of the least squares estimator. 
EXX <- matrix(c(1, 1/2,1/2, 1/3), 2, 2);
EXY <- c(beta[1] + beta[2]*1/2 + exp(1)-1, beta[1]*1/2+beta[2]*1/3 + 1);
solve(EXX, EXY)

# For U uniformly distributed on [0,1]
# E U = 1/2, E U^2 = 1/3, E e^U = exp(1) - 1, E U e^{U} = 1



# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #  
# # 2. Example: Dataset Hosi0 -- analysing birthweight 
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

library(mffSM); 
data(Hosi0);

head(Hosi0)
(n <- nrow(Hosi0));

tau.cons <- seq(0.1,0.9, by=0.1); # Considered quantiles 

# # A. Birth weight vs "age of the mother > 25"

# Interpret the estimated coefficients 
(fit <- rq(bweight~I(mage >= 30), data=Hosi0, tau=tau.cons));

# A closer inspection shows the "warning message" is for tau = 0.5
rq(bweight~I(mage >= 30), data=Hosi0, tau=0.5);

(temp <- table(I(Hosi0$mage >= 30)))
temp[1]*tau.cons;
temp[2]*tau.cons;

plot(fit);
# (Intercept): an estimate of the quantile function of bweight|mage<=25
# I(mage>30)TRUE: correction from the intercept to get the quantile function
#                 of bweight|mage>30
plot(summary(fit)); # with pointwise confidence intervals 

# Test of equality of the slopes 
anova(fit)
# help(anova.rq)
# anova tests for regression quantiles:
#    when applied to a single model with different tau's (as above):
#         test whether for all tau's the slope parameters are equal
#         This is in our situation (two-sample problem) equivalent with
#                 a test whether the two distributions bweight|mage<=25 and 
#                 bweight|mage>25 are just shifted by a constant or not 

# # B. Birth weight vs age of the father
(fit <- rq(bweight~I(fage-mean(fage)), data=Hosi0, tau=tau.cons));
summary(fit)
# summary(fit, se="boot")
plot(summary(fit));

anova(fit)
# help(anova.rq)
# anova tests for regression quantiles:
#    when applied to a single model with different tau's (as above):
#         test whether for all tau's the slope parameters are equal

# plot
n <- nrow(Hosi0); # sample size 
plot(Hosi0$fage, Hosi0$bweight, cex=0.1);
table(Hosi0$fage);
table(Hosi0$bweight)

# jittered values for a nicer plot
plot(Hosi0$fage+(runif(n)-0.5), Hosi0$bweight+10*(runif(n)-0.5), 
     cex=0.1, xlab="Age of father", ylab="Birth weight")

# estimated conditinal quantiles
for(i in 1:length(tau.cons)){
  abline(fit$coeff[,i], col=i,lwd=2);
}


# # Attempt to find out how the estimates of the standard errors are computed
(fit.lad <- rq(bweight~I(fage-mean(fage)), data=Hosi0))
(sfit.lad <-  summary(fit.lad, se="iid"))

# Consider tau = 0.5
tau0 = 0.5
res <- residuals(fit.lad) 
# hn <- n^(-1/5)*(4.5*dnorm(0)^4)^(1/5)
# Hall-Sheather's bandwidth
(hn <- n^(-1/3)*qnorm(0.975)^(2/3)*(1.5*dnorm(0)^2)^(1/3)); 

bandwidth.rq(0.5, n, hs = T)

# sparsity estimator as in the course notes 
sparsity.cn <- (quantile(res, tau0+hn)-quantile(res, tau0-hn))/(2*hn)
sparsity.cn

#  sparsity estimator as in "summary.rq" 
p <- 2;
(n.h <- max(p + 1, ceiling(n * bandwidth.rq(tau0, n, hs = T))));

pz <- sum(abs(res) < 1e-12)
ir <- (pz + 1):(n.h + pz + 1)
ord.resid <- sort(res[order(abs(res))][ir])
xt <- ir/(n - p)
(sparsity.rq <- rq(ord.resid ~ xt)$coef[2])

plot(sort(res))
iii <- floor(n*(tau0-hn/2)):ceiling(n*(tau0+hn/2))
plot(iii/n, sort(res)[iii], xlab="tau", ylab="quantile function of residuals")
abline(h=0, lty="dotted")
fit0 <- rq(sort(res)[iii]~I(iii/n))
abline(coef(fit0), col="blue")
coef(fit0)


sparsity.cn/sparsity.rq


# Calculation as in the course notes 
X <- cbind(1, Hosi0$fage - mean(Hosi0$fage));
(SE <- sqrt(tau0*(1-tau0))*sparsity.cn*sqrt(diag(solve(t(X)%*%X))))

  
# To be compared with 
(sfit.lad[[3]])

# Ratio of our implemnation and implementation in summary.rq
SE/(sfit.lad[[3]])[,2]

# Which corresponds to the ratio of sparsity estimates
sparsity.cn/sparsity.rq

summary(fit.lad, se="iid")
summary(fit.lad, se="boot"); # boostrap method


# # C. Birth weight vs age of mother and age of father

summary(lm(bweight ~ I(mage-mean(mage))+ I(fage-mean(fage)),data=Hosi0));

fit <- rq(bweight ~ I(mage-mean(mage))+ I(fage-mean(fage)), data=Hosi0, tau=tau.cons)
summary(fit);
plot(summary(fit));

anova(fit) 
# above we have again just a test of equality of slopes 
# (both of them simultaneously)

## anova test of a sub-model
## apply anova.rq() only for a sequence of nested models with single tau
itau <- 2
(tau0 = tau.cons[itau])
anova(rq(bweight ~ +I(fage-mean(fage)), data=Hosi0, tau=tau0), rq(bweight ~ I(mage-mean(mage))+I(fage-mean(fage)),data=Hosi0, tau=tau0))
## compare with
summary(fit)[itau]