###
### MODERN STATISTICAL METHODS (NMST434)
### Exercise class Nr. 6
###

rm(list=ls());

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# 1. Robust estimators of location
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

library(robustbase);
library(MASS)

#
# A. Basic illustration of the mean, the median and Huber's estimator
#

set.seed(20220323)

# Sample size
n <- 100;

# A random sample from a contaminated normal distribution
u <- runif(n);
x <- rnorm(n)*(u < 0.9) + 5*rnorm(n)*(u >= 0.9);

# hist(x)
hist(x, breaks=20)

mean(x);
# [1] 0.04090808
median(x);
# [1] -0.08085571

# Studentized Huber estimator with the MAD taken as the estimator of scale, k = 1.5
# psi <- function(x, k=1.345) x*(abs(x) <= k) + k*sign(x)*(abs(x) > k)
huber(x);
# $mu
# [1] -0.02661503
#
# $s
# [1] 0.837996

# MAD
mad(x)
median(abs(x - median(x)))/qnorm(0.75)

# Other estimates of scale
sd(x);

# Interquartile range
IQR(x);
(quantile(x,0.75) - quantile(x,0.25));

# Standardized IQR
(quantile(x,0.75) - quantile(x,0.25))/(qnorm(0.75) - qnorm(0.25));


#
# B. Very heavily contaminated sample (breakdown point illustration)
#
x0 <- rnorm(51);
x0[52:100] <- 1e12;

hist(x0[1:51]);

mean(x0);
# [1] 4.9e+11
median(x0);
# [1] 1.970713
huber(x0);
# $mu
# [1] 8.635593
#
# $s
# [1] 5.697667

# Adding one more contaminated point so that exactly half of the points are contaminated
x1 <- x0;
x1[51] <- 1e12;
huber(x1);
median(x1);


#
# C. Simulation study
#

nopak <- 1000;
MEAN   <- numeric(nopak);
MEDIAN <- numeric(nopak);
HUBER  <- numeric(nopak);

for(i in 1:nopak){
  u <- runif(n);
  # x <- rexp(n)*sign(u - 0.5);                        # Laplace
  x <- rnorm(n)*(u < 0.9) + 5*rnorm(n)*(u >= 0.9);     # contaminated normal
  # x <- rnorm(n);                                     # N(0,1)
  MEAN[i]   <- mean(x);
  MEDIAN[i] <- median(x);
  HUBER[i]  <- huber(x)$mu;
}

assess.theta <- function(theta, digits=3){
  ret <- c(min(theta), mean(theta), median(theta), max(theta), sd(theta), IQR(theta));
  names(ret) <- c("min", "mean", "median", "max", "sd", "IQR");
  return(round(ret, digits=digits))
}

# print("Performance of mean:")
# assess.theta(MEAN);
# print("Performance of median:")
# assess.theta(MEDIAN);
# print("Performance of (studentized) Huber estimate:")
# assess.theta(HUBER);

res <- rbind(assess.theta(MEAN), assess.theta(MEDIAN), assess.theta(HUBER));
rownames(res) <- c("MEAN", "MEDIAN", "HUBER");
res


# # # # # # # # # # # # # # # # # #
#
# D. A sample from a non-symmetric distribution
#

mu <- log(32.870);
sigma <- sqrt(2*(log(38.525) - mu));

# Sample from a log-normal distribution
n <- 10000;
x <- exp(rnorm(n, mean=mu, sd=sigma));
hist(x, breaks=50)

# Why do the following estimators differ?
mean(x);
# [1] 38.50325
median(x);
# [1] 33.25538
huber(x);
# $mu
# [1] 35.69541
#
# $s
# [1] 17.59605
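# A possible answer, sketched as a quick numerical check: for a skewed distribution the
# mean, the median and Huber's estimator target different population quantities, so the
# three estimates need not agree even for very large n. If X ~ LogNormal(mu, sigma^2),
# then E(X) = exp(mu + sigma^2/2) and med(X) = exp(mu); with mu and sigma as chosen above
# these equal 38.525 and 32.870 by construction, which is roughly what mean(x) and
# median(x) return. Huber's estimator targets yet another location functional
# (here lying between the two).
c(exp(mu + sigma^2/2), exp(mu))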
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# 2. Compare the performance of different
#    estimators of parameters in linear models
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

#
# A. Simulation study
#

# install.packages("quantreg")
library(quantreg);

nopak <- 10000;
n <- 100;
beta1 <- 1;
beta0 <- 1;

beta.lm    <- numeric(nopak);
beta.lad   <- numeric(nopak);
beta.huber <- numeric(nopak);

for(i in 1:nopak){
  x1 <- runif(n);                                        # regressor

  # Errors
  # eps <- rnorm(n);                                     # N(0,1)
  # u <- runif(n);
  # eps <- rnorm(n)*(u < 0.9) + 5*rnorm(n)*(u >= 0.9);   # contaminated normal
  eps <- rexp(n);                                        # Exp(1)
  # eps <- sign(runif(n) - 0.5)*rexp(n);                 # Laplace

  Y <- beta0 + beta1*x1 + eps;
  beta.lm[i]    <- lm(Y~x1)$coef[2];
  beta.lad[i]   <- rq(Y~x1)$coef[2];
  beta.huber[i] <- rlm(Y~x1)$coef[2];
}

res <- rbind(assess.theta(beta.lm), assess.theta(beta.lad), assess.theta(beta.huber));
rownames(res) <- c("LS", "LAD", "HUBER");
res

# Try different distributions (normal N(0,1), exponential, Laplace, t-distribution) for the
# errors in the linear model and compare the performances of the estimators of the slope beta1.


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# B. Case study - conditionally exponential distribution.
#
# The conditional distribution of Y given X = x is exponential with rate lambda(x) = 1/(2*(x + 1)), i.e.
#
#    f(y) = lambda * exp(-lambda*y) * I{y > 0}
#

set.seed(10)
n <- 10000;
# x1 <- sort(2*rexp(n));
x1 <- sort(2*runif(n));
Y <- rexp(n, rate=0.5/(1+x1));
plot(x1, Y, cex=0.1)

# Try to explain the estimates of the parameters of the linear model obtained by the different methods.
print("LS:")
(fit.lm <- lm(Y~x1));
# Call:
# lm(formula = Y ~ x1)
#
# Coefficients:
# (Intercept)           x1
#       2.006        2.005

print("LAD:")
(fit.lad <- rq(Y~x1));
# Call:
# rq(formula = Y ~ x1)
#
# Coefficients:
# (Intercept)          x1
#    1.451852    1.401637

# Huber, k = 1.345
print("Huber:");
(fit.huber <- rlm(Y~x1));
# Call:
# rlm(formula = Y ~ x1)
# Converged in 7 iterations
#
# Coefficients:
# (Intercept)          x1
#    2.010275    1.351537
#
# Degrees of freedom: 10000 total; 9998 residual
# Scale estimate: 3.01

beta.iter <- coef(fit.lm);
n.iter <- 10;
X <- cbind(1, x1);
psi <- function(x, k=1.345) x*(abs(x) <= k) + k*sign(x)*(abs(x) > k)

# Manual calculation just to illustrate the iteratively reweighted least squares algorithm
for(i in 1:n.iter){
  res <- Y - X%*%beta.iter;                  # current residuals
  s <- mad(res, 0);                          # uncentred MAD as the scale estimate
  scaled.res <- res/s;
  W <- psi(scaled.res)/scaled.res;           # Huber weights
  beta.iter <- coef(lm(Y~x1, weights=W));    # weighted least squares update
  print(c(beta.iter, s))
  # The same update in matrix form (here W would have to be a diagonal weight matrix):
  # beta.iter <- solve(t(X)%*%W%*%X)%*%t(X)%*%W%*%Y;
}

# The scale estimate reported by rlm is the uncentred MAD of the residuals, i.e.
mad(fit.huber$residuals, 0)
median(abs(fit.huber$residuals))/qnorm(0.75)

# Comparison of the estimators
res <- round(rbind(coef(fit.lm), coef(fit.lad), coef(fit.huber)), 2);
rownames(res) <- c("LS", "LAD", "Huber")
print(res)

# Figure
plot(x1, Y, cex=0.1);
abline(fit.lm, col="black", lwd=2);
abline(fit.lad, col="blue", lwd=2);
abline(fit.huber, col="red", lwd=2);
legend(0, 30, col=c("black", "blue", "red"), lwd=rep(2,3), legend=c("LS", "LAD", "Huber"))


#
# C. Simple linear model with outliers in the response
#
n <- 100;
x <- rnorm(n);
# x <- sort(rnorm(n));
# u <- runif(n);
Y <- 1 + 2*x + rnorm(n);
# Y[10:20] <- 1 - 2*x[10:20]
Y[1:10] <- -Y[1:10] + 10                     # contaminate the first ten responses
plot(x, Y)

summary(fit.lm <- lm(Y~x))
summary(fit.lad <- rq(Y~x))
summary(fit.huber <- rlm(Y~x))

plot(x, Y);
abline(fit.lm, col="black", lwd=2);
abline(fit.lad, col="blue", lwd=2);
abline(fit.huber, col="red", lwd=2);
abline(a=1, b=2, lty="dotted");              # true regression line
legend("bottomright", col=c("black", "blue", "red"), lwd=rep(2,3), legend=c("LS", "LAD", "Huber"))
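# A small follow-up sketch (it assumes the objects fit.lm, fit.lad and fit.huber from the
# block just above are still in the workspace): compare the estimated slopes with the true
# slope 2 used to generate the data. The contaminated responses in Y[1:10] tend to pull
# the LS slope away from 2 more strongly than the LAD or Huber slopes.
slopes <- c(LS    = unname(coef(fit.lm)["x"]),
            LAD   = unname(coef(fit.lad)["x"]),
            Huber = unname(coef(fit.huber)["x"]));
round(slopes - 2, 3)                         # deviation of each estimated slope from the truth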