###
### MODERN STATISTICAL METHODS (NMST434)
### Exercise class Nr. 6
###

rm(list=ls());

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# 1. Robust estimators of location
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

library(robustbase);
library(MASS)

#
# A. Basic illustration of the mean, the median and Huber's estimator
#

set.seed(20220323)

# Sample size
n <- 100;

# A random sample from a contaminated normal distribution
u <- runif(n);
x <- rnorm(n)*(u < 0.9) + 5*rnorm(n)*(u >= 0.9);

# hist(x)
hist(x, breaks=20)

mean(x);
# [1] 0.04090808
median(x);
# [1] -0.08085571

# Studentized Huber estimator with the MAD taken as the estimator of scale, k = 1.5
# psi <- function(x, k=1.345) x*(abs(x) <= k) + k*sign(x)*(abs(x) > k)
huber(x);
# $mu
# [1] -0.02661503
#
# $s
# [1] 0.837996

# MAD
mad(x)
median(abs(x - median(x)))/qnorm(0.75)

# Other estimates of scale
sd(x);

# Interquartile range
IQR(x);
(quantile(x,0.75) - quantile(x,0.25));

# Standardized IQR
(quantile(x,0.75) - quantile(x,0.25))/(qnorm(0.75) - qnorm(0.25));


#
# B. Very heavily contaminated sample (breakdown point illustration)
#
x0 <- rnorm(51);
x0[52:100] <- 1e12;

hist(x0[1:51]);

mean(x0);
# [1] 4.9e+11
median(x0);
# [1] 1.970713
huber(x0);
# $mu
# [1] 8.635593
#
# $s
# [1] 5.697667

# Adding one more contaminated point so that exactly half of the points are contaminated
x1 <- x0;
x1[51] <- 1e12;
huber(x1);
median(x1);


#
# C. Simulation study
#

nopak <- 1000;
MEAN   <- numeric(nopak);
MEDIAN <- numeric(nopak);
HUBER  <- numeric(nopak);

for(i in 1:nopak){
  u <- runif(n);
  # x <- rexp(n)*sign(u - 0.5);                        # Laplace
  x <- rnorm(n)*(u < 0.9) + 5*rnorm(n)*(u >= 0.9);     # contaminated normal
  # x <- rnorm(n);                                     # N(0,1)
  MEAN[i]   <- mean(x);
  MEDIAN[i] <- median(x);
  HUBER[i]  <- huber(x)$mu;
}

assess.theta <- function(theta, digits=3){
  ret <- c(min(theta), mean(theta), median(theta), max(theta), sd(theta), IQR(theta));
  names(ret) <- c("min", "mean", "median", "max", "sd", "IQR");
  return(round(ret, digits=digits))
}

# print("Performance of mean:")
# assess.theta(MEAN);
# print("Performance of median:")
# assess.theta(MEDIAN);
# print("Performance of (studentized) Huber estimate:")
# assess.theta(HUBER);

res <- rbind(assess.theta(MEAN), assess.theta(MEDIAN), assess.theta(HUBER));
rownames(res) <- c("MEAN", "MEDIAN", "HUBER");
res


# # # # # # # # # # # # # # # # # #
#
# D. A sample from a non-symmetric distribution
#

mu <- log(32.870);
sigma <- sqrt(2*(log(38.525) - mu));

# Sample from a log-normal distribution
n <- 10000;
x <- exp(rnorm(n, mean=mu, sd=sigma));
hist(x, breaks=50)

# Why do the following estimators differ?
mean(x);
# [1] 38.50325
median(x);
# [1] 33.25538
huber(x);
# $mu
# [1] 35.69541
#
# $s
# [1] 17.59605
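# A possible answer, sketched as a quick numerical check: for a skewed distribution the
# mean, the median and Huber's estimator target different population quantities, so the
# three estimates need not agree even for very large n. If X ~ LogNormal(mu, sigma^2),
# then E(X) = exp(mu + sigma^2/2) and med(X) = exp(mu); with mu and sigma as chosen above
# these equal 38.525 and 32.870 by construction, which is roughly what mean(x) and
# median(x) return. Huber's estimator targets yet another location functional
# (here lying between the two).
c(exp(mu + sigma^2/2), exp(mu))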
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# 2. Compare the performance of different
#    estimators of parameters in linear models
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

#
# A. Simulation study
#

# install.packages("quantreg")
library(quantreg);

nopak <- 10000;
n <- 100;
beta1 <- 1;
beta0 <- 1;

beta.lm    <- numeric(nopak);
beta.lad   <- numeric(nopak);
beta.huber <- numeric(nopak);

for(i in 1:nopak){
  x1 <- runif(n);                                        # regressor

  # Errors
  # eps <- rnorm(n);                                     # N(0,1)
  # u <- runif(n);
  # eps <- rnorm(n)*(u < 0.9) + 5*rnorm(n)*(u >= 0.9);   # contaminated normal
  eps <- rexp(n);                                        # Exp(1)
  # eps <- sign(runif(n) - 0.5)*rexp(n);                 # Laplace

  Y <- beta0 + beta1*x1 + eps;
  beta.lm[i]    <- lm(Y~x1)$coef[2];
  beta.lad[i]   <- rq(Y~x1)$coef[2];
  beta.huber[i] <- rlm(Y~x1)$coef[2];
}

res <- rbind(assess.theta(beta.lm), assess.theta(beta.lad), assess.theta(beta.huber));
rownames(res) <- c("LS", "LAD", "HUBER");
res

# Try different distributions (normal N(0,1), exponential, Laplace, t-distribution) for the
# errors in the linear model and compare the performances of the estimators of the slope beta1.


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# B. Case study - conditionally exponential distribution.
#
# The conditional distribution of Y given X = x is exponential with rate lambda(x) = 1/(2*(x + 1)), i.e.
#
#    f(y) = lambda * exp(-lambda*y) * I{y > 0}
#

set.seed(10)
n <- 10000;
# x1 <- sort(2*rexp(n));
x1 <- sort(2*runif(n));
Y <- rexp(n, rate=0.5/(1+x1));
plot(x1, Y, cex=0.1)

# Try to explain the estimates of the parameters of the linear model obtained by the different methods.
print("LS:")
(fit.lm <- lm(Y~x1));
# Call:
# lm(formula = Y ~ x1)
#
# Coefficients:
# (Intercept)           x1
#       2.006        2.005

print("LAD:")
(fit.lad <- rq(Y~x1));
# Call:
# rq(formula = Y ~ x1)
#
# Coefficients:
# (Intercept)          x1
#    1.451852    1.401637

# Huber, k = 1.345
print("Huber:");
(fit.huber <- rlm(Y~x1));
# Call:
# rlm(formula = Y ~ x1)
# Converged in 7 iterations
#
# Coefficients:
# (Intercept)          x1
#    2.010275    1.351537
#
# Degrees of freedom: 10000 total; 9998 residual
# Scale estimate: 3.01

beta.iter <- coef(fit.lm);
n.iter <- 10;
X <- cbind(1, x1);
psi <- function(x, k=1.345) x*(abs(x) <= k) + k*sign(x)*(abs(x) > k)

# Manual calculation just to illustrate the iteratively reweighted least squares algorithm
for(i in 1:n.iter){
  res <- Y - X%*%beta.iter;                  # current residuals
  s <- mad(res, 0);                          # uncentred MAD as the scale estimate
  scaled.res <- res/s;
  W <- psi(scaled.res)/scaled.res;           # Huber weights
  beta.iter <- coef(lm(Y~x1, weights=W));    # weighted least squares update
  print(c(beta.iter, s))
  # The same update in matrix form (here W would have to be a diagonal weight matrix):
  # beta.iter <- solve(t(X)%*%W%*%X)%*%t(X)%*%W%*%Y;
}

# The scale estimate reported by rlm is the uncentred MAD of the residuals, i.e.
mad(fit.huber$residuals, 0)
median(abs(fit.huber$residuals))/qnorm(0.75)

# Comparison of the estimators
res <- round(rbind(coef(fit.lm), coef(fit.lad), coef(fit.huber)), 2);
rownames(res) <- c("LS", "LAD", "Huber")
print(res)

# Figure
plot(x1, Y, cex=0.1);
abline(fit.lm, col="black", lwd=2);
abline(fit.lad, col="blue", lwd=2);
abline(fit.huber, col="red", lwd=2);
legend(0, 30, col=c("black", "blue", "red"), lwd=rep(2,3), legend=c("LS", "LAD", "Huber"))


#
# C. Simple linear model with outliers in the response
#
n <- 100;
x <- rnorm(n);
# x <- sort(rnorm(n));
# u <- runif(n);
Y <- 1 + 2*x + rnorm(n);
# Y[10:20] <- 1 - 2*x[10:20]
Y[1:10] <- -Y[1:10] + 10                     # contaminate the first ten responses
plot(x, Y)

summary(fit.lm <- lm(Y~x))
summary(fit.lad <- rq(Y~x))
summary(fit.huber <- rlm(Y~x))

plot(x, Y);
abline(fit.lm, col="black", lwd=2);
abline(fit.lad, col="blue", lwd=2);
abline(fit.huber, col="red", lwd=2);
abline(a=1, b=2, lty="dotted");              # true regression line
legend("bottomright", col=c("black", "blue", "red"), lwd=rep(2,3), legend=c("LS", "LAD", "Huber"))
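# A small follow-up sketch (it assumes the objects fit.lm, fit.lad and fit.huber from the
# block just above are still in the workspace): compare the estimated slopes with the true
# slope 2 used to generate the data. The contaminated responses in Y[1:10] tend to pull
# the LS slope away from 2 more strongly than the LAD or Huber slopes.
slopes <- c(LS    = unname(coef(fit.lm)["x"]),
            LAD   = unname(coef(fit.lad)["x"]),
            Huber = unname(coef(fit.huber)["x"]));
round(slopes - 2, 3)                         # deviation of each estimated slope from the truth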