### ### MODERN STATISTICAL METHODS (NMST434) ### Exercise class 7 ### rm(list=ls()); # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 1. Identification of parameters by LS and quantile regression # # # # # # # # # # # # # # # # # # # # # # # # # # # # # library(quantreg); # Package for quantile regression # install.packages("quantreg") n <- 1000; # sample size x0 <- rep(1, n); # the first column in regression matrix for intercept x1 <- runif(n); # regressor X <- cbind(x0, x1); # regression matrix beta <- c(0,1); # True value of the parameter beta # # Model A - homoscedastic normal errors eps <- rnorm(n); # N(0,1) Y <- X%*%beta + eps; # Find the limiting values of the estimates (what is being estimated) plot(x1, Y, cex=0.2, main="Homoscedastic normal errors") (fit.lm <- lm(Y~x1)); # (fit.lad <- rq(Y~x1)); tau.cons <- c(0.1, 0.25, 0.5, 0.75, 0.9); # considered quantiles n.tau <- length(tau.cons); fit.rq <- rq(Y~x1,tau=tau.cons); fit.rq; for(i in 1:length(tau.cons)){ abline(a=coef(fit.rq)[1,i],b=coef(fit.rq)[2,i], col=i, lwd=2) # fitted conditional quantile abline(a = beta[1] + qnorm(tau.cons[i]), b = beta[2], col=i,lwd=2, lty="dotted") # true conditional quantile } abline(fit.lm$coef, col="blue", lwd=3, lty="dashed"); legend("topleft", lty = c("dashed", rep("solid", n.tau), "dotted"), col=c("blue", 1:n.tau, "black"), lwd=c(3,rep(2, n.tau),2), legend=c("LS", paste("Estim. regress. quantiles, tau=", tau.cons), "True cond. quantiles")) # # Signs of the residuals apply(sign(residuals(fit.rq)), 2, table) apply(sign(round(residuals(fit.rq),12)), 2, table) # Approximately tau * 100% of the data points lie below the estimated tau-conditional # quantile function. # # Model B - homoscedastic errors that are exponentially distributed eps <- rexp(n); Y <- X%*%beta + eps; plot(x1, Y, cex=0.2, main = "Homosced. errors that are exponetially distributed") (fit.lm <- lm(Y~x1)); # (fit.lad <- rq(Y~x1)); tau.cons <- c(0.1, 0.25, 0.5, 0.75, 0.9); fit.rq <- rq(Y~x1,tau=tau.cons); fit.rq; for(i in 1:length(tau.cons)){ abline(a=coef(fit.rq)[1,i],b=coef(fit.rq)[2,i], col=i, lwd=2) # fitted conditional quantile abline(a = beta[1] + qexp(tau.cons[i]), b = beta[2], col=i,lwd=2, lty="dotted") # true conditional quantile } abline(fit.lm$coef, col="blue", lwd=3, lty="dashed"); abline(beta[1] + 1, beta[2], col="black",lwd=2, lty="dashed") # true conditional expectation legend("topleft", lty = c("dashed", "dashed", rep("solid", n.tau), "dotted"), col=c("blue", "black", 1:n.tau, "black"), lwd=c(3,2,rep(2, n.tau),2), legend=c("LS", "True condit. expectation", paste("Estim. regress. quantiles, tau=", tau.cons), "True cond. quantiles")) # # Model C - heteroscedastic normal errors sigma <- function(x1) exp(x1) eps <- rnorm(n)*sigma(x1); Y <- X%*%beta + eps; plot(x1, Y, cex=0.2, main = "Heterosced. normal errors") # Find the limiting values of the estimates (what is being estimated) (fit.lm <- lm(Y~x1)); # (fit.lad <- rq(Y~x1)); tau.cons <- c(0.1, 0.25, 0.5, 0.75, 0.9); fit.rq <- rq(Y~x1,tau=tau.cons); fit.rq; xgrid <- seq(min(x1), max(x1), length=10000) for(i in 1:length(tau.cons)){ abline(a=coef(fit.rq)[1,i],b=coef(fit.rq)[2,i], col=i, lwd=2) # fitted conditional quantile lines(xgrid, beta[1] + beta[2]*xgrid + qnorm(tau.cons[i])*exp(xgrid), col=i,lwd=2, lty="dotted") # true conditional quantile } abline(fit.lm$coef, col="blue", lwd=3, lty="dashed"); legend("topleft", lty = c("dashed", rep("solid", n.tau), "dotted"), col=c("blue", 1:n.tau, "black"), lwd=c(3,rep(2, n.tau),2), legend=c("LS", paste("Estim. regress. quantiles, tau=", tau.cons), "True cond. quantiles")) # # Model D - heteroscedastic errors that are exponentially distributed sigma <- function(x1) exp(x1) eps <- rexp(n)*sigma(x1); Y <- X%*%beta + eps; # Find the limiting values of the estimates (what is being estimated) plot(x1, Y, cex=0.2, main = "Heterosced. errors that are exp. distributed") (fit.lm <- lm(Y~x1)); tau.cons <- c(0.1, 0.25, 0.5, 0.75, 0.9); fit.rq <- rq(Y~x1,tau=tau.cons); fit.rq; xgrid <- seq(min(x1), max(x1), length=10000) for(i in 1:length(tau.cons)){ abline(a=coef(fit.rq)[1,i],b=coef(fit.rq)[2,i], col=i, lwd=2) # fitted conditional quantile lines(xgrid, beta[1] + beta[2]*xgrid + qexp(tau.cons[i])*exp(xgrid), col=i,lwd=2, lty="dotted") # true conditional quantile } abline(fit.lm$coef, col="blue", lwd=3, lty="dashed"); lines(xgrid, beta[1] + beta[2]*xgrid + exp(xgrid), col="black",lwd=2, lty="dashed") # true conditional expectation legend("topleft", lty = c("dashed", "dashed", rep("solid", n.tau), "dotted"), col=c("blue", "black", 1:n.tau, "black"), lwd=c(3,2,rep(2, n.tau),2), legend=c("LS", "True condit. expectation", paste("Estim. regress. quantiles, tau=", tau.cons), "True cond. quantiles")) # Compare the LS fit fit.lm # wit the imiting values of the least squares estimator. EXX <- matrix(c(1, 1/2,1/2, 1/3), 2, 2); EXY <- c(beta[1] + beta[2]*1/2 + exp(1)-1, beta[1]*1/2+beta[2]*1/3 + 1); solve(EXX, EXY) # For U uniformly distributed on [0,1] # E U = 1/2, E U^2 = 1/3, E e^U = exp(1) - 1, E U e^{U} = 1 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 2. Example: Dataset Hosi0 -- analysing birthweight # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # library(mffSM); data(Hosi0); head(Hosi0) (n <- nrow(Hosi0)); tau.cons <- seq(0.1,0.9, by=0.1); # Considered quantiles # # A. Birth weight vs "age of the mother > 25" # Interpret the estimated coefficients (fit <- rq(bweight~I(mage >= 30), data=Hosi0, tau=tau.cons)); # A closer inspection shows the "warning message" is for tau = 0.5 rq(bweight~I(mage >= 30), data=Hosi0, tau=0.5); (temp <- table(I(Hosi0$mage >= 30))) temp[1]*tau.cons; temp[2]*tau.cons; plot(fit); # (Intercept): an estimate of the quantile function of bweight|mage<=25 # I(mage>30)TRUE: correction from the intercept to get the quantile function # of bweight|mage>30 plot(summary(fit)); # with pointwise confidence intervals # Test of equality of the slopes anova(fit) # help(anova.rq) # anova tests for regression quantiles: # when applied to a single model with different tau's (as above): # test whether for all tau's the slope parameters are equal # This is in our situation (two-sample problem) equivalent with # a test whether the two distributions bweight|mage<=25 and # bweight|mage>25 are just shifted by a constant or not # # B. Birth weight vs age of the father (fit <- rq(bweight~I(fage-mean(fage)), data=Hosi0, tau=tau.cons)); summary(fit) # summary(fit, se="boot") plot(summary(fit)); anova(fit) # help(anova.rq) # anova tests for regression quantiles: # when applied to a single model with different tau's (as above): # test whether for all tau's the slope parameters are equal # plot n <- nrow(Hosi0); # sample size plot(Hosi0$fage, Hosi0$bweight, cex=0.1); table(Hosi0$fage); table(Hosi0$bweight) # jittered values for a nicer plot plot(Hosi0$fage+(runif(n)-0.5), Hosi0$bweight+10*(runif(n)-0.5), cex=0.1, xlab="Age of father", ylab="Birth weight") # estimated conditinal quantiles for(i in 1:length(tau.cons)){ abline(fit$coeff[,i], col=i,lwd=2); } # # Attempt to find out how the estimates of the standard errors are computed (fit.lad <- rq(bweight~I(fage-mean(fage)), data=Hosi0)) (sfit.lad <- summary(fit.lad, se="iid")) # Consider tau = 0.5 tau0 = 0.5 res <- residuals(fit.lad) # hn <- n^(-1/5)*(4.5*dnorm(0)^4)^(1/5) # Hall-Sheather's bandwidth (hn <- n^(-1/3)*qnorm(0.975)^(2/3)*(1.5*dnorm(0)^2)^(1/3)); bandwidth.rq(0.5, n, hs = T) # sparsity estimator as in the course notes sparsity.cn <- (quantile(res, tau0+hn)-quantile(res, tau0-hn))/(2*hn) sparsity.cn # sparsity estimator as in "summary.rq" p <- 2; (n.h <- max(p + 1, ceiling(n * bandwidth.rq(tau0, n, hs = T)))); pz <- sum(abs(res) < 1e-12) ir <- (pz + 1):(n.h + pz + 1) ord.resid <- sort(res[order(abs(res))][ir]) xt <- ir/(n - p) (sparsity.rq <- rq(ord.resid ~ xt)$coef[2]) plot(sort(res)) iii <- floor(n*(tau0-hn/2)):ceiling(n*(tau0+hn/2)) plot(iii/n, sort(res)[iii], xlab="tau", ylab="quantile function of residuals") abline(h=0, lty="dotted") fit0 <- rq(sort(res)[iii]~I(iii/n)) abline(coef(fit0), col="blue") coef(fit0) sparsity.cn/sparsity.rq # Calculation as in the course notes X <- cbind(1, Hosi0$fage - mean(Hosi0$fage)); (SE <- sqrt(tau0*(1-tau0))*sparsity.cn*sqrt(diag(solve(t(X)%*%X)))) # To be compared with (sfit.lad[[3]]) # Ratio of our implemnation and implementation in summary.rq SE/(sfit.lad[[3]])[,2] # Which corresponds to the ratio of sparsity estimates sparsity.cn/sparsity.rq summary(fit.lad, se="iid") summary(fit.lad, se="boot"); # boostrap method # # C. Birth weight vs age of mother and age of father summary(lm(bweight ~ I(mage-mean(mage))+ I(fage-mean(fage)),data=Hosi0)); fit <- rq(bweight ~ I(mage-mean(mage))+ I(fage-mean(fage)), data=Hosi0, tau=tau.cons) summary(fit); plot(summary(fit)); anova(fit) # above we have again just a test of equality of slopes # (both of them simultaneously) ## anova test of a sub-model ## apply anova.rq() only for a sequence of nested models with single tau itau <- 2 (tau0 = tau.cons[itau]) anova(rq(bweight ~ +I(fage-mean(fage)), data=Hosi0, tau=tau0), rq(bweight ~ I(mage-mean(mage))+I(fage-mean(fage)),data=Hosi0, tau=tau0)) ## compare with summary(fit)[itau]