## 3. INFERRING POPULATION CHARACTERISTICS VIA SURVEY RESEARCH

## 3.3 MEASURING SUPPORT FOR BREXIT

## Set the working directory
## (Only the example path for the current operating system is applied, so
## the script can run from top to bottom without stopping on a path that
## cannot exist on this machine; edit the path to match your own folder)
if (.Platform$OS.type == "windows") {
  setwd("C:/user/Desktop/DSS") # example for Windows
} else {
  setwd("~/Desktop/DSS") # example of setwd() for Mac (and other non-Windows)
}

## Load the dataset
bes <- read.csv("BES.csv") # reads the CSV file and stores it as bes

## Understand the data
## (Read about description of variables and unit of observation)
head(bes) # shows first observations

## Identify the types of variables included
## (character vs. numeric; binary vs. non-binary)

## Identify the number of observations
dim(bes) # provides dimensions of dataframe: rows, columns

## Predicting the referendum outcome
## Frequency tables
table(bes$vote) # counts the observations in each category of vote

## Tables of proportions
## option a: store the frequency table first, then convert it
freq_table <- table(bes$vote) # saves the frequency table as an object
prop.table(freq_table) # divides each count by the total count

## option b: nest both calls in a single line
prop.table(table(bes$vote)) # same proportions, no intermediate object

## 3.4 WHO SUPPORTED BREXIT?

## Handling missing data
table(bes$education, exclude = NULL) # frequency table that also counts NAs
mean(bes$leave) # NAs are kept, so any missing value makes the result NA
mean(bes$leave, na.rm = TRUE) # mean of the non-missing values only
bes1 <- na.omit(bes) # keeps only the observations without any NA
head(bes) # first observations of the original dataframe
head(bes1) # first observations of the new, NA-free dataframe
dim(bes) # rows, columns of the original dataframe
dim(bes1) # rows, columns of the new, NA-free dataframe

## Two-way frequency tables
## (first variable defines the rows, second variable defines the columns)
table(bes1$leave, bes1$education)

## Two-way tables of proportions
## no margin: every cell is divided by the total number of observations
prop.table(table(bes1$leave, bes1$education))
## margin = 1: cells are divided by their row total (each row sums to 1)
prop.table(table(bes1$leave, bes1$education), margin = 1)
## margin = 2: cells are divided by their column total (each column sums to 1)
prop.table(table(bes1$leave, bes1$education), margin = 2)

## Histograms
hist(bes1$age) # age of all observations
hist(bes1$age[bes1$leave == 0]) # age of non-supporters only
hist(bes1$age[bes1$leave == 1]) # age of supporters only

## Density histograms
## Create histograms (frequencies on the vertical axis)
## for respondents w/ no qualifications
hist(bes1$age[bes1$education == 1])
## for respondents w/ undergraduate degree
hist(bes1$age[bes1$education == 4])
## Create density histograms (freq = FALSE puts densities on the vertical axis)
## for respondents w/ no qualifications
hist(bes1$age[bes1$education == 1], freq = FALSE)
## for respondents w/ undergraduate degree
hist(bes1$age[bes1$education == 4], freq = FALSE)

## Create density histograms of age by support for Brexit
hist(bes1$age[bes1$leave == 0], freq = FALSE) # for non-supporters
hist(bes1$age[bes1$leave == 1], freq = FALSE) # for supporters

## Descriptive statistics
## Center of the age distribution, by support for Brexit
mean(bes1$age[bes1$leave == 0]) # mean age of non-supporters
mean(bes1$age[bes1$leave == 1]) # mean age of supporters
median(bes1$age[bes1$leave == 0]) # median age of non-supporters
median(bes1$age[bes1$leave == 1]) # median age of supporters
## Spread of the age distribution, by support for Brexit
sd(bes1$age[bes1$leave == 0]) # standard deviation of age, non-supporters
sd(bes1$age[bes1$leave == 1]) # standard deviation of age, supporters
## Variance and standard deviation are linked: var = sd^2 and sd = sqrt(var)
var(bes1$age[bes1$leave == 1]) # variance of age for supporters
sd(bes1$age[bes1$leave == 1])^2 # squaring the sd reproduces the variance
sqrt(var(bes1$age[bes1$leave == 1])) # square root of variance gives the sd

## 3.5 RELATIONSHIP BETWEEN EDUCATION AND THE LEAVE VOTE IN THE ENTIRE UK

## Set the working directory
## (Only the example path for the current operating system is applied, so
## the script can run from top to bottom without stopping on a path that
## cannot exist on this machine; edit the path to match your own folder)
if (.Platform$OS.type == "windows") {
  setwd("C:/user/Desktop/DSS") # example for Windows
} else {
  setwd("~/Desktop/DSS") # example of setwd() for Mac (and other non-Windows)
}

## Load the dataset
dis <- read.csv("UK_districts.csv") # reads the CSV file and stores it as dis

## Understand the data
## (Read about description of variables and unit of observation)
head(dis) # shows first observations

## Identify the types of variables included
## (character vs. numeric; binary vs. non-binary)

## Identify the number of observations
dim(dis) # provides dimensions of dataframe: rows, columns

## Handling missing data
dis1 <- na.omit(dis) # keeps only the observations without any NA
dim(dis1) # provides dimensions (rows, columns) of new dataframe

## Scatter plot
## (three equivalent calls; each one draws the same graph)
plot(dis1$high_education, dis1$leave) # unnamed arguments: X first, then Y
plot(x = dis1$high_education, y = dis1$leave) # named arguments
plot(y = dis1$leave, x = dis1$high_education) # named arguments, any order

## Add straight dashed lines to the most recent graph
abline(v = mean(dis1$high_education), lty = "dashed") # vertical, at mean of X
abline(h = mean(dis1$leave), lty = "dashed") # horizontal, at mean of Y

## Correlation
## (the order of the two variables does not change the result)
cor(dis1$high_education, dis1$leave) # correlation between X and Y
cor(dis1$leave, dis1$high_education) # correlation between Y and X