#MULTIPLE REGRESSION IN R SCRIPT FILE.
#Associated files: Birthweight reduced.csv (the file read below), the 'Multiple regression in R' worksheet and the 'Further regression in R' worksheet.
##########################################################.

#STATSTUTOR COMMUNITY PROJECT.
#Sofia Maria Karadimitriou and Ellen Marshall, University of Sheffield.
#Reviewer: Jim Bull, University of Swansea.
#######################################################.

#Open the reduced birthweight dataset, which is saved as a csv file, and call it birthweightR.
#If your file is saved as a standard Excel file, save it as a csv file first.
#You will need to change the path below depending on where you have saved the file.
#header = TRUE tells R that the first row of the file holds the column names.
#TRUE is written in full because the abbreviation T is an ordinary variable that can be overwritten.
birthweightR <- read.csv("E:\\Birthweight reduced.csv", header = TRUE)

#Tell R we are using the birthweight dataset until further notice using attach.
#This means that 'Gestation' can be used instead of birthweightR$Gestation.
attach(birthweightR)

#R assumes all numeric values are continuous so tell it that 'smoker' is a factor
#and attach labels to the categories (for example 0 in smoker means the mother is a non-smoker).
#The factor command uses variable<-factor(variable,levels=c(category numbers),labels=c(category names)).
#The 0/1 codes are taken from birthweightR$smoker rather than from 'smoker' so that this line can
#safely be run more than once: re-factoring the already labelled copy would turn every value into NA.
smoker <- factor(birthweightR$smoker, levels = c(0, 1), labels = c('Non-smoker', 'Smoker'))

#Scatterplot matrix of the dependent variable and the continuous independents,
#with the points distinguished by smoking status.
#col chooses the colour of each point (red for the first level of smoker, blue for the second).
#pch chooses the plotting symbol of each point (1 = open circle, 4 = cross).
pairs(
  ~ Birthweight + Gestation + mheight + mppwt,
  main = 'Birth weight scatterplots',
  col = c('red', 'blue')[smoker],
  pch = c(1, 4)[smoker]
)

#Pearson's correlation coefficient gives a measure of the strength of a linear relationship.
#Calculate the correlation between each pair of variables and round the results to 2dp.
round(cor(cbind(Birthweight, Gestation, mppwt, mheight)), digits = 2)

#Fit the regression model using the lm(dependent~independents) command and give it a name (reg2).
#Birthweight is the dependent variable; Gestation, smoker and mppwt are the independents.
reg2 <- lm(formula = Birthweight ~ Gestation + smoker + mppwt)

#Request the regression output for the fitted model.
summary(reg2)

#To check the assumptions using plots, first tell R you want 2 plots next to each other.
par(mfrow = c(1, 2))
#First produce a histogram of standardised residuals to check the assumption of normality.
#rstandard() returns the standardised residuals; resid() would return the raw residuals, which are
#on the scale of Birthweight and would not match the axis label or the -2.5 to 2.5 axis range.
#If any standardised residual lies outside -2.5 to 2.5, widen xlim so that its bar is not hidden.
hist(
  rstandard(reg2),
  xlim = c(-2.5, 2.5),
  main = 'Histogram of residuals',
  xlab = 'Standardised residuals',
  ylab = 'Frequency'
)
#Check the remaining assumptions using plot(reg2, which= ... ).
#which = 1 gives the plot of residuals against fitted values, used to check homoscedasticity.
plot(reg2, which = 1)

########### Variance inflation factors ##########.
#If you wish to calculate Variance Inflation Factors you must load the library car.
library(car)
#If this command does not work, you will need to go to the Packages --> Install package(s) and select the UK (London)CRAN mirror.
#Then look for the package 'car' and click.  A lot of extra menus will download. Then try library(car) again.
#Note: This may not work if you are using Rstudio on an institutional computer without administrator rights.

#Calculate the VIF for each variable in the fitted model.
#The package is named explicitly (car::vif) because other packages, for example usdm, also define
#a function called vif; without the prefix R would use whichever of them was loaded most recently.
car::vif(reg2)

#Alternatively, try the usdm package which will also produce VIF figures.
#Load the library usdm.
library(usdm)
#Select the independent variables and format them as a data frame.
#as.numeric() stores the smoker factor as its category codes (1 and 2 for a two-level factor)
#so that every column in the data frame is numeric.
independents <- data.frame(Gestation, smoker = as.numeric(smoker), mppwt)
#Request the VIF scores for the independent variables.
#The package is named explicitly (usdm::vif) because car also defines a function called vif.
usdm::vif(independents)

#######################################################################.
###########  Further regression in R commands ######################.

#To carry out the Durbin Watson test for autocorrelation you must load the library car.
library(car)
#If this command does not work, go to Packages --> Install package(s) and select the UK (London) CRAN mirror.
#Then look for the package 'car' and click.  A lot of extra menus will download. Then try library(car) again.
#Request the Durbin Watson test (dwt is the short name car provides for durbinWatsonTest).
durbinWatsonTest(reg2)
#A p-value above 0.05 means there is no evidence of autocorrelation
#(autocorrelation is where subsequent observations are related).

#Note: Some institutional computers restrict the R packages, particularly within Rstudio, so the Durbin Watson test may not run.

#To investigate whether there are any influential observations, produce the following two charts.
#First tell R to stack two charts in one window (2 rows, 1 column).
par(mfrow = c(2, 1))

#Chart 1: Cook's distance for each individual observation.
#By default R labels the three observations with the largest Cook's distance.
#A common rule of thumb is to look more closely at observations with Cook's distance > 4/n,
#where n = number of observations.
plot(x = reg2, which = 4)

#Chart 2: standardised residuals plotted against leverage values.
#Leverage values greater than 3 times (k + 1)/n are considered large,
#where k = number of independent variables and n = number of observations.
plot(x = reg2, which = 5)

###########  Interaction terms ########.

#reg3: model with the main effects of gestation and smoker but no interaction term.
reg3 <- lm(formula = Birthweight ~ Gestation + smoker)
summary(reg3)

#reg4: Gestation*smoker includes both main effects AND their interaction,
#so the effect of gestation is allowed to differ between smokers and non-smokers.
reg4 <- lm(formula = Birthweight ~ Gestation * smoker)
summary(reg4)

#reg5: Gestation:smoker includes ONLY the interaction term,
#leaving out the separate main effects of gestation and smoker.
reg5 <- lm(formula = Birthweight ~ Gestation:smoker)
summary(reg5)



