#CORRELATION IN R SCRIPT FILE.
#Associated files: Birthweight.csv and Correlation in R worksheet.
##########################################################.

#STATSTUTOR COMMUNITY PROJECT.
#Sofia Maria Karadimitriou and Ellen Marshall, University of Sheffield.
#Reviewer: Jim Bull, University of Swansea.
#######################################################.

#Open the birthweight reduced dataset which is saved as a csv file and call it birthweightR.  
#If your file is saved as a standard Excel file, save it as a csv file first.  
#You will need to change the command depending on where you have saved the file.
#header = TRUE is written in full: the shorthand T is an ordinary variable that can be overwritten.
birthweightR <- read.csv("E:\\Birthweight reduced.csv", header = TRUE, sep = ",")

#Tell R we are using the birthweight dataset until further notice using attach.
#This means that 'Gestation' can be used instead of birthweightR$Gestation.
#Any copy attached by an earlier run of this script is detached first, so that
#re-running the script does not stack duplicate copies on the search path.
while ("birthweightR" %in% search()) {
  detach("birthweightR", character.only = TRUE)
}
attach(birthweightR)

#Plotting the relationship between two continuous variables.

#main='' gives the title.
#xlab='' controls the x axis label.
#ylab='' controls the y axis label.
#pch changes the shape of the scatter, pch=4 gives crosses.
#cex changes the size of the scatter.
#lwd changes the width of the outline.

#Scatterplot with gestational age on the x axis and birthweight on the y axis.
plot(
  x = Gestation,
  y = Birthweight,
  main = "Scatterplot of gestational age and birthweight",
  xlab = "Gestation (weeks)",
  ylab = "Birthweight(lbs)",
  lwd = 2
)
#Adding a regression line to the plot: lm() fits birthweight on gestation
#and abline() draws the fitted line in red.
abline(reg = lm(Birthweight ~ Gestation), col = "red", lwd = 2)
#The closer the scatter is to the line, the stronger the relationship is.

#Calculating Pearson's correlation coefficient gives a measure of the strength of a relationship.
#Calculating the correlation coefficient between gestational age and birthweight using the command cor().
#Pearson is the default method of cor(); it is spelled out here for clarity.
cor(x = Birthweight, y = Gestation, method = "pearson")

#cor.test() tests the hypothesis that the correlation coefficient is 0.
cor.test(x = Birthweight, y = Gestation, method = "pearson")
#Check that the variables are normally distributed.  
#Plot histograms of the babies' birthweight and gestational age next to each other.
#Arrange the plotting window as 1 row by 2 columns so the histograms sit side by side.
#par() returns the previous settings, which are saved so the layout can be restored.
old_par <- par(mfrow = c(1, 2))
hist(Birthweight, main = "Histogram for birthweight", xlab = "Birthweight")
hist(Gestation, main = "Histogram for Gestational age", xlab = "Gestational age at birth")
#Restore the earlier layout so that later plots are not drawn in a two-panel window.
par(old_par)

#The correlations between several variables can be displayed in a table using cbind().
#The round command rounds numbers to a specified number of decimals, for example rounded to 2dp.
#cbind() places the four variables in the columns of a matrix, cor() returns
#their pairwise correlation matrix and round() trims it to 2 decimal places.
round(
  cor(x = cbind(Birthweight, Gestation, mppwt, mheight)),
  digits = 2
)

#If the data is ordinal or the assumptions of Pearson's correlation have not been met,
#either Kendall's tau or Spearman's correlation can be used.
#Calculating Spearman's correlation coefficient between gestational age and birthweight using the command cor().
cor(x = Birthweight, y = Gestation, method = "spearman")
#Kendall's tau (the degree to which the relationship is always positive or negative),
#obtained from the same cor() command with method = "kendall".
cor(x = Birthweight, y = Gestation, method = "kendall")

