###########################
#
#	Introduction to R software
#	25th May, 2015


#####	The aim of this short introduction is to 
#####	get you familiar with R-language syntax
#####	and to teach you some basic statistics
#####	using R-software	

#####	As you can see "#" can be used to comment

#####	You can run a single line using 
#####	ctrl+R (Windows) and cmd+return (Mac)

#####	Or you can highlight sections of code and 
#####	then use the key combination

#####	OBS: R is a case-sensitive language!


#############	Mathematical Operations

#####	R works as a calculator: type an expression
#####	and its value is printed
1 + 1
3 * 5

#####	A vector is an ordered chain of values; in R it is
#####	usually built with the c() function

?c()

c(1, 2, 3, 4, 5)
1:5
seq(1, 5)

?seq()

#####	The three expressions above give the same values

#####	Arithmetic operators accept vectors
#####	as operands too

c(1, 2, 3, 4) + c(5, 6, 7, 8)
1:4 + 5:8
seq(1, 4) + seq(5, 8)

#####	The operation is then carried out
#####	element by element, here 1+5, 2+6, etc.

#####	Every arithmetic operation in R is vectorized:

1:5 * 2
c(1, 2, 3, 4, 5) * c(2, 2, 2, 2, 2)

#####	R also ships with a wide selection of
#####	mathematical functions

sum(1:5)
exp(5)
log(20) # natural logarithm

#####	Logical (Boolean) values:
#####	TRUE and FALSE, abbreviated T and F
#####	"&" means and
#####	"|" means or
#####	"!" means not

1 < 5
1:5 < 5:1
1 == 2
1 != 2

T
F

#####	Storing values in objects: use "<-" or "="

x <- 2
y = 1
x
print(y)
x + y

#####	Whitespace matters around "<-": the first and third
#####	lines below assign 3 to x, while the second one
#####	compares x with -3 (and prints FALSE)
x <- 3
x < -3
x<-3

#####	Variable names can contain letters, numbers, 
#####	dots, and underscores, but they can't start
#####	with a number, or a dot followed by a number 
#####	(since that looks too much like a number).
#####	Reserved words like "if" and "for" are not allowed.

#####	Special values: Inf, -Inf, NaN, and NA
#####	Inf and -Inf are positive and negative infinity.

#####	NaN is short for “not-a-number,” and means that 
#####	our calculation either didn’t make mathematical 
#####	sense or could not be performed properly. 
#####	NA is short for “not available” and represents a missing 
#####	value—a problem all too common in data analysis.

#####	Missing value NA

#####	is.na() flags the missing elements; "!" inverts the flags
is.na(c(1, 2, NA, 4, 5))
!is.na(c(1, 2, NA, 4, 5))

####	Missing values in arithmetic operations:

#####	A single NA makes the mean NA ...
mean(c(1, 2, NA, 4, 5))
?mean()
#####	... unless missing values are dropped with na.rm = TRUE.
#####	TRUE is spelled out here because T is an ordinary
#####	variable that can be overwritten, whereas TRUE cannot.
mean(c(1, 2, NA, 4, 5), na.rm = TRUE)

##############	Classes

#####	class() reports the class of an object

class(sqrt(1:10))
class(3 + 1i) # "i" creates imaginary components of complex numbers

class(1) # although this is a whole number, it has class numeric

class(1L) # add a suffix of "L" to make the number an integer
class(0.5:4.5) # the colon operator returns a value that is numeric...
class(1:5) # unless all its values are whole numbers

class(c("she", "sells", "seashells", "on", "the", "sea", "shore"))

#####	A factor stores categorical data: integer codes
#####	together with a label (level) for each code
gender <- factor(c("male", "female", "female", "male", "female"))
class(gender)
gender
#####	Both conversions return the underlying integer codes
as.integer(gender)
as.numeric(gender)

#####	A factor is not numeric, so mean() gives NA with a warning
mean(gender)

#############	Workspace

#####	List the objects currently defined in the workspace
ls()

#############	Indexing vectors

x <- (1:5)^2
x
#####	Positive indices pick elements by position
x[c(1, 3, 5)]
#####	Negative indices leave elements out;
#####	the two forms below are equivalent
x[c(-2, -4)]
x[-c(2, 4)]

#####	Once the elements are named they can be picked by name
names(x) <- c("one", "four", "nine", "sixteen", "twenty five")
x[c("one", "nine", "twenty five")]


#############	Matrices 

?matrix()
#####	Fill a 4 x 3 matrix column by column with 1..12 and
#####	give its rows and columns names
a_matrix <- matrix(
  1:12,
  nrow = 4, # ncol = 3 works the same
  dimnames = list(
    c("one", "two", "three", "four"),
    c("ein", "zwei", "drei")
  )
)
a_matrix

dim(a_matrix)
rownames(a_matrix)
colnames(a_matrix)
dimnames(a_matrix)

#####	Indexing matrices: [row, column]; an empty
#####	position selects everything along that dimension

a_matrix[1, ]
a_matrix[, 3]
a_matrix[1, 3]

#####	arithmetic operations work on matrices 
#####	with numeric values as well

a_matrix / 2

#############	Data-frames

?data.frame()
a_data_frame <- data.frame(
  x = letters[1:5],
  y = rnorm(5),
  z = runif(5) > 0.5
)
a_data_frame
str(a_data_frame)

#####	rnorm() takes random numbers from Normal(0,1) distribution
#####	runif() takes random numbers from Uniform(0,1) distribution

#####	indexing a data.frame is similar to indexing matrices

a_data_frame[2:3, -3]
a_data_frame[, c("x", "y")]

#####	Data-frame manipulation

t(a_data_frame)

#####	A second data frame with the same column names
#####	as a_data_frame, given in a different order
another_data_frame <- data.frame(
  z = rlnorm(5),
  y = sample(5),
  x = letters[3:7]
)
a_data_frame
another_data_frame
#####	rbind() stacks the rows, cbind() puts the columns side by side
rbind(a_data_frame, another_data_frame)
cbind(a_data_frame, another_data_frame)

#####	Merging data-frames allows for matching
#####	data based on a specific column, here x

merge(a_data_frame, another_data_frame, by = "x")

#####	This excludes non-matching rows
#####	To include all data:

merge(a_data_frame, another_data_frame, by = "x", all = TRUE)


##############	Reading text files into R

#####	The file has a header line, the column separator is
#####	a tabulator and the decimal mark is a dot
#####	description of the example dataset:
#####	http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/Cdiabetes.html
#####	The file name is relative, so "diabetes.txt" is looked up
#####	in the current working directory
test_data <- read.table("diabetes.txt", header = TRUE, sep = "\t", dec = ".")

#####	Always start working with your data by exploring

summary(test_data)
str(test_data)
names(test_data)
dim(test_data)

#####	Exploring binary variables

table(test_data$gender)
table(test_data$diabetes)

#####	Exploring continuous variables and 
#####	variable normalization

plot(test_data$chol)
?qqnorm
qqnorm(test_data$chol)
qqnorm(log(test_data$chol))
plot(log(test_data$chol))
#####	Interactive step: click on points in the plot to get
#####	their index (needs an interactive graphics device)
identify(log(test_data$chol))
test_data[63, ]
#####	Density of log(chol) with observations 4, 63 and 295
#####	left out (presumably the points picked with identify()
#####	above - confirm against your own plot)
plot(density(log(test_data$chol[-c(4, 63, 295)]), na.rm = TRUE))

?shapiro.test()
shapiro.test(test_data$chol)
shapiro.test(log(test_data$chol))
shapiro.test(log(test_data$chol[-c(4, 63, 295)]))


#	Exercise: Explore and normalize the variable "hdl"


#####	Exploring correlations between variables

#	Binary
table(test_data[, c("gender", "diabetes")])
?chisq.test()
chisq.test(test_data$diabetes, test_data$gender)

#	Exercise: Is there significant difference of prevalence
#	of diabetes between "locations" in this dataset?


#	Continuous
plot(test_data[, c("chol", "hdl")])
#	Without "use", cor() gives NA when the columns hold missing values
cor(test_data[, c("chol", "hdl")])
?cor()
cor(test_data[, c("chol", "hdl")], use = "pairwise.complete.obs")
#	cor.test() has no "use" argument: it always works on the
#	complete pairs of observations, so none is passed here
cor.test(test_data$chol, test_data$hdl)

#	Exercise: Is there significant correlation between "hip" and "waist"


#	Both
#	A formula "y ~ group" splits the continuous variable by group
boxplot(test_data$hip ~ test_data$gender)
t.test(test_data$hip ~ test_data$gender)

#	Exercise: Is there a difference in waist circumference
#	between diabetic and non-diabetic subjects?