###########################
#
# Introduction to R software
# 25th May, 2015
#
###########################

##### Aim of this short introduction is to
##### get you familiar with R-language syntax
##### and to teach you some basic statistics
##### using R-software

##### As you can see "#" can be used to comment
##### You can run a single line using
##### ctrl+R (Windows) and cmd+return (Mac)
##### Or you can highlight sections of code and
##### then use the key combination

##### NOTE: R is a case-sensitive language!

############# Mathematical Operations

##### Let's start with simple calculations
1 + 1
3 * 5

##### Vector is a chain of values, in R usually
##### created with c()-function
?c
c(1, 2, 3, 4, 5)
1:5
seq(1, 5)
?seq
##### All these result in same vector

##### Arithmetic operations can be applied to
##### vectors as well
c(1, 2, 3, 4) + c(5, 6, 7, 8)
1:4 + 5:8
seq(1, 4) + seq(5, 8)
##### In which case it applies the operation
##### cell-wise, here 1+5, 2+6, etc.

##### All arithmetic operations in R are vectorized:
1:5 * 2
c(1, 2, 3, 4, 5) * c(2, 2, 2, 2, 2)

##### R also contains a wide selection of
##### mathematical functions
sum(1:5)
exp(5)
log(20) #natural logarithm

##### Boolean values:
##### TRUE and FALSE or just T and F
##### (T and F are ordinary variables that can be
##### reassigned, so prefer TRUE and FALSE in scripts)
##### "&" is and
##### "|" is or
##### "!" is not
1 < 5
1:5 < 5:1
1 == 2
1 != 2
T
F

##### Assigning values to objects: "<-" or "="
x <- 2
y = 1
x
print(y)
x + y

##### Mind the whitespace around "<-":
x <- 3 #assigns 3 to x
x < -3 #compares x with -3
x<-3 #assigns 3 to x

##### Variable names can contain letters, numbers,
##### dots, and underscores, but they can't start
##### with a number, or a dot followed by a number
##### (since that looks too much like a number).
##### Reserved words like "if" and "for" are not allowed.

##### Special values: Inf, -Inf, NaN, and NA
##### Inf and -Inf are positive and negative infinity,
##### NaN is short for "not-a-number," and means that
##### our calculation either didn't make mathematical
##### sense or could not be performed properly.
##### NA is short for "not available" and represents a missing
##### value -- a problem all too common in data analysis.
##### Missing value NA
is.na(c(1, 2, NA, 4, 5))
!is.na(c(1, 2, NA, 4, 5))

#### Missing values in arithmetic operations:
mean(c(1, 2, NA, 4, 5))
?mean
mean(c(1, 2, NA, 4, 5), na.rm = TRUE)

############## Classes
class(sqrt(1:10))
class(3 + 1i) #"i" creates imaginary components of complex numbers
class(1) #although this is a whole number, it has class numeric
class(1L) #add a suffix of "L" to make the number an integer
class(0.5:4.5) #the colon operator returns a value that is numeric...
class(1:5) #unless all its values are whole numbers
class(c("she", "sells", "seashells", "on", "the", "sea", "shore"))

gender <- factor(c("male", "female", "female", "male", "female"))
class(gender)
gender
##### Factors are stored as integer codes pointing to their levels
as.integer(gender)
as.numeric(gender)
##### A factor is not numeric: mean() returns NA with a warning
mean(gender)

############# Workspace
ls()

############# Indexing vectors
x <- c(1:5)^2
x
x[c(1, 3, 5)]
x[c(-2, -4)]
x[-c(2, 4)]
names(x) <- c("one", "four", "nine", "sixteen", "twenty five")
x[c("one", "nine", "twenty five")]

############# Matrices
?matrix
a_matrix <- matrix(
  1:12,
  nrow = 4, #ncol = 3 works the same
  dimnames = list(
    c("one", "two", "three", "four"),
    c("ein", "zwei", "drei")
  )
)
a_matrix
dim(a_matrix)
rownames(a_matrix)
colnames(a_matrix)
dimnames(a_matrix)

##### Indexing matrices
a_matrix[1, ]
a_matrix[, 3]
a_matrix[1, 3]

##### arithmetic operations work on matrices
##### with numeric values as well
a_matrix / 2

############# Data-frames
?data.frame
a_data_frame <- data.frame(
  x = letters[1:5],
  y = rnorm(5),
  z = runif(5) > 0.5
)
a_data_frame
str(a_data_frame)
##### rnorm() takes random numbers from Normal(0,1) distribution
##### runif takes random numbers from Uniform(0,1) distribution

##### indexing data.frame is similar as with matrices
a_data_frame[2:3, -3]
a_data_frame[, c("x", "y")]

##### Data-frame manipulation
t(a_data_frame)
another_data_frame <- data.frame(
  z = rlnorm(5),
  y = sample(5),
  x = letters[3:7]
)
a_data_frame
another_data_frame
rbind(a_data_frame, another_data_frame)
cbind(a_data_frame, another_data_frame)

##### Merging data-frames allows
##### for matching data based on a specific column, here x
merge(a_data_frame, another_data_frame, by = "x")
##### This excludes non-matching rows
##### To include all data:
merge(a_data_frame, another_data_frame, by = "x", all = TRUE)

############## Reading text files into R
test_data <- read.table("diabetes.txt", header = TRUE, sep = "\t", dec = ".")
##### File has header line, column separator is tabulator
##### and decimal is dot

##### description of the example dataset:
##### http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/Cdiabetes.html

##### Always start working with your data by exploring
summary(test_data)
str(test_data)
names(test_data)
dim(test_data)

##### Exploring binary variables
table(test_data$gender)
table(test_data$diabetes)

##### Exploring continuous variables and
##### variable normalization
plot(test_data$chol)
?qqnorm
qqnorm(test_data$chol)
qqnorm(log(test_data$chol))
plot(log(test_data$chol))
##### identify() is interactive: click points in the plot
##### to get their row indices
identify(log(test_data$chol))
test_data[63, ]
plot(density(log(test_data$chol[-c(4, 63, 295)]), na.rm = TRUE))
?shapiro.test
shapiro.test(test_data$chol)
shapiro.test(log(test_data$chol))
shapiro.test(log(test_data$chol[-c(4, 63, 295)]))

# Exercise: Explore and normalize the variable "hdl"

##### Exploring correlations between variables

# Binary
table(test_data[, c("gender", "diabetes")])
?chisq.test
chisq.test(test_data$diabetes, test_data$gender)

# Exercise: Is there significant difference of prevalence
# of diabetes between "locations" in this dataset?

# Continuous
plot(test_data[, c("chol", "hdl")])
cor(test_data[, c("chol", "hdl")])
?cor
cor(test_data[, c("chol", "hdl")], use = "pairwise.complete.obs")
##### cor.test() has no "use" argument; it drops incomplete
##### pairs of observations by itself
cor.test(test_data$chol, test_data$hdl)

# Exercise: Is there significant correlation between "hip" and "waist"

# Both
boxplot(test_data$hip ~ test_data$gender)
t.test(test_data$hip ~ test_data$gender)

# Exercise: Is there difference of the
# waist circumference between diabetic and non-diabetic?