List of commands

This list of commands is meant to help you navigate the course material.

Data

Data import

# Read a comma-separated text file; read.csv() returns the content as a data.frame
read.csv(file = "data/your_data.csv", sep = ",")

Create variables

Create a numeric variable

# Assign the value 3 to the variable a
a <- 3
a  # typing the name prints the value in the console
# Explicit printing. return() is meant for leaving a function; called at the
# console top level it fails with "no function to return from".
print(a)

Numeric, character, logical variables

# class() reports the type of an object
class(a) # "numeric" when a holds a number such as 3

b <- "hadley"
class(b) # "character"

# Note: this variable shares its name with the c() function; calls to c()
# still work because R looks up functions separately from other objects.
c <- TRUE
class(c) # "logical"

Data structure (vector, matrix)

Create vectors and matrices

# Vectors: c() combines values of the same type into one vector
num_vector <- c(1, 2, 3, 4, 5)
char_vector <- c("student_a", "student_b", "student_c")
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned
logical_vector <- c(TRUE, FALSE, TRUE, FALSE)

# matrix: 2 by 2, filled row by row, so the rows are (1, 2) and (3, 4)
matrix_1 <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2, byrow = TRUE)

# matrix by combining vectors
vec1 <- c(1, 2)
vec2 <- c(3, 4)

matrix_c <- cbind(vec1, vec2) # bind by column: vec1 and vec2 become the columns
matrix_r <- rbind(vec1, vec2) # bind by row: vec1 and vec2 become the rows

Data exploration of a `data.frame`

Create a data.frame

# A small example data set: one row per person, one column per variable.
# Each column is a vector; all columns must have the same length.
mini_data <- data.frame(
  age = c(20, 50, 32),
  sex = c("male", "female", "male"),
  # TRUE/FALSE rather than T/F, which are variables that can be reassigned
  has_covid = c(TRUE, TRUE, FALSE)
)

Get the column (feature) names, dimensions, and the number of rows (observations) and columns

colnames(mini_data) # column (feature) names
dim(mini_data) # number of rows, then number of columns
nrow(mini_data) # number of rows (observations)
ncol(mini_data) # number of columns

Select a variable (age) from the data

mini_data$age     # by name with $: gives the column as a vector
mini_data["age"]  # by name with single brackets: gives a one-column data.frame
mini_data[, 1]    # by position: first column, which is 'age'

Filter a variable based on another (for example, age for females (sex == 'female'))

# keep the ages of the rows where sex equals "female"
mini_data$age[mini_data$sex == "female"]

# the same result step by step: extract both columns first, then filter
age <- mini_data$age
sex <- mini_data$sex
age[sex == "female"]

Descriptive statistics

Continuous variables

# x is a continuous (numeric) variable
summary(x)                 # minimum, quartiles, mean and maximum in one call
min(x)
max(x)
mean(x)
median(x)
quantile(x, probs = 0.95)  # 95th percentile
IQR(x)                     # interquartile range

Categorical variables: count and percentage

# categorical variable z
# number of subjects per category in z
table(z)
# proportion per category (multiply by 100 for a percentage)
# note: length(z) counts missing values, while table(z) leaves them out by default
table(z) / length(z)

Visualisation

Let x and y be two continuous variables, and z a categorical variable. To create a histogram, a boxplot or a scatter plot, use the following commands:

hist(x) # histogram of x
boxplot(x) # boxplot of x
# boxplot of x for each category of z; x and z are columns of your_data
boxplot(x ~ z, data = your_data)
plot(x, y) # scatter plot with x on the horizontal and y on the vertical axis

Hypothesis tests

t-test

# one sample (by default tests against mu = 0, with conf.level = 0.95)
t.test(x)

# one sample, tested against a mean of your choice
t.test(x, mu = your_value, conf.level = 0.95)

# paired samples (x1 and x2 must have the same length)
t.test(x1, x2, paired = TRUE, conf.level = 0.95)
t.test(x1 - x2, conf.level = 0.95) # equivalent: one-sample test on the differences

# two independent samples (Welch test, since var.equal = FALSE is the default)
t.test(x, y, conf.level = 0.95)

# check the normality assumption with a normal Q-Q plot
qqnorm(x)
qqline(x) # adds the reference line to the Q-Q plot

z-test, chi-square tests and table analysis

# Proportion test: are 123 successes out of 1000 trials consistent with p = 0.15?
prop.test(123, n = 1000, p = 0.15)  # large-sample (chi-squared) approximation
binom.test(123, n = 1000, p = 0.15) # exact binomial test

# Build a binary variable by comparing continuous values with a threshold:
# values above the threshold become "yes", all others become "no"
high_value <- ifelse(your_values > threshold, yes = "yes", no = "no")

# Frequency of each category (x must be categorical)
table(x)

# Two-way cross tabulation (x and y both categorical)
table(x, y)

# Chi-squared test on tb, a 2 by 2 table (matrix) of counts
chisq.test(tb)

Non-parametric methods

# confidence interval for the median (requires the DescTools package)
DescTools::MedianCI(x, conf.level = 0.95)

# paired samples: Wilcoxon signed rank test
wilcox.test(x1, x2, paired = TRUE)

# two independent samples: Wilcoxon rank sum test
wilcox.test(x, y, paired = FALSE)

Regression analysis

Linear regression

# One predictor: y is the dependent variable, x the independent variable;
# both are columns of your_data
linear_model <- lm(formula = y ~ x, data = your_data)

# Several predictors: x and z are both independent variables
linear_model <- lm(formula = y ~ x + z, data = your_data)

# Coefficient table and fit statistics of the last model fitted
summary(linear_model)

# Diagnostic plots for the fitted model
plot(linear_model)

Logistic regression

# One predictor: y is the dependent variable, x the independent variable
# y has to be coded as 0/1 or be a factor
logit_model <- glm(formula = y ~ x, family = "binomial", data = your_data)

# Models with several predictors, summary() and the diagnostic plots
# work the same way as for the linear model

Survival analysis

# Requires the survival package; if it is missing, install it once with
# install.packages("survival")
library(survival)

# Kaplan-Meier fit for the whole sample (lifetime = follow-up time,
# death = event indicator), then plot the survival curve
km_fit <- survfit(formula = Surv(lifetime, death) ~ 1)
plot(km_fit)

# Survival probabilities at chosen time points
tme <- c(1, 2, 5)
summary(km_fit, times = tme)

# Compare the groups defined by gender: one curve per group,
# followed by the log-rank test
km_fit_gender <- survfit(formula = Surv(lifetime, death) ~ gender)
plot(km_fit_gender, col = c("blue", "red"))
survdiff(formula = Surv(lifetime, death) ~ gender)