# Read a comma-separated file into a data.frame and keep the result;
# the regression examples further down refer to it as `your_data`.
your_data <- read.csv('data/your_data.csv', sep = ',')
# List of commands ----
# To help you navigate the course material
# Data ----
# Data import ----
# Create variables ----
# Create a numeric variable
a <- 3
a # typing the name auto-prints the value in the console
print(a) # explicit print; return() is only valid inside a function body
# Numeric, character, logical variables ----
class(a) # "numeric"
b <- 'hadley'
class(b) # "character"
# this variable shares its name with base c(); calls such as c(1, 2) still
# work because R skips non-function objects when looking up a function
c <- TRUE
class(c) # "logical"
# Data structure (vector, matrix) ----
# Create vectors and matrices
num_vector <- c(1, 2, 3, 4, 5)
char_vector <- c('student_a', 'student_b', 'student_c')
# spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned
logical_vector <- c(TRUE, FALSE, TRUE, FALSE)
# matrix filled row by row: first row is 1 2, second row is 3 4
matrix_1 <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2, byrow = TRUE)
# matrix by combining vectors
vec1 <- c(1, 2)
vec2 <- c(3, 4)
matrix_c <- cbind(vec1, vec2) # bind by column: vec1 and vec2 become columns
matrix_r <- rbind(vec1, vec2) # bind by row: vec1 and vec2 become rows
# Data exploration of a data.frame ----
# Create a data.frame
# each argument becomes one column; every column has one value per row
mini_data <- data.frame(
  age = c(20, 50, 32),
  sex = c('male', 'female', 'male'),
  has_covid = c(TRUE, TRUE, FALSE)
)
# Get the column (feature) names, dimension, number of rows (observation)
# and columns
colnames(mini_data) # column (feature) names
dim(mini_data) # number of rows, then number of columns
nrow(mini_data) # number of rows (observations)
ncol(mini_data) # number of columns
# Select a variable (age) from the data ----
mini_data$age # by name; returns the column as a vector
mini_data['age'] # by name; returns a one-column data.frame
mini_data[, 1] # first column, which is 'age'; returns a vector
# Filter a variable based on another
# (for example, age for females (sex == 'female')) ----
# logical index: keep the ages on the rows where sex equals 'female'
mini_data$age[mini_data$sex == 'female']
# you can also break down the process:
age <- mini_data$age
sex <- mini_data$sex
age[sex == 'female']
# Descriptive statistics ----
# Continuous variables
# continuous variable x
# if x contains NA, pass na.rm = TRUE to the individual functions below
summary(x) # minimum, quartiles, mean and maximum in one call
min(x)
max(x)
mean(x)
median(x)
quantile(x, 0.95) # 95th percentile
IQR(x) # interquartile range
# Categorical variables: count and percentage ----
# categorical variable x
# subjects per category in x
table(x)
# proportion per category (multiply by 100 to get a percentage)
table(x) / length(x)
# Visualisation ----
# We let x, y be two continuous variables, and z be categorical.
# To create a histogram, boxplot or scatterplot, use the following commands:
hist(x) # histogram
boxplot(x) # boxplot
# boxplot for two variables, where z is categorical; x and z are looked up
# as columns of your_data (`data` itself is the name of an R function)
boxplot(x ~ z, data = your_data)
plot(x, y) # scatter plot of x, y
# Hypothesis tests ----
# t-test
# one sample (default tests against 0, conf.level 0.95)
t.test(x)
# one sample against a chosen mean
t.test(x, mu = your_value, conf.level = 0.95)
# paired samples
t.test(x1, x2, paired = TRUE, conf.level = 0.95)
t.test(x1 - x2, conf.level = 0.95) # equivalent to one sample
# two independent samples
# (Welch test by default; add var.equal = TRUE for the pooled-variance test)
t.test(x, y, conf.level = 0.95)
# check normal assumption
qqnorm(x)
qqline(x)
# z-test, chi-square tests and table analysis ----
# test proportion: whether 123 success in 1000 equals prob = 0.15
prop.test(x = 123, n = 1000, p = 0.15)
binom.test(x = 123, n = 1000, p = 0.15) # exact binomial test
# create binary variable
# compare your continuous values against threshold
# assign "yes" to those higher; otherwise, assign "no"
high_value <- ifelse(your_values > threshold, "yes", "no")
# count each category
table(x) # x is categorical!
# cross tabulation (2 variables)
table(x, y) # x, y are categorical
# chi-squared test
# tb is a 2 by 2 table (matrix) with counts
chisq.test(tb)
# non-parametric methods ----
# median ci (with DescTools package)
DescTools::MedianCI(x, conf.level = 0.95)
# one sample (paired samples) wilcoxon test (signed rank)
wilcox.test(x1, x2, paired = TRUE)
# two sample (independent) wilcoxon test (rank sum)
wilcox.test(x, y, paired = FALSE)
# Regression analysis ----
# Linear regression
# univariate: y is dependent var, x,z are independent var
linear_model <- lm(y ~ x, data = your_data)
# multivariate (this overwrites the univariate fit stored in linear_model)
linear_model <- lm(y ~ x + z, data = your_data)
# model summary
summary(linear_model)
# model diagnostic plots
plot(linear_model)
# Logistic regression ----
# univariate: y is dependent var, x,z are independent var
# y needs to be either 0/1, or factors
# family = 'binomial' makes glm() fit a logistic regression
logit_model <- glm(y ~ x, data = your_data, family = 'binomial')
# multivariate, summary and diagnostic are the same as linear model
# Survival analysis ----
# need package, survival
# install.packages('survival')
library(survival)
# fit kaplan-meier plot
# presumably lifetime is the follow-up time and death the event indicator
# (confirm against your data); `~ 1` fits one curve for the whole sample
km_fit <- survfit(Surv(lifetime, death) ~ 1)
plot(km_fit)
# survival probabilities at specific times
tme <- c(1, 2, 5) # time points
summary(km_fit, times = tme)
# log rank test (compare two genders)
# first fit and plot one Kaplan-Meier curve per level of gender
km_fit_gender <- survfit(Surv(lifetime, death) ~ gender)
plot(km_fit_gender, col = c('blue', 'red'))
# survdiff() is the call that performs the log-rank test itself
survdiff(Surv(lifetime, death) ~ gender)