# Data import: read a comma-separated file, treating the first line as column names.
read.csv(file = 'data/your_data.csv', header = TRUE, sep = ',')
List of commands
To help you navigate the course material
Data
Data import
Create variables
Create a numeric variable
# Store the number 3 in a variable named `a`.
a <- 3
a # return the value in console
# print() displays the value explicitly; return() is meant for use inside a
# function body and fails when typed at the top level of a script or console.
print(a)
Numeric, character, logical variables
# class() reports the type of the value stored in a variable.
class(a) # `a` must already exist (the numeric variable created earlier)

# character variable
b <- 'hadley'
class(b)

# logical variable (a variable may be called `c`; calls such as c(1, 2) still
# find the function)
c <- TRUE
class(c)
Data structure (vector, matrix)
Create vectors and matrices
# A vector holds several values of the same type; c() combines them.
num_vector <- c(1, 2, 3, 4, 5)
char_vector <- c('student_a', 'student_b', 'student_c')
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
logical_vector <- c(TRUE, FALSE, TRUE, FALSE)
# matrix: the four values fill a 2 x 2 matrix row by row (byrow = TRUE),
# so the first row is 1, 2 and the second row is 3, 4
matrix_1 <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2, byrow = TRUE)
# matrix by combining vectors
vec1 <- c(1, 2)
vec2 <- c(3, 4)

matrix_c <- cbind(vec1, vec2) # bind by column: each vector becomes a column
matrix_r <- rbind(vec1, vec2) # bind by row: each vector becomes a row
Data exploration of a data.frame
Create a data.frame
# Build a small data.frame: three rows (observations) and three named columns.
mini_data <- data.frame(
  age = c(20, 50, 32),
  sex = c('male', 'female', 'male'),
  has_covid = c(TRUE, TRUE, FALSE) # TRUE/FALSE spelled out instead of T/F
)
Get the column (feature) names, dimension, number of rows (observation) and columns
colnames(mini_data) # column (feature) names
dim(mini_data) # dimension: number of rows, then number of columns
nrow(mini_data) # number of rows (observations)
ncol(mini_data) # number of columns
Select a variable (age) from the data
# Three ways to select the 'age' column of mini_data.
mini_data$age # by name with $, gives the values as a vector
mini_data['age'] # by name with [ ], gives a one-column data.frame
mini_data[, 1] # first column, which is 'age'
Filter a variable based on another (for example, age for females (sex == 'female'))
# Keep the ages of the rows where sex equals 'female'.
mini_data$age[mini_data$sex == 'female']

# you can also break down the process:
age <- mini_data$age
sex <- mini_data$sex
age[sex == 'female']
Descriptive statistics
Continuous variables
# continuous variable x
# (if x contains missing values, add na.rm = TRUE to min, max, mean, median,
# quantile and IQR)
summary(x) # several summary statistics in one call
min(x) # smallest value
max(x) # largest value
mean(x) # average
median(x) # middle value
quantile(x, 0.95) # 95th percentile
IQR(x) # interquartile range
Categorical variables: count and percentage
# categorical variable x
# subjects per category in x
table(x)
# proportion per category (multiply by 100 to get a percentage)
table(x)/length(x)
Visualisation
We let x, y be two continuous variables, and z be categorical. To create a histogram, boxplot or scatterplot, you can use the following commands:
hist(x) # histogram
boxplot(x) # boxplot
# in the next line the name after `data =` stands for your data.frame,
# presumably the one that holds the columns x and z
boxplot(x ~ z, data = data) # boxplot for two variables, where z is categorical
plot(x,y) # scatter plot of x, y
Hypothesis tests
t-test
# one sample (default tests against 0, conf.level 0.95)
t.test(x)
# one sample, tested against a value of your choice
t.test(x, mu = your_value, conf.level = 0.95)
# paired samples (x1 and x2 are measured on the same subjects)
t.test(x1, x2, paired = TRUE, conf.level = 0.95)
t.test(x1 - x2, conf.level = 0.95) # equivalent to one sample
# two independent samples
# (unequal variances are assumed by default; add var.equal = TRUE to pool them)
t.test(x, y, conf.level = 0.95)
# check normal assumption: points close to the line support normality
qqnorm(x)
qqline(x)
z-test, chi-square tests and table analysis
# test proportion: whether 123 success in 1000 equals prob = 0.15
# (two alternative tests of the same hypothesis)
prop.test(x = 123, n = 1000, p = 0.15)
binom.test(x = 123, n = 1000, p = 0.15)

# create binary variable
# compare your continuous values against threshold
# assign "yes" to those higher; otherwise, assign "no"
high_value <- ifelse(your_values > threshold, "yes", "no")

# count each category
table(x) # x is categorical!
# cross tabulation (2 variables)
table(x, y) # x, y are categorical
# chi.squared test
# tb is a 2 by 2 table (matrix) with counts
chisq.test(tb)
non-parametric methods
# median ci (with DescTools package; pkg::fun() calls it without library())
DescTools::MedianCI(x, conf.level = 0.95)

# one sample (paired samples) wilcoxon test (signed rank)
wilcox.test(x1, x2, paired = TRUE)
# two sample (independent) wilcoxon test (rank sum)
wilcox.test(x, y, paired = FALSE)
Regression analysis
Linear regression
# univariate: y is dependent var, x,z are independent var
linear_model <- lm(y ~ x, data = your_data)

# multivariate (this overwrites the univariate model stored in linear_model)
linear_model <- lm(y ~ x + z, data = your_data)

# model summary
summary(linear_model)
# model diagnostic plots
plot(linear_model)
Logistic regression
# univariate: y is dependent var, x,z are independent var
# y needs to be either 0/1, or factors
logit_model <- glm(y ~ x, data = your_data, family = 'binomial')

# multivariate, summary and diagnostic are the same as linear model
Survival analysis
# need package, survival
# install.packages('survival')
library(survival)

# fit kaplan-meier plot
# (presumably lifetime is the follow-up time and death the event indicator;
# confirm against your data)
km_fit <- survfit(Surv(lifetime, death) ~ 1)
plot(km_fit)

# survival probabilities at specific times
tme <- c(1, 2, 5) # time points
summary(km_fit, times = tme)

# log rank test (compare two genders)
km_fit_gender <- survfit(Surv(lifetime, death) ~ gender)
plot(km_fit_gender, col = c('blue', 'red'))
survdiff(Surv(lifetime, death) ~ gender)