This is an R Markdown
supplement for the article Tutorial on survival
modelling with omics data (Z. Zhao et al. 2023).
The Cancer Genome Atlas (TCGA) provides an enormous collection of cancer data sets, including survival, clinical and multi-omics data.
We will use TCGA data to demonstrate:
The R/Bioconductor package TCGAbiolinks (Mounir 2019) provides a few functions to download and preprocess clinical and multi-omics data from the Genomic Data Commons (GDC) Data Portal for further analysis.
First we load all necessary libraries used in this tutorial except mlr3 libraries which will be
introduced later. Then we use function GDCquery_clinic()
from TCGAbiolinks package to query and download TCGA
survival and clinical data from multiple cancer types:
# load all libraries used in this tutorial except mlr3
library("TCGAbiolinks")
library("SummarizedExperiment")
library("DESeq2")
library("dplyr")
library("ggplot2")
library("survival")
library("survminer")
library("M3C")
library("glmnet")
library("plotmo")
library("grpreg")
library("SGL")
library("psbcGroup")
library("GGally")
library("BhGLM")
library("risksetROC")
library("riskRegression")
library("peperr")
library("c060")
library("rms")
library("survAUC")
library("regplot")
# download the clinical data and extract data for multiple cancers using GDC api method
cancer_types = c("TCGA-BLCA", "TCGA-BRCA", "TCGA-COAD", "TCGA-LIHC",
"TCGA-LUAD", "TCGA-PAAD", "TCGA-PRAD", "TCGA-THCA")
clin = NULL
for (i in seq_along(cancer_types)) {
tmp = TCGAbiolinks::GDCquery_clinic(project = cancer_types[i], type = "clinical")
clin = rbind(clin, tmp[, c("project", "submitter_id", "vital_status",
"days_to_last_follow_up", "days_to_death",
"age_at_diagnosis", "gender", "race",
"ethnicity", "ajcc_pathologic_t")])
}
# extract the observed time for each patient and use years as unit
clin$time = apply(clin[, c("days_to_death", "days_to_last_follow_up")], 1, max, na.rm = TRUE) / 365.25
clin$age = clin$age_at_diagnosis / 365.25
clin$status = clin$vital_status
clin = clin[, c("project", "submitter_id", "status", "time", "gender", "age", "race", "ethnicity")]
clin = clin[(clin$time > 0) & (clin$status %in% c("Alive", "Dead")), ]
# frequency table of the patients w.r.t. status, gender and ethnicity
clin %>%
count(status, gender, ethnicity) %>%
group_by(status) %>%
mutate(prop = prop.table(n))
# A tibble: 12 × 5
# Groups: status [2]
status gender ethnicity n prop
<chr> <chr> <chr> <int> <dbl>
1 Alive female hispanic or latino 75 0.0240
2 Alive female not hispanic or latino 1367 0.438
3 Alive female not reported 328 0.105
4 Alive male hispanic or latino 34 0.0109
5 Alive male not hispanic or latino 1041 0.334
6 Alive male not reported 276 0.0884
7 Dead female hispanic or latino 7 0.00809
8 Dead female not hispanic or latino 377 0.436
9 Dead female not reported 64 0.0740
10 Dead male hispanic or latino 10 0.0116
11 Dead male not hispanic or latino 327 0.378
12 Dead male not reported 80 0.0925
# censoring plot by cancer types
clin %>%
mutate(index=1:n()) %>%
ggplot(
aes(y = index, x = time, colour = project, shape = factor(status))) +
geom_segment(aes(x = time, y = index, xend = 0, yend = index)) +
geom_point() +
ggtitle("") +
labs(x="Years", y="Patients") +
scale_shape_discrete(name = "Status", labels = c("Censored","Dead")) +
scale_color_discrete(name = "Cancer",
labels = c("Bladder","Breast","Colon","Liver", "Lung adeno",
"Pancreatic", "Prostate","Thyroid")) +
theme(legend.position="top", legend.direction="vertical") +
guides(color = guide_legend(nrow = 2, byrow = TRUE))