This post contains the analysis steps used to create a differential gene expression results table generated from RNA-seq counts summarised using nf-core/rnaseq. The comparison was done between two conditions: normal versus (lung) cancer.
We will be using {edgeR}, so install it if you haven't already.
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("edgeR")
We will also be using packages from the tidyverse, so install that too if you haven't already.
install.packages("tidyverse")
The count table is available on Zenodo, which we can directly load into R.
my_url <- 'https://zenodo.org/records/13970886/files/rsem.merged.gene_counts.tsv?download=1'
my_file <- 'rsem.merged.gene_counts.tsv'
if(file.exists(my_file) == FALSE){
download.file(url = my_url, destfile = my_file)
}
gene_counts <- read_tsv("rsem.merged.gene_counts.tsv", show_col_types = FALSE)
head(gene_counts)
# A tibble: 6 × 10
gene_id `transcript_id(s)` ERR160122 ERR160123 ERR160124 ERR164473 ERR164550
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ENSG0000… ENST00000373020,E… 2 6 5 374 1637
2 ENSG0000… ENST00000373031,E… 19 40 28 0 1
3 ENSG0000… ENST00000371582,E… 268. 274. 429. 489 637
4 ENSG0000… ENST00000367770,E… 360. 449. 566. 363. 606.
5 ENSG0000… ENST00000286031,E… 156. 185. 265. 85.4 312.
6 ENSG0000… ENST00000374003,E… 24 23 40 1181 423
# ? 3 more variables: ERR164551 <dbl>, ERR164552 <dbl>, ERR164554 <dbl>
Next we'll create the metadata where we indicate the group a sample belongs to.
tibble::tribble(
~sample, ~run_id, ~group,
"C2_norm", "ERR160122", "normal",
"C3_norm", "ERR160123", "normal",
"C5_norm", "ERR160124", "normal",
"C1_norm", "ERR164473", "normal",
"C1_cancer", "ERR164550", "cancer",
"C2_cancer", "ERR164551", "cancer",
"C3_cancer", "ERR164552", "cancer",
"C5_cancer", "ERR164554", "cancer"
) -> my_metadata
my_metadata$group <- factor(my_metadata$group, levels = c('normal', 'cancer'))
To start using {edgeR} we need to first create a DGEList
object. The required inputs for creating a DGEList
object is the count table and a grouping factor.
gene_counts |>
dplyr::select(starts_with("ERR")) |>
mutate(across(everything(), as.integer)) |>
as.matrix() -> gene_counts_mat
row.names(gene_counts_mat) <- gene_counts$gene_id
idx <- match(colnames(gene_counts_mat), my_metadata$run_id)
colnames(gene_counts_mat) <- my_metadata$sample[idx]
y <- DGEList(
counts = gene_counts_mat,
group = my_metadata$group[idx]
)
y
An object of class "DGEList"
$counts
C2_norm C3_norm C5_norm C1_norm C1_cancer C2_cancer C3_cancer
ENSG00000000003 2 6 5 374 1637 650 1015
ENSG00000000005 19 40 28 0 1 0 0
ENSG00000000419 268 273 428 489 637 879 1157
ENSG00000000457 360 449 566 362 605 708 632
ENSG00000000460 155 184 264 85 312 239 147
C5_cancer
ENSG00000000003 562
ENSG00000000005 0
ENSG00000000419 729
ENSG00000000457 478
ENSG00000000460 156
63135 more rows ...
$samples
group lib.size norm.factors
C2_norm normal 4441588 1
C3_norm normal 5349291 1
C5_norm normal 7613422 1
C1_norm normal 15972904 1
C1_cancer cancer 22329493 1
C2_cancer cancer 29928921 1
C3_cancer cancer 24891206 1
C5_cancer cancer 23703606 1
Genes that are lowly expressed are removed and the library sizes are re-calculated.
keep <- rowSums(cpm(y) > 0.5) >= 2
y <- y[keep, , keep.lib.sizes=FALSE]
Next we will normalise for composition bias.
y <- normLibSizes(y)
Here's how the samples look on a Multi-Dimensional Scaling (MDS) plot.
plotMDS(y, plot = FALSE)$eigen.vectors[, 1:2] |>
as.data.frame() |>
cbind(my_metadata) |>
dplyr::rename(`Eigenvector 1` = V1, `Eigenvector 2` = V2) |>
ggplot(aes(`Eigenvector 1`, `Eigenvector 2`, colour = group, label = sample)) +
geom_point(size = 2) +
geom_text_repel(show.legend = FALSE) +
theme_minimal() +
ggtitle("MDS plot")
As recommended in the {edgeR} manual, we perform testing under the glm framework using the quasi-likelihood method.
The quasi-likelihood (QL) method is highly recommended for differential expression analyses of bulk RNA-seq data as it gives stricter error rate control by accounting for the uncertainty in dispersion estimation.
design <- model.matrix(~y$samples$group)
y <- estimateDisp(y, design, robust=TRUE)
fit <- glmQLFit(y, design, robust=TRUE)
res <- glmQLFTest(fit)
topTags(res)
Coefficient: y$samples$groupcancer
logFC logCPM F PValue FDR
ENSG00000289381 -7.411609 2.3413727 156.14405 3.485881e-09 0.0001304556
ENSG00000100985 5.744629 5.7690594 110.89086 1.418804e-06 0.0058527061
ENSG00000275332 -7.648126 2.5721238 59.27705 1.721074e-06 0.0058527061
ENSG00000220695 -9.132437 4.0374574 58.73122 1.982088e-06 0.0058527061
ENSG00000256422 -5.529837 0.5541825 54.99786 2.395813e-06 0.0058527061
ENSG00000151834 -8.035376 3.6441016 61.94292 2.733010e-06 0.0058527061
ENSG00000167910 -8.018047 3.4530411 55.86149 3.928929e-06 0.0058527061
ENSG00000226443 -5.742734 0.9948963 51.37638 4.209874e-06 0.0058527061
ENSG00000250696 -8.580216 3.8092969 52.88815 4.376155e-06 0.0058527061
ENSG00000196778 -8.809149 4.2377739 54.66541 4.439027e-06 0.0058527061
Results are stored in the res
object.
class(res)
[1] "DGELRT"
attr(,"package")
[1] "edgeR"
Specifically, the results are in res$table
.
head(res$table)
logFC logCPM F PValue
ENSG00000000003 2.7275199 4.832838 4.2763383 0.068387013
ENSG00000000005 -7.0023947 0.540980 17.6315738 0.002159975
ENSG00000000419 0.1201226 5.339964 0.1140066 0.743147316
ENSG00000000457 -0.7079087 5.306090 3.3541835 0.099292833
ENSG00000000460 -0.8967285 3.947381 2.6625206 0.136187708
ENSG00000000938 1.5368400 5.597107 1.8649388 0.205160993
Finally we'll add the adjusted p-values and export as a CSV file.
res$table |>
dplyr::mutate(adjusted_pvalue = p.adjust(PValue, method = "BH")) |>
tibble::rownames_to_column("ensembl_gene_id") |>
readr::write_csv(file = "data/13970886_edger_res.csv")
You can download 13970886_edger_res.csv
from my GitHub repo.

This work is licensed under a Creative Commons
Attribution 4.0 International License.