An example differential gene expression results table

This post contains the analysis steps used to create a differential gene expression results table generated from RNA-seq counts summarised using nf-core/rnaseq. The comparison was done between two conditions: normal versus (lung) cancer.

We will be using {edgeR}, so install it if you haven't already.

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("edgeR")

We will also be using packages from the tidyverse, so install that too if you haven't already.

install.packages("tidyverse")

The count table is available on Zenodo, which we can directly load into R.

my_url <- 'https://zenodo.org/records/13970886/files/rsem.merged.gene_counts.tsv?download=1'
my_file <- 'rsem.merged.gene_counts.tsv'

if(file.exists(my_file) == FALSE){
  download.file(url = my_url, destfile = my_file)
}

gene_counts <- read_tsv("rsem.merged.gene_counts.tsv", show_col_types = FALSE)
head(gene_counts)
# A tibble: 6 × 10
  gene_id   `transcript_id(s)` ERR160122 ERR160123 ERR160124 ERR164473 ERR164550
  <chr>     <chr>                  <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
1 ENSG0000… ENST00000373020,E…        2         6         5      374       1637 
2 ENSG0000… ENST00000373031,E…       19        40        28        0          1 
3 ENSG0000… ENST00000371582,E…      268.      274.      429.     489        637 
4 ENSG0000… ENST00000367770,E…      360.      449.      566.     363.       606.
5 ENSG0000… ENST00000286031,E…      156.      185.      265.      85.4      312.
6 ENSG0000… ENST00000374003,E…       24        23        40     1181        423 
# ? 3 more variables: ERR164551 <dbl>, ERR164552 <dbl>, ERR164554 <dbl>

Next we'll create the metadata where we indicate the group a sample belongs to.

tibble::tribble(
  ~sample, ~run_id, ~group,
  "C2_norm", "ERR160122", "normal",
  "C3_norm", "ERR160123", "normal",
  "C5_norm", "ERR160124", "normal",
  "C1_norm", "ERR164473", "normal",
  "C1_cancer", "ERR164550", "cancer",
  "C2_cancer", "ERR164551", "cancer",
  "C3_cancer", "ERR164552", "cancer",
  "C5_cancer", "ERR164554", "cancer"
) -> my_metadata

my_metadata$group <- factor(my_metadata$group, levels = c('normal', 'cancer'))

To start using {edgeR} we need to first create a DGEList object. The required inputs for creating a DGEList object is the count table and a grouping factor.

gene_counts |>
  dplyr::select(starts_with("ERR")) |>
  mutate(across(everything(), as.integer)) |>
  as.matrix() -> gene_counts_mat

row.names(gene_counts_mat) <- gene_counts$gene_id

idx <- match(colnames(gene_counts_mat), my_metadata$run_id)
colnames(gene_counts_mat) <- my_metadata$sample[idx]

y <- DGEList(
  counts = gene_counts_mat,
  group = my_metadata$group[idx]
)

y
An object of class "DGEList"
$counts
                C2_norm C3_norm C5_norm C1_norm C1_cancer C2_cancer C3_cancer
ENSG00000000003       2       6       5     374      1637       650      1015
ENSG00000000005      19      40      28       0         1         0         0
ENSG00000000419     268     273     428     489       637       879      1157
ENSG00000000457     360     449     566     362       605       708       632
ENSG00000000460     155     184     264      85       312       239       147
                C5_cancer
ENSG00000000003       562
ENSG00000000005         0
ENSG00000000419       729
ENSG00000000457       478
ENSG00000000460       156
63135 more rows ...

$samples
           group lib.size norm.factors
C2_norm   normal  4441588            1
C3_norm   normal  5349291            1
C5_norm   normal  7613422            1
C1_norm   normal 15972904            1
C1_cancer cancer 22329493            1
C2_cancer cancer 29928921            1
C3_cancer cancer 24891206            1
C5_cancer cancer 23703606            1

Genes that are lowly expressed are removed and the library sizes are re-calculated.

keep <- rowSums(cpm(y) > 0.5) >= 2
y <- y[keep, , keep.lib.sizes=FALSE]

Next we will normalise for composition bias.

y <- normLibSizes(y)

Here's how the samples look on a Multi-Dimensional Scaling (MDS) plot.

plotMDS(y, plot = FALSE)$eigen.vectors[, 1:2] |>
  as.data.frame() |>
  cbind(my_metadata) |>
  dplyr::rename(`Eigenvector 1` = V1, `Eigenvector 2` = V2) |>
  ggplot(aes(`Eigenvector 1`, `Eigenvector 2`, colour = group, label = sample)) +
  geom_point(size = 2) +
  geom_text_repel(show.legend = FALSE) +
  theme_minimal() +
  ggtitle("MDS plot")

As recommended in the {edgeR} manual, we perform testing under the glm framework using the quasi-likelihood method.

The quasi-likelihood (QL) method is highly recommended for differential expression analyses of bulk RNA-seq data as it gives stricter error rate control by accounting for the uncertainty in dispersion estimation.

design <- model.matrix(~y$samples$group)
y <- estimateDisp(y, design, robust=TRUE)
fit <- glmQLFit(y, design, robust=TRUE)
res <- glmQLFTest(fit)
topTags(res)
Coefficient:  y$samples$groupcancer 
                    logFC    logCPM         F       PValue          FDR
ENSG00000289381 -7.411609 2.3413727 156.14405 3.485881e-09 0.0001304556
ENSG00000100985  5.744629 5.7690594 110.89086 1.418804e-06 0.0058527061
ENSG00000275332 -7.648126 2.5721238  59.27705 1.721074e-06 0.0058527061
ENSG00000220695 -9.132437 4.0374574  58.73122 1.982088e-06 0.0058527061
ENSG00000256422 -5.529837 0.5541825  54.99786 2.395813e-06 0.0058527061
ENSG00000151834 -8.035376 3.6441016  61.94292 2.733010e-06 0.0058527061
ENSG00000167910 -8.018047 3.4530411  55.86149 3.928929e-06 0.0058527061
ENSG00000226443 -5.742734 0.9948963  51.37638 4.209874e-06 0.0058527061
ENSG00000250696 -8.580216 3.8092969  52.88815 4.376155e-06 0.0058527061
ENSG00000196778 -8.809149 4.2377739  54.66541 4.439027e-06 0.0058527061

Results are stored in the res object.

class(res)
[1] "DGELRT"
attr(,"package")
[1] "edgeR"

Specifically, the results are in res$table.

head(res$table)
                     logFC   logCPM          F      PValue
ENSG00000000003  2.7275199 4.832838  4.2763383 0.068387013
ENSG00000000005 -7.0023947 0.540980 17.6315738 0.002159975
ENSG00000000419  0.1201226 5.339964  0.1140066 0.743147316
ENSG00000000457 -0.7079087 5.306090  3.3541835 0.099292833
ENSG00000000460 -0.8967285 3.947381  2.6625206 0.136187708
ENSG00000000938  1.5368400 5.597107  1.8649388 0.205160993

Finally we'll add the adjusted p-values and export as a CSV file.

res$table |>
  dplyr::mutate(adjusted_pvalue = p.adjust(PValue, method = "BH")) |>
  tibble::rownames_to_column("ensembl_gene_id") |>
  readr::write_csv(file = "data/13970886_edger_res.csv")

You can download 13970886_edger_res.csv from my GitHub repo.




Creative Commons License
This work is licensed under a Creative Commons
Attribution 4.0 International License
.

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.