Variance in RNA-Seq data

Updated 2014 April 18th

For this post I will use data from this study, that has been nicely summarised already to examine the variance in RNA-Seq data. Briefly, the study used LNCaP cells, which are androgen-sensitive human prostate adenocarcinoma cells, and treated the cells with DHT and with a mock treatment as the control. The poly-A RNAs were captured and sequenced on 7 lanes. Let's get started:

#I host this file on my server for convenience
file_url <- 'http://davetang.org/eg/pnas_expression.txt'
data <- read.table(file_url, header=T, sep="\t", stringsAsFactors=F, row.names=1)

#check out the first 6 lines of data
head(data)
#                lane1 lane2 lane3 lane4 lane5 lane6 lane8  len
#ENSG00000215696     0     0     0     0     0     0     0  330
#ENSG00000215700     0     0     0     0     0     0     0 2370
#ENSG00000215699     0     0     0     0     0     0     0 1842
#ENSG00000215784     0     0     0     0     0     0     0 2393
#ENSG00000212914     0     0     0     0     0     0     0  384
#ENSG00000212042     0     0     0     0     0     0     0   92

#what are the dimensions of the dataset?
dim(data)
#[1] 37435     8

#as we can see above
#many of the genes have zero expression
#let's filter them out
data_subset <- data[rowSums(data[,-8])>0,]
dim(data_subset)
#[1] 21877     8

At this point, we see some expression from at least one lane for 21,877 genes. The first four lanes, i.e., columns 1-4, are the mock treated controls and lanes 5, 6, and 8, i.e., columns 5-7, are the DHT treated cells. The length of the genes is stored as the 8th column.

We can perform a simple counts per million normalisation:

#I'm not going to use the len column
data_subset_cpm <- data_subset[,-8]

#check out the column sums
colSums(data_subset_cpm)
#  lane1   lane2   lane3   lane4   lane5   lane6   lane8 
# 978576 1156844 1442169 1485604 1823460 1834335  681743 

#function for normalisation
#Scale a vector of raw counts to counts per million (CPM),
#i.e. each count divided by the library total, times one million.
norm_cpm <- function(x){
  library_size <- sum(x)
  x * 1000000 / library_size
}

#apply the normalisation
data_subset_cpm <- apply(data_subset_cpm, 2, norm_cpm)
head(data_subset_cpm)
#                    lane1     lane2    lane3      lane4     lane5     lane6     lane8
#ENSG00000124208 488.46487 535.07647 435.4552 500.806406 264.88105 390.33219 352.03882
#ENSG00000182463  27.59111  17.28842  18.7218  17.501299  26.32358  29.98362  35.20388
#ENSG00000124201 183.94075 188.44373 203.1662 185.109895 204.55617 164.09216 129.08090
#ENSG00000124205   0.00000   0.00000   3.4670   3.365634   0.00000   0.00000   0.00000
#ENSG00000124207  77.66387  69.15366  58.9390  65.293308  43.87264  44.15769  54.27265
#ENSG00000125835 134.88988 172.88416 138.6800 153.472931 153.55423 111.21197  76.27508

#sanity check
colSums(data_subset_cpm)
#lane1 lane2 lane3 lane4 lane5 lane6 lane8 
#1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06

After normalising simply by counts per million, we can calculate the mean and variance of the gene expression per group:

mock_var <- apply(data_subset_cpm[,1:4], 1, var)
mock_mean <- apply(data_subset_cpm[,1:4], 1, mean)
dht_var <- apply(data_subset_cpm[,5:7], 1, var)
dht_mean <- apply(data_subset_cpm[,5:7], 1, mean)

par(mfrow=c(1,2))
plot(density(mock_mean))
plot(density(dht_mean))

density_mean — Few genes are very highly expressed.

And plotting the variance:

par(mfrow=c(1,2))
plot(density(mock_var))
plot(density(dht_var))
par(mfrow=c(1,1))

density_var — The gene with the highest variance in the DHT treatment samples is ten fold higher than the gene with the highest variance in the mock treatment samples.

Let's plot mean against variance:

#correlation of mean and variance
cor(mock_mean, mock_var, method="spearman")
#[1] 0.9353861
cor(dht_mean, dht_var, method="spearman")
#[1] 0.9139193

par(mfrow=c(1,2))
plot(log2(mock_mean), log2(mock_var), pch='.')
plot(log2(dht_mean), log2(dht_var), pch='.')
par(mfrow=c(1,1))

mean_vs_var — The mean and variance of gene expression are correlated; the higher the expression, the higher the variance.

How does the mean versus variance plot look for counts that follow a Poisson distribution?

set.seed(31)
sim <- t(sapply(dht_mean, rpois, n=4))
head(sim)
#                [,1] [,2] [,3] [,4]
#ENSG00000124208  336  332  333  363
#ENSG00000182463   28   48   28   21
#ENSG00000124201  153  171  171  175
#ENSG00000124205    0    0    0    0
#ENSG00000124207   42   36   50   47
#ENSG00000125835   91  114  138  113

sim_mean <- apply(sim, 1, mean)
sim_var <- apply(sim, 1, var)
cor(sim_mean, sim_var, method="spearman")
#[1] 0.9446539

plot(log2(sim_mean), log2(sim_var), pch='.')

sim_mean_vs_var — The mean versus variance scatter plot of data simulated from a Poisson distribution is more symmetrical than that of real data.

Let's calculate the tagwise dispersion of our data using edgeR:

#install edgeR
source("http://bioconductor.org/biocLite.R")
biocLite('edgeR')
#load library
library(edgeR)

d <- data[rowSums(data[,-8])>0,-8]
d <- d[,1:4]
group <- c(rep("mock",4))
d <- DGEList(counts = d, group=group)
d <- calcNormFactors(d)
d <- estimateCommonDisp(d, verbose=T)
#Disp = 0.01932 , BCV = 0.139
d <- estimateTagwiseDisp(d)

#the range of values for the tagwise dispersion
summary(d$tagwise.dispersion)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0003019 0.0215100 0.0555400 0.1560000 0.2044000 1.2320000

cor(d$AveLogCPM, log2(mock_mean), method="spearman")
[1] 0.9996904
#plotting my log2 mean expression against edgeR's calculation
plot(d$AveLogCPM, log2(mock_mean), pch='.')

mock_mean_vs_ave_log_cpm — Sanity check between edgeR's average log2 CPM calculations and our own.

Let's plot the tagwise dispersion against the average log2 CPM:

plot(d$AveLogCPM, d$tagwise.dispersion, pch='.')

ave_log_cpm_vs_tagwise_disp — The higher the expression, the lower the tagwise dispersion.

Note that we had observed higher variances with genes that were more highly expressed. So tagwise dispersion is not simply the variance.

sessionInfo()
R version 3.1.0 (2014-04-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
[1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252    LC_MONETARY=English_Australia.1252
[4] LC_NUMERIC=C                       LC_TIME=English_Australia.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] edgeR_3.6.0          limma_3.20.1         BiocInstaller_1.14.1

loaded via a namespace (and not attached):
[1] tools_3.1.0

DESeq vs. edgeR vs. baySeq using pnas_expression.txt

Following the instructions from a previous post, I filtered the pnas_expression.txt dataset and saved the results in "pnas_expression_filtered.tsv" and then performed the differential gene expression analyses using the respective packages.

To run the Perl scripts below, just save the code into a file and name it "something.pl". Then make the file executable by running "chmod 755 something.pl". To execute the file in your current directory "./something.pl".

The input file (pnas_expression_filtered.tsv) looks like this:

ensembl_ID lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000124208 478 619 628 744 483 716 240
ENSG00000182463 27 20 27 26 48 55 24
ENSG00000124201 180 218 293 275 373 301 88

And finally please refer to the respective Bioconductor packages for more information.

sessionInfo()

sessionInfo()
R version 2.15.0 (2012-03-30)
Platform: x86_64-pc-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=C                 LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] snow_0.3-8         baySeq_1.10.0      DESeq_1.8.0        locfit_1.5-7      
[5] Biobase_2.16.0     BiocGenerics_0.2.0 edgeR_2.6.0        limma_3.12.0      

loaded via a namespace (and not attached):
 [1] annotate_1.34.0      AnnotationDbi_1.18.0 DBI_0.2-5           
 [4] genefilter_1.38.0    geneplotter_1.34.0   grid_2.15.0         
 [7] IRanges_1.14.2       lattice_0.20-6       RColorBrewer_1.0-5  
[10] RSQLite_0.11.1       splines_2.15.0       stats4_2.15.0       
[13] survival_2.36-12     xtable_1.7-0

DESeq

library("DESeq")
countsTable <- read.delim("pnas_expression_filtered.tsv", header=TRUE, row.names=1)
conds <- c(rep("C",4),rep("T",3))
cds <- newCountDataSet (countsTable, conds)
cds <- estimateSizeFactors(cds)
sizeFactors(cds)
cds <- estimateDispersions(cds)
str(fitInfo(cds))

#Scatter the per-gene dispersion estimates against the mean normalised
#counts on log-log axes, then overlay DESeq's fitted dispersion trend in red.
plotDispEsts <- function(cds){
   fit_info <- fitInfo(cds)
   mean_counts <- rowMeans(counts(cds, normalized=TRUE))
   plot(mean_counts, fit_info$perGeneDispEsts, pch = '.', log = "xy")
   grid_x <- 10^seq(-.5, 5, length.out=300)
   lines(grid_x, fit_info$dispFun(grid_x), col = "red")
}

plotDispEsts(cds)

res <- nbinomTest(cds,"C","T")

head(res)

#MA-style plot: base mean (log x-axis) versus log2 fold change,
#with genes at an adjusted p-value below 0.1 drawn in red.
plotDE <- function(res){
   point_col <- ifelse(res$padj < .1, "red", "black")
   plot(res$baseMean, res$log2FoldChange,
        log = "x", pch = 20, cex = .3, col = point_col)
}

plotDE(res)

hist(res$pval, breaks=100, col="skyblue", border="slateblue", main="")

resSig <- res[res$padj < 0.1,]

head( resSig[order(resSig$pval),])

write.table (res, file="results_deseq.txt")

For parsing the results_deseq.txt, I wrote this script, which outputs results with an adjusted p-value of 0.05 or less, sorted by the adjusted p-value:

#!/usr/bin/perl

#Parse a DESeq results table written by write.table() and print the rows
#with an adjusted p-value (padj) of 0.05 or less, sorted by ascending padj.
#
#Usage: parse_deseq.pl <deseq.results>

use strict;
use warnings;

my $usage = "Usage: $0 <deseq.results>\n";
my $infile = shift or die $usage;

#each kept row is stored as [padj, full line] for sorting later
my @result = ();

open(IN,'<',$infile) || die "Could not open $infile: $!\n";
while(<IN>){
   chomp;

   #"id" "baseMean" "baseMeanA" "baseMeanB" "foldChange" "log2FoldChange" "pval" "padj"
   #"1" "ENSG00000124208" 529.399951591274 598.268222888992 437.575589860984 0.731403696736499 -0.451260176370596 0.00157794968191143 0.0126404575296003

   #skip the header line
   next if /^\"id/;
   my @field = split(/\s+/);
   my $padj = $field[-1];
   #DESeq reports padj as "NA" for genes it filtered out; a numeric
   #comparison would treat the string "NA" as 0 and wrongly keep them
   next if $padj eq 'NA';
   next unless $padj <= 0.05;
   push(@result, [$padj, $_]);
}
close(IN);

#print the significant rows in order of increasing adjusted p-value
foreach my $row (sort { $a->[0] <=> $b->[0] } @result){
   print $row->[1], "\n";
}

exit(0);

__END__

edgeR

library(edgeR)
data <- read.table("pnas_expression_filtered.tsv", header=TRUE, row.names=1)

d <- data[,1:7]
rownames(d) <- row.names(data)

group <- c(rep("Control",4),rep("Test",3))
d <- DGEList(counts = d, group=group)
dim(d)
d <- calcNormFactors(d)
d
d <- estimateCommonDisp(d)
d$common.dispersion
sqrt(d$common.dispersion)
de.com <- exactTest(d)
summary(decideTestsDGE(de.com,p.value=0.01))
summary(decideTestsDGE(de.com,p.value=0.05))
com = summary(decideTestsDGE(de.com,p.value=0.05))
com_total = com[1] + com[3]

sink("de_common_dispersion.txt")
topTags(de.com,n=com_total)
sink()

getPriorN(d)
d <- estimateTagwiseDisp(d, prop.used=0.5, grid.length=500)
png(file="tgw_disp_vs_log_conc.png", height=1200, width=1200)
plot(log2(d$conc$conc.common),d$tagwise.dispersion,panel.first=grid())
abline(h=d$common.dispersion,col="dodgerblue",lwd=3)
dev.off()
de.tgw <- exactTest(d)
topTags(de.tgw)
summary(decideTestsDGE(de.tgw, p.value=0.01))
summary(decideTestsDGE(de.tgw, p.value=0.05))
bob = summary(decideTestsDGE(de.tgw,p.value=0.05))
total = bob[1] + bob[3]

sink("de_tagwise_dispersion.txt")
topTags(de.tgw,n=total)
sink()

baySeq

library(baySeq)
data <- read.delim("pnas_expression_filtered.tsv",header=T)
replicates <- c(1,1,1,1,2,2,2)
groups <- list(NDE = c(1,1,1,1,1,1,1),DE = c(1,1,1,1,2,2,2))
cname <- data[,1]
data <- data[,-1]
data = as.matrix(data)
CD <- new("countData", data = data, replicates = replicates, groups = groups)
CD@libsizes <- getLibsizes(CD)
plotMA.CD(CD, samplesA = "1", samplesB = "2", col = c(rep("red", 100), rep("black", 900)))
library(snow)
cl <- makeCluster(8, "SOCK")
CD@annotation <- as.data.frame(cname)
CD <- getPriors.NB(CD, samplesize = 10000, estimation = "QL", cl = cl)
CD <- getLikelihoods.NB(CD, pET = 'BIC', cl = cl)
CD@estProps
topCounts(CD, group = "DE")
blah <- topCounts(CD,group="DE",FDR=0.05)
write.csv(blah, file="bayseq_de.csv")

Results

Number of statistically significant (adjusted p-value of 0.05 or less) differentially expressed genes

DESeq: 2,796
edgeR (tagwise): 4,453
baySeq: 2,847

To compare results, I wrote this Perl script:

#!/usr/bin/perl

#Compares two lists: "in" reports entries present in both files,
#"out" reports entries unique to either file (prefixed < or >).
#
#Usage: compare_list.pl <firstList> <secondList> <in or out>

use strict;
use warnings;
my $usage = "Usage $0 <firstList> <secondList> <in or out>\n";
my $first = shift or die $usage;
my $second = shift or die $usage;
my $option = shift or die $usage;

#the original character-class checks (/[inout]+/ etc.) accepted any word
#built from those letters, e.g. "unit"; require the literal options instead
die $usage unless $option =~ /^(?:in|out)$/i;

my %firstList = ();
my %secondList = ();

open (IN, '<', $first) or die "Can't open $first\n";
while (<IN>){
   chomp;
   $firstList{$_} = $_;
}
close (IN);

open (IN, '<', $second) or die "Can't open $second\n";
while (<IN>){
   chomp;
   $secondList{$_} = $_;
}
close (IN);

if (lc($option) eq 'in'){
   #entries common to both lists
   for my $entry (keys %firstList){
      if (exists $secondList{$entry}){
         print "$entry is in both lists\n";
      }
   }
} else {
   #entries unique to either list
   foreach my $entry (keys %firstList){
      print "<$entry\n" unless exists $secondList{$entry};
   }
   foreach my $entry (keys %secondList){
      print ">$entry\n" unless exists $firstList{$entry};
   }
}

Store the top 100 DE genes from the respective packages:

parse_deseq.pl results_deseq.txt | cut -f2 -d ' ' | sed 's/"//g;' | head -100 > deseq.list
cat de_tagwise_dispersion.txt | grep -v "Comparison" | grep -v "logFC" | awk '{print $1}' | head -100 > edger.list
cat bayseq_de.csv | grep -v "cname" | cut -f2 -d',' | sed 's/"//g' | head -100 > bayseq.list

Comparing the top 100 differentially expressed genes

compare_list.pl deseq.list edger.list in | wc -l
88
compare_list.pl deseq.list bayseq.list in | wc -l
65
compare_list.pl edger.list bayseq.list in | wc -l
60

When comparing the top 100 (ranked by adjusted p-values) list from DESeq and edgeR, we see an overlap of 88 / 100 of the candidates. DESeq and baySeq had an overlap of 65 / 100 and edgeR and baySeq had an overlap of 60 / 100.

Store the top 500 DE genes from the respective packages:

parse_deseq.pl results_deseq.txt | cut -f2 -d ' ' | sed 's/"//g;' | head -500 > deseq_500.list
cat de_tagwise_dispersion.txt | grep -v "Comparison" | grep -v "logFC" | awk '{print $1}' | head -500 > edger_500.list
cat bayseq_de.csv | grep -v "cname" | cut -f2 -d',' | sed 's/"//g' | head -500 > bayseq_500.list

Comparing the top 500 differentially expressed genes

compare_list.pl deseq_500.list edger_500.list in | wc -l
450
compare_list.pl deseq_500.list bayseq_500.list in | wc -l
424
compare_list.pl edger_500.list bayseq_500.list in | wc -l
440

When comparing the top 500 (ranked by adjusted p-values) list from DESeq and edgeR, we see an overlap of 450 / 500 of the candidates. DESeq and baySeq had an overlap of 424 / 500 and edgeR and baySeq had an overlap of 440 / 500.

For comparison with a non-parametric method, see this post.

edgeR vs. SAMSeq

A while ago I received a comment on comparing parametric methods against nonparametric for calling differential expression in count data. Here I compare SAMSeq (Jun Li and Robert Tibshirani (2011) Finding consistent patterns: a nonparametric approach for identifying differential expression in RNA-Seq data. Statistical Methods in Medical Research, in press.) with edgeR. For more information on SAMSeq refer to the paper. I intend to look at cuffdiff sometime later.

library(edgeR)
raw.data <- read.delim("pnas_expression_filtered.tsv",row.names=1)
d <- raw.data[,1:7]
rownames(d) <- rownames(raw.data)
head(d)
#                lane1 lane2 lane3 lane4 lane5 lane6 lane8
#ENSG00000124208   478   619   628   744   483   716   240
#ENSG00000182463    27    20    27    26    48    55    24
#ENSG00000124201   180   218   293   275   373   301    88
#ENSG00000124207    76    80    85    97    80    81    37
#ENSG00000125835   132   200   200   228   280   204    52
#ENSG00000125834    42    60    72    86   131    99    30
group <- c(rep("Control",4),rep("DHT",3))
d <- DGEList(counts = d, group=group)
#Calculating library sizes from column totals.
d <- calcNormFactors(d)
d <- estimateCommonDisp(d)
de.com <- exactTest(d)
#Comparison of groups:  DHT - Control
summary(decideTestsDGE(de.com,p.value=0.05))
#   [,1]
#-1  2368
#0  11715
#1   2411
d <- estimateTagwiseDisp(d, prop.used=0.5, grid.length=500)
de.tgw <- exactTest(d)
summary(decideTestsDGE(de.tgw, p.value=0.05))
#   [,1]
#-1  2119
#0  12048
#1   2327
#total 4446
de <- summary(decideTestsDGE(de.tgw,p.value=0.05))
total <- de[1] + de[3]
sink("edger.output")
topTags(de.tgw,n=total)
sink()
sessionInfo()
#R version 2.14.1 (2011-12-22)
#Platform: x86_64-unknown-linux-gnu (64-bit)
#
#locale:
#[1] C
#
#attached base packages:
#[1] stats     graphics  grDevices utils     datasets  methods   base
#
#other attached packages:
#[1] edgeR_2.4.1
#
#loaded via a namespace (and not attached):
#[1] limma_3.10.0

For SAMSeq:

Data type? Two class unpaired
Array or Seq data? seq
Arrays centered? FALSE
Delta 2.21025
Minimum fold change 0
Test statistic standard
Are data are log scale? FALSE
Number of permutations 1000
Number of neighbors for KNN 10
Seed for Random number generator 1234567
Estimate of pi0 (proportion of null genes) 0.55
Exchangibility factor s0 0
s0 percentile 0
Quantiles of estimated sequencing depths (0%, 25%, 50%, 75%, 100%) 0.474, 0.822, 1.116, 1.228, 1.311
False Discovery Rate (%) 3.228881

Of the 4,408 significantly differentiated genes detected by SAMSeq using the settings above, 3,908 overlap the 4,446 detected by edgeR using a tagwise dispersion.

See also: edgeR vs. DESeq using this same dataset.

and

http://www.stanford.edu/~junli07/research.html#SAM

The DGEList object in R

I've updated this post (2013 June 29th) to use the latest version of R, Bioconductor and edgeR. I also demonstrate how results of edgeR can be saved and outputted into one useful table.

The DGEList object holds the dataset to be analysed by edgeR and the subsequent calculations performed on the dataset. Specifically it contains:

counts numeric matrix containing the read counts.
lib.size numeric vector containing the total to normalize against for each sample (optional)
norm.factors numeric vector containing normalization factors (optional, defaults to all 1)
group vector giving the experimental group/condition for each sample/library
genes data frame containing annotation information for the tags/transcripts/genes for which we have count data (optional).
remove.zeros whether to remove rows that have 0 total count; default is FALSE so as to retain all information in the dataset

After calling the function estimateCommonDisp the DGEList object contains several new elements. Straight from the manual:

The output of estimateCommonDisp is a DGEList object with several new elements. The element common.dispersion, as the name suggests, provides the estimate of the common dispersion, and pseudo.alt gives the pseudocounts calculated under the alternative hypothesis. The element genes contains the information about gene/tag identifiers. The element conc gives the estimates of the overall concentration of each tag across all of the original samples (conc$conc.common) and the estimate of the concentration of each tag within each group (conc$conc.group). The element common.lib.size gives the library size to which the original libraries have been adjusted in the pseudocounts.

Here I use edgeR for a differential gene expression analysis and show how I access the DGEList and the results of the exact test and save it all into an expanded data frame:

#install DESeq if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("DESeq")
#load DESeq
library("DESeq")
example_file <- system.file ("extra/TagSeqExample.tab", package="DESeq")
data <- read.delim(example_file, header=T, row.names="gene")
head(data)
dim(data)
[1] 18760     6
head(data)
           T1a T1b  T2  T3  N1  N2
Gene_00001   0   0   2   0   0   1
Gene_00002  20   8  12   5  19  26
Gene_00003   3   0   2   0   0   0
Gene_00004  75  84 241 149 271 257
Gene_00005  10  16   4   0   4  10
Gene_00006 129 126 451 223 243 149
summary(rowSums(data))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      31     228    1521     883  234500
#let's get rid of some lowly expressed genes
data_subset <- data[rowSums(data)>10,]
dim(data_subset)
[1] 15716     6
#classic edgeR for differential expression
library(edgeR)
group <- c(rep("Tumour",4),rep("Control",2))
#create the DGEList object
d <- DGEList(counts = data_subset, group=group)
d <- calcNormFactors(d)
d <- estimateCommonDisp(d,verbose=T)
Disp = 0.57717 , BCV = 0.7597
d <- estimateTagwiseDisp(d)
de.tgw <- exactTest(d)
summary(decideTestsDGE(de.tgw, p.value=0.01))
   [,1] 
-1    93
0  15587
1     36
topTags(de.tgw)
Comparison of groups:  Tumour-Control 
                logFC   logCPM       PValue          FDR
Gene_12457 -11.954146 5.205006 2.725899e-17 4.284023e-13
Gene_14803 -11.181823 4.410806 5.260892e-16 4.134009e-12
Gene_00665  -8.093491 4.579092 8.091297e-15 4.238761e-11
Gene_03846  -6.845510 5.869045 6.247258e-14 2.454548e-10
Gene_01354  -9.611609 6.619495 1.831463e-12 5.756654e-09
Gene_10880 -10.210789 6.010410 2.499662e-12 6.547448e-09
Gene_09044  -8.378516 4.605649 4.742492e-12 1.020125e-08
Gene_18549  -8.151739 3.708076 5.192798e-12 1.020125e-08
Gene_02555 -10.534767 3.859586 8.331151e-12 1.454804e-08
Gene_08310  -5.250630 6.059341 2.135419e-11 3.356024e-08
#now say if you wanted to know the tagwise dispersions for each gene
names(d)
[1] "counts"             "samples"            "common.dispersion"  "pseudo.counts"      "AveLogCPM"          "pseudo.lib.size"   
[7] "prior.n"            "tagwise.dispersion"
data_subset$twd <- d$tagwise.dispersion
head(data_subset)
           T1a T1b  T2  T3  N1  N2       twd
Gene_00002  20   8  12   5  19  26 0.7054258
Gene_00004  75  84 241 149 271 257 0.3119328
Gene_00005  10  16   4   0   4  10 1.3231080
Gene_00006 129 126 451 223 243 149 0.3076355
Gene_00007  13   4  21  19  31   4 0.6682897
Gene_00009 202 122 256  43 287 357 0.4313053
#plot the tagwise dispersions
#histogram at the end of the post
hist(data_subset$twd, breaks=20, xlim=c(0,3))

#now if you wanted to save the fold changes and p values per gene
names(de.tgw)
[1] "table"      "comparison" "genes"
head(de.tgw$table)
                 logFC   logCPM    PValue
Gene_00002 -0.71266757 1.950463 0.4919477
Gene_00004 -0.90990666 5.280466 0.1732668
Gene_00005  0.59732915 1.126382 0.8207064
Gene_00006  0.30343120 5.532274 0.7148083
Gene_00007 -0.03886553 1.863605 0.9267819
Gene_00009 -0.84392775 5.626952 0.2764131
data_subset <- cbind(data_subset, de.tgw$table)
head(data_subset)
           T1a T1b  T2  T3  N1  N2       twd       logFC   logCPM    PValue
Gene_00002  20   8  12   5  19  26 0.7054258 -0.71266757 1.950463 0.4919477
Gene_00004  75  84 241 149 271 257 0.3119328 -0.90990666 5.280466 0.1732668
Gene_00005  10  16   4   0   4  10 1.3231080  0.59732915 1.126382 0.8207064
Gene_00006 129 126 451 223 243 149 0.3076355  0.30343120 5.532274 0.7148083
Gene_00007  13   4  21  19  31   4 0.6682897 -0.03886553 1.863605 0.9267819
Gene_00009 202 122 256  43 287 357 0.4313053 -0.84392775 5.626952 0.2764131
#I want the fdr
data_subset$PValue_fdr <- p.adjust(method="fdr",p=data_subset$PValue)
head(data_subset)
           T1a T1b  T2  T3  N1  N2       twd       logFC   logCPM    PValue PValue_fdr
Gene_00002  20   8  12   5  19  26 0.7054258 -0.71266757 1.950463 0.4919477  1.0000000
Gene_00004  75  84 241 149 271 257 0.3119328 -0.90990666 5.280466 0.1732668  0.8566228
Gene_00005  10  16   4   0   4  10 1.3231080  0.59732915 1.126382 0.8207064  1.0000000
Gene_00006 129 126 451 223 243 149 0.3076355  0.30343120 5.532274 0.7148083  1.0000000
Gene_00007  13   4  21  19  31   4 0.6682897 -0.03886553 1.863605 0.9267819  1.0000000
Gene_00009 202 122 256  43 287 357 0.4313053 -0.84392775 5.626952 0.2764131  0.9593879
#sanity check with the decideTestsDGE() call
table(data_subset$PValue_fdr<0.01)

FALSE  TRUE 
15587   129
#to write out this useful data frame
write.table(data_subset, file="blah.tsv", quote=F)
sessionInfo()
R version 3.0.1 (2013-05-16)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
[1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252    LC_MONETARY=English_Australia.1252
[4] LC_NUMERIC=C                       LC_TIME=English_Australia.1252    

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] edgeR_3.2.3        limma_3.16.5       DESeq_1.12.0       lattice_0.20-15    locfit_1.5-9.1     Biobase_2.20.0     BiocGenerics_0.6.0

loaded via a namespace (and not attached):
 [1] annotate_1.38.0      AnnotationDbi_1.22.5 DBI_0.2-7            genefilter_1.42.0    geneplotter_1.38.0   grid_3.0.1          
 [7] IRanges_1.18.1       RColorBrewer_1.0-5   RSQLite_0.11.4       splines_3.0.1        stats4_3.0.1         survival_2.37-4     
[13] tools_3.0.1          XML_3.96-1.1         xtable_1.7-1

typical_tagwise_dispersions

edgeR vs. DESeq using pnas_expression.txt

Firstly from Davis's homepage download the file pnas_expression.txt. For more information on the dataset please refer to the edgeR manual and this paper.

The latest R version at the time of writing is R 2.13.1. You can download it from here.

Install bioconductor and the required packages:

#note: the curly quotes from the blog template break R's parser;
#plain straight quotes are required
source("http://www.bioconductor.org/biocLite.R")
biocLite()
biocLite("DESeq")
biocLite("edgeR")

A filtering criterion of keeping "only those tags that have at least one count per million in at least three samples" was specified in the edgeR manual. As I'm no good with R, I will preprocess the pnas_expression.txt file with Perl, imposing the same filter and also removing the "len" column.

#!/usr/bin/perl

#Two-pass filter for pnas_expression.txt:
#pass 1 sums each lane's counts (the library sizes);
#pass 2 keeps genes with >= 1 count per million in at least 3 of the
#7 lanes, dropping the trailing "len" column from the output.

use strict;
use warnings;

my $infile = 'pnas_expression.txt';

open(IN,'<',$infile) || die "Could not open $infile: $!\n";

#library_size[$i] accumulates the column total for lane $i (columns 1..7)
my @library_size = ();

while(<IN>){
   chomp;
   next if /^ensembl_ID/;
   #ensembl_ID      lane1   lane2   lane3   lane4   lane5   lane6   lane8   len
   #ENSG00000215696 0       0       0       0       0       0       0       330
   my @line_split = split(/\t/);
   for (my $i = 1; $i<8; ++$i){
      $library_size[$i] += $line_split[$i];
   }
}
close(IN);

#second pass: re-read the file now that the library sizes are known
open(IN,'<',$infile) || die "Could not open $infile: $!\n";
while(<IN>){
   chomp;
   my @line_split = split(/\t/);
   #drop the last column ("len"); it is not a count
   pop(@line_split);
   if (/^ensembl_ID/){
      print join("\t",@line_split),"\n";
      next;
   }
   #checkpoint counts how many lanes reach >= 1 count per million
   my $checkpoint = '0';
   for (my $i = 1; $i<8; ++$i){
      my $tpm = $line_split[$i] * 1_000_000 / $library_size[$i];
      ++$checkpoint if $tpm >= 1;
   }
   #keep the gene only if at least 3 lanes pass the CPM threshold
   print join ("\t",@line_split),"\n" if $checkpoint >= 3;
}
close(IN);

exit(0);

__END__

filter.pl > pnas_expression_filtered.tsv

library(edgeR)
raw.data <- read.delim("pnas_expression_filtered.tsv")
d <- raw.data[,2:8]
rownames(d) <- raw.data[,1]
group <- c(rep("Control",4),rep("DHT",3))
d <- DGEList(counts = d, group=group)
dim(d)
[1] 16494     7
d <- calcNormFactors(d)
d
An object of class "DGEList"
$samples
        group lib.size norm.factors
lane1 Control   976847    1.0296636
lane2 Control  1154746    1.0372521
lane3 Control  1439393    1.0362662
lane4 Control  1482652    1.0378383
lane5     DHT  1820628    0.9537095
lane6     DHT  1831553    0.9525624
lane8     DHT   680798    0.9583181

$counts
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000124208   478   619   628   744   483   716   240
ENSG00000182463    27    20    27    26    48    55    24
ENSG00000124201   180   218   293   275   373   301    88
ENSG00000124207    76    80    85    97    80    81    37
ENSG00000125835   132   200   200   228   280   204    52
16489 more rows ...

$all.zeros
ENSG00000124208 ENSG00000182463 ENSG00000124201 ENSG00000124207 ENSG00000125835 
          FALSE           FALSE           FALSE           FALSE           FALSE 
16489 more elements ...

plotMDS.dge(d, main = "MDS Plot for Li Data", xlim = c(-1, 1),labels = c("Control1","Control2","Control3","Control4","DHT1", "DHT2", "DHT3"))

On dimension 2, DHT3 is quite different from the other DHT samples. It should be noted that the library size of DHT3 is three times smaller than DHT1 and DHT2.

Meanwhile in DESeq:

library("DESeq")
countsTable <- read.delim("pnas_expression_filtered.tsv", header=TRUE, stringsAsFactors=TRUE)
rownames(countsTable) <- countsTable$ensembl_ID
countsTable <- countsTable[,-1]
conds <- c("N","N","N","N","T","T","T")
cds <- newCountDataSet (countsTable, conds)
cds <- estimateSizeFactors(cds)
sizeFactors(cds)
    lane1     lane2     lane3     lane4     lane5     lane6     lane8 
0.7911762 0.9433534 1.1885656 1.2310312 1.4100653 1.4225280 0.5140726

For more information on how DESeq calculates the size factors, see this post. I attempted to calculate these values manually by following their instructions and I got 0.726441658 0.87339311 1.103958663 1.141646857 1.276470588 1.285666056 0.449085824. Slightly off, perhaps due to Excel or more likely due to something I did wrong.

On the other hand, edgeR has these normalisation factors:

group lib.size norm.factors
lane1 Control 976847 1.0296636
lane2 Control 1154746 1.0372521
lane3 Control 1439393 1.0362662
lane4 Control 1482652 1.0378383
lane5 DHT 1820628 0.9537095
lane6 DHT 1831553 0.9525624
lane8 DHT 680798 0.9583181

See this post for more information on the norm.factors.

Before calling differential expression on any of the genes, it is important to estimate the variance. Both methods use the negative binomial distribution to model the data as the level of noise is beyond what the Poisson model can account for due to biological variance.

Using edgeR to calculate the variance

d <- estimateCommonDisp(d)
d$samples$lib.size
[1]  976847 1154746 1439393 1482652 1820628 1831553  680798
d$common.lib.size
[1] 1274589
colSums(d$pseudo.alt)
  lane1   lane2   lane3   lane4   lane5   lane6   lane8 
1237946 1228854 1230106 1228297 1336795 1338335 1331880 
d$common.dispersion
[1] 0.01999041
sqrt(d$common.dispersion)
[1] 0.1413875

The common dispersion is the "squared coefficient of variation", where the coefficient of variation gives the amount of variability in the true abundance for each gene between replicates. The common dispersion estimate is ~0.02, which (take square root) gives an estimate of the coefficient of variation of 0.14. The true abundance for each gene can vary up or down by 14% between replicates.

Using DESeq:

#estimate the raw and fitted variance functions for each condition
cds <- estimateVarianceFunctions(cds)
#plot the squared coefficient of variation against the base mean
#(fixed: the original called scvPlot on the undefined object "cs")
scvPlot(cds,ylim=c(0,2))

The x axis is the base mean, calculated using the size factors and the y axis is the squared coefficient of variation, the ratio of the variance at base level to the square of the base mean. The solid lines are the SCV for the raw variances. The dotted line is the base variance, calculated using the size factors. The solid black line is a density estimate of the base means; only where there is an appreciable number of base mean values, the variance estimates can be expected to be accurate. Observe the decrease in variance as the base mean of the genes increases.

diagForT <-  varianceFitDiagnostics(cds,"T")
head(diagForT)
                 baseMean    baseVar fittedRawVar fittedBaseVar    pchisq
ENSG00000124208 437.57559 7106.70216   2133.97917    2623.68628 0.9333747
ENSG00000182463  39.79685   40.93749     50.62900      95.16714 0.3495977
ENSG00000124201 215.76798 2191.36827    668.79704     910.27101 0.9099491
ENSG00000124207  61.88337   76.38028     96.49555     165.75153 0.3692287
ENSG00000125835 147.71069 2386.52531    366.69439     532.00292 0.9887335
ENSG00000125834  73.61848  310.50106    125.27327     207.66245 0.7758007
smoothScatter (log10(diagForT$baseMean), log10(diagForT$baseVar))
lines (log10(fittedBaseVar) ~ log10(baseMean), diagForT[order(diagForT$baseMean),],col="red")
residualsEcdfPlot(cds,"T")
residualsEcdfPlot(cds,"N")

The fitted base variance is the predicted value from the local fit through the base variance estimates from all genes.

The base mean is proportional to the base variance until there is high biological variability with abundantly expressed genes.

The pchisq column from the varianceFitDiagnostics() function is used to determine whether single-gene estimate of the base variance deviates too much from the fitted value (which is calculated by taking the variance from all genes into account). The above plots show an empirical cumulative density function (ECDF) of the pchisq values stratified by base level. Basically if the curves are above the green line, where variance is overestimated, there is nothing to worry about.

Calling DE using edgeR and using a common dispersion

de.com <- exactTest(d)
Comparison of groups:  DHT - Control 
topTags(de.com)
Comparison of groups: DHT-Control 
                  logConc    logFC       P.Value           FDR
ENSG00000151503 -11.93733 5.821695 6.831306e-193 1.126756e-188
ENSG00000096060 -11.32227 5.009483 1.311104e-162 1.081267e-158
ENSG00000127954 -15.62219 8.235552 2.461375e-153 1.353264e-149
ENSG00000166451 -12.27691 4.686616 1.108080e-134 4.569166e-131
ENSG00000131016 -14.41810 5.306943 4.156214e-110 1.371052e-106
ENSG00000113594 -12.82287 4.116740 5.371074e-102  1.476508e-98
ENSG00000116285 -13.55664 4.204769  4.673617e-93  1.101238e-89
ENSG00000123983 -12.08499 3.660756  1.263258e-92  2.604521e-89
ENSG00000166086 -15.23487 5.507542  1.925054e-90  3.527982e-87
ENSG00000162772 -10.80630 3.318369  7.379002e-86  1.217093e-82
summary(decideTestsDGE(de.com, p.value=0.01))
   [,1] 
-1  1621
0  13110
1   1763
summary(decideTestsDGE(de.com, p.value=0.05))
   [,1] 
-1  2467
0  11555
1   2472

At an adjusted p-value of 0.05, 4,939 are differentially expressed.

d <- estimateTagwiseDisp(d,prior.n=10,trend=TRUE, prop.used=0.3, grid.length = 500)
summary(d$tagwise.dispersion)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.009082 0.021450 0.040580 0.062720 0.097690 0.383100
plot(log2(d$conc$conc.common), d$tagwise.dispersion, panel.first = grid())
abline(h=d$common.dispersion, col="dodgerblue",lwd=3)

Dispersion decreases with tag concentration.

de.tgw <- exactTest (d,common.disp=FALSE)
Comparison of groups:  DHT - Control 
topTags(de.tgw)
Comparison of groups: DHT-Control 
                   logConc    logFC       P.Value           FDR
ENSG00000151503 -11.936602 5.820621 6.372940e-312 1.051153e-307
ENSG00000096060 -11.321444 5.007775 3.937429e-270 3.247198e-266
ENSG00000166451 -12.278413 4.686846 1.314101e-213 7.224927e-210
ENSG00000127954 -15.621048 8.234180 1.513030e-191 6.238980e-188
ENSG00000113594 -12.824797 4.115144 4.910925e-156 1.620016e-152
ENSG00000162772 -10.805702 3.317800 6.173781e-145 1.697172e-141
ENSG00000123983 -12.085611 3.657763 1.283904e-129 3.025245e-126
ENSG00000116133 -11.730415 3.245004 1.530499e-126 3.155506e-123
ENSG00000115648  -8.820711 2.598023 1.686581e-124 3.090942e-121
ENSG00000116285 -13.556232 4.206675 2.656142e-122 4.381040e-119
summary(decideTestsDGE(de.tgw,p.value=0.05))
   [,1] 
-1  2124
0  12054
1   2316
mv <- plotMeanVar(d, show.raw.vars = TRUE, show.tagwise.vars=TRUE,dispersion.method="qcml",NBline=TRUE)

See smooth scatter plot above.

Calling differential expression using DESeq

res <- nbinomTest( cds,"N","T")
resSig <- res[res$padj < 0.05,]
nrow(resSig)
[1] 4965

Taking the top 100 differentially expressed genes (ordered by p-value) from both lists there are only 88 overlaps. Taking the top 500, there are 450 overlaps.

edgeR's common dispersion

Updated: 2014 April 18th

A post showing edgeR's common dispersion values from a set of values generated from various distributions. For an explanation of the common dispersion value in edgeR, please refer to this thread.

The generated values represent the counts of reads, i.e., expression level, for a gene. Each row will represent one gene and I will generate counts for 20,000 genes. Each column represents a sample and I will have 4 samples. Therefore the common dispersion calculations will be based on a matrix that has 20,000 rows and 4 columns.

Firstly, let's generate the average expression of all genes using the negative binomial distribution:

#negative binomial distribution of mean expression
#n    = number of observations
#size = target for number of successful trials
#prob = probability of success in each trial
#fix the RNG seed so the quoted summary below is reproducible
set.seed(31)
#NOTE(review): `mean` shadows base::mean() for the rest of the
#session; kept because the later snippets refer to this variable
mean <- rnbinom(n=20000, size=1, prob=0.0001)

#on average it takes ~10,000 times to have one success
#this makes sense because I set the probability to 0.0001
summary(mean)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#      1    2803    6915    9898   13780  106300

plot(density(mean))

density_negative_binomial

This distribution is meant to represent the fact that few genes are very highly expressed.

What's the common dispersion when the expression of every gene is identical in 4 samples?

#install if necessary
#(biocLite was Bioconductor's installer at the time of writing)
source("http://bioconductor.org/biocLite.R")
biocLite('edgeR')

#load library
library(edgeR)

#identical expression:
#repeat each gene's mean 4 times and fill the matrix by row,
#so all 4 samples carry exactly the same count for every gene
#NOTE(review): `identical` shadows base::identical(), and T is
#used for TRUE; kept as-is to match the original transcript
identical <- matrix(rep(x=mean, each=4),
                    nrow=length(mean),
                    byrow=T)

head(identical)
#      [,1]  [,2]  [,3]  [,4]
#[1,]  5386  5386  5386  5386
#[2,]  4282  4282  4282  4282
#[3,]  2243  2243  2243  2243
#[4,] 12830 12830 12830 12830
#[5,] 11962 11962 11962 11962
#[6,] 24473 24473 24473 24473

#a single group: the dispersion is estimated across the 4 replicates
group <- c(rep("Control",4))
d <- DGEList(counts =  identical, group=group)
d <- calcNormFactors(d)
d <- estimateCommonDisp(d)
d <- estimateTagwiseDisp(d)

#with zero variability between replicates the estimate is ~0
d$common.dispersion
#[1] 0.0001005378

summary(d$tagwise.dispersion)
#     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#1.571e-06 1.571e-06 1.571e-06 1.571e-06 1.571e-06 1.571e-06

How about counts following a Poisson distribution:

#fix the seed so the quoted output is reproducible
set.seed(31)
#draw 4 Poisson counts per gene (each element of `mean` is used as
#lambda); sapply returns a 4 x 20000 matrix, so transpose it to get
#genes in rows and samples in columns
poisson <- t(sapply(mean, rpois, n=4))
head(poisson)
#      [,1]  [,2]  [,3]  [,4]
#[1,]  5390  5372  5377  5496
#[2,]  4252  4138  4198  4200
#[3,]  2227  2164  2196  2263
#[4,] 12883 12918 12741 12643
#[5,] 12014 11962 11731 11970
#[6,] 24839 24475 24443 24436

#single group again: dispersion estimated across the 4 replicates
group <- c(rep("Control",4))
d <- DGEList(counts =  poisson, group=group)
d <- calcNormFactors(d)
d <- estimateCommonDisp(d)
d <- estimateTagwiseDisp(d)
#same common dispersion as for the identical counts
d$common.dispersion
#[1] 0.0001005378
summary(d$tagwise.dispersion)
#     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#1.571e-06 1.571e-06 1.571e-06 1.672e-05 9.963e-06 1.687e-03

The common dispersion estimate was the same (0.0001005378) for the identical counts and for the counts that followed a Poisson distribution.

Lastly, let's generate some counts using the negative binomial distribution:

#from the R documentation of rnbinom
#An alternative parametrization (often used in ecology) is by the mean mu,
#and size, the dispersion parameter, where prob = size/(size+mu).
#The variance is mu + mu^2/size in this parametrization.
#fix the seed so the quoted output is reproducible
set.seed(31)
#one row per gene: 4 counts drawn around each gene's mean with size=10
nb <- t(mapply(rnbinom, n=rep(4, 20000), size=rep(10,20000), mu=mean))
head(nb)
#      [,1]  [,2]  [,3]  [,4]
#[1,]  5196  5024  4259  3149
#[2,]  3536  2888  4768  3010
#[3,]  2476   924  4058  1993
#[4,] 15688  8399  6963 12968
#[5,] 12321  8653 11610 11104
#[6,] 26923 27319 24340 22717

#calculate the mean count of each gene
#(bug fix: was apply(blah, 1, mean) with `blah` never defined;
#rowMeans(nb) is the equivalent, idiomatic form)
nb_mean <- rowMeans(nb)

#the means of the generated counts correlate
#to the original means
cor(nb_mean, mean, method="spearman")
#[1] 0.9888302

#single group: dispersion estimated across the 4 replicates
group <- c(rep("Control",4))
d <- DGEList(counts =  nb, group=group)
d <- calcNormFactors(d)
d <- estimateCommonDisp(d)
d <- estimateTagwiseDisp(d)
d$common.dispersion
#[1] 0.1001726
summary(d$tagwise.dispersion)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#0.07591 0.08641 0.09526 0.10010 0.10870 0.22200 

The common dispersion is higher when the read counts were generated from a negative binomial distribution. Lastly to understand the mu parameter of the rnbinom() function:

#demonstrate the mu/size parametrisation of rnbinom():
#prob = size/(size+mu) and variance = mu + mu^2/size
my_mu <- 500
my_size <- 1
my_n <- 1000
#fix the seed so the quoted output is reproducible
set.seed(31)
test <- rnbinom(n=my_n, size=my_size, mu=my_mu)
#the sample mean (~498) is close to mu (500)
summary(test)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    0.0   149.8   338.0   498.2   715.0  3126.0 
var(test)
#[1] 245942.2
#theoretical variance: mu + mu^2/size
#(note: `size`, not this value, is the dispersion parameter)
(my_mu + (my_mu^2/my_size))
#[1] 250500
#prob = size/(size+mu)
my_size/(my_size+my_mu)
#[1] 0.001996008
plot(density(test))

nb_mu_500

The mean of the sampled read counts (498.2) is close to what we specified as mu (500), and the variance of the sampled read counts (245942.2) is close to the theoretical variance (my_mu + (my_mu^2/my_size)), in which my_size is the dispersion parameter. The smaller the size parameter and the larger mu is, the larger the variance.

sessionInfo()
R version 3.1.0 (2014-04-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
[1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
[3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
[5] LC_TIME=English_Australia.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] MASS_7.3-31  edgeR_3.6.0  limma_3.20.1

loaded via a namespace (and not attached):
[1] tools_3.1.0

Normalisation methods implemented in edgeR

Updated 2014 December 12th

A short post on the different normalisation methods implemented within edgeR; to see the normalisation methods type:

#install if necessary
#(biocLite was Bioconductor's installer at the time of writing;
#modern Bioconductor uses BiocManager::install() instead)
source("http://bioconductor.org/biocLite.R")
biocLite("edgeR")
library(edgeR)
#the help page lists the available normalisation methods
?calcNormFactors

From the documentation:

method="TMM" is the weighted trimmed mean of M-values (to the reference) proposed by Robinson and Oshlack (2010), where the weights are from the delta method on Binomial data. If refColumn is unspecified, the library whose upper quartile is closest to the mean upper quartile is used.

method="RLE" is the scaling factor method proposed by Anders and Huber (2010). We call it "relative log expression", as the median library is calculated from the geometric mean of all columns and the median ratio of each sample to the median library is taken as the scale factor.

method="upperquartile" is the upper-quartile normalization method of Bullard et al (2010), in which the scale factors are calculated from the 75% quantile of the counts for each library, after removing transcripts which are zero in all libraries. This idea is generalized here to allow scaling by any quantile of the distributions.

If method="none", then the normalization factors are set to 1.

For symmetry, normalization factors are adjusted to multiply to 1. The effective library size is then the original library size multiplied by the scaling factor.

Note that rows that have zero counts for all columns are trimmed before normalization factors are computed. Therefore rows with all zero counts do not affect the estimated factors.

A test dataset

I created a dataset to test the different normalisation methods. There are four samples; column one and two are the controls and column three and four are the patients. 25 transcripts are in all four samples in equal amount. Another 25 transcripts are only present in the controls and their expression is the same as the first 25 transcripts in the controls. The four samples have exactly the same depth (500 counts). However, since the patient samples have half the number of transcripts than the controls (25 versus 50), they are sequenced at twice the depth. This hypothetical situation was described in Robinson and Oshlack.

#build the toy dataset: 50 transcripts, two controls and two patients
#controls express all 50 transcripts at 10 counts each; patients
#express only the first 25 transcripts, at double the depth (20 counts)
control_1 <- rep(10, 50)
control_2 <- rep(10, 50)
patient_1 <- rep(c(20, 0), each = 25)
patient_2 <- rep(c(20, 0), each = 25)

df <- data.frame(
  c1 = control_1,
  c2 = control_2,
  p1 = patient_1,
  p2 = patient_2
)

head(df)
  c1 c2 p1 p2
1 10 10 20 20
2 10 10 20 20
3 10 10 20 20
4 10 10 20 20
5 10 10 20 20
6 10 10 20 20

tail(df)
   c1 c2 p1 p2
45 10 10  0  0
46 10 10  0  0
47 10 10  0  0
48 10 10  0  0
49 10 10  0  0
50 10 10  0  0

#equal depth
colSums(df)
 c1  c2  p1  p2 
500 500 500 500

No normalisation

Let's run the differential expression analysis without any normalisation step:

#load library
library(edgeR)

#create group vector (columns 1-2 are controls, 3-4 are patients)
group <- c('control','control','patient','patient')

#create DGEList object
#lib.size defaults to the column sums (500 for every sample here)
#and norm.factors default to 1, as the printout below shows
d <- DGEList(counts=df, group=group)

#check out the DGEList object
d
An object of class "DGEList"
$counts
  c1 c2 p1 p2
1 10 10 20 20
2 10 10 20 20
3 10 10 20 20
4 10 10 20 20
5 10 10 20 20
45 more rows ...

$samples
     group lib.size norm.factors
c1 control      500            1
c2 control      500            1
p1 patient      500            1
p2 patient      500            1

#re-create the DGEList (redundant: d was already created above) and
#estimate the common dispersion; no calcNormFactors() call is made,
#so norm.factors stay at 1, i.e. no normalisation is applied
d <- DGEList(counts=df, group=group)
d <- estimateCommonDisp(d)

#perform the DE test
de <- exactTest(d)

#how many differentially expressed transcripts?
#(BH-adjusted p-values, 5% FDR cut-off)
table(p.adjust(de$table$PValue, method="BH")<0.05)

TRUE 
  50

Without normalisation, every transcript is differentially expressed.

TMM normalisation

Let's test the weighted trimmed mean of M-values method:

TMM <- calcNormFactors(d, method="TMM")
TMM
An object of class "DGEList"
$counts
  c1 c2 p1 p2
1 10 10 20 20
2 10 10 20 20
3 10 10 20 20
4 10 10 20 20
5 10 10 20 20
45 more rows ...

$samples
     group lib.size norm.factors
c1 control      500    0.7071068
c2 control      500    0.7071068
p1 patient      500    1.4142136
p2 patient      500    1.4142136

If we use the scaling, for transcript 1 we would compare 10/0.7071068 (~14.14) to 20/1.4142136 (~14.14) and correctly observe that they are not differentially expressed. Let's run the differential expression test:

TMM <- estimateCommonDisp(TMM)
TMM <- exactTest(TMM)
table(p.adjust(TMM$table$PValue, method="BH")<0.05)

FALSE  TRUE 
   25    25

Only half of the transcripts are differentially expressed (the last 25 transcripts in the control samples).

RLE normalisation

Let's test the relative log expression normalisation method:

RLE <- calcNormFactors(d, method="RLE")
RLE
An object of class "DGEList"
$counts
  c1 c2 p1 p2
1 10 10 20 20
2 10 10 20 20
3 10 10 20 20
4 10 10 20 20
5 10 10 20 20
45 more rows ...

$samples
     group lib.size norm.factors
c1 control      500    0.7071068
c2 control      500    0.7071068
p1 patient      500    1.4142136
p2 patient      500    1.4142136
RLE <- estimateCommonDisp(RLE)
RLE <- exactTest(RLE)
table(p.adjust(RLE$table$PValue, method="BH")<0.05)

FALSE  TRUE 
   25    25

Upper-quartile normalisation

Lastly let's try the upper-quartile normalisation method:

uq <- calcNormFactors(d, method="upperquartile")
uq
An object of class "DGEList"
$counts
  c1 c2 p1 p2
1 10 10 20 20
2 10 10 20 20
3 10 10 20 20
4 10 10 20 20
5 10 10 20 20
45 more rows ...

$samples
     group lib.size norm.factors
c1 control      500    0.7071068
c2 control      500    0.7071068
p1 patient      500    1.4142136
p2 patient      500    1.4142136

uq <- estimateCommonDisp(uq)
uq <- exactTest(uq)
table(p.adjust(uq$table$PValue, method="BH")<0.05)

FALSE  TRUE 
   25    25

Testing a real dataset

#use library() rather than require() for a hard dependency:
#require() only returns FALSE on failure, so a missing package
#would surface later as a confusing "could not find function" error
library(RCurl)
my_file <- getURL("https://dl.dropboxusercontent.com/u/15251811/pnas_expression.txt")
data <- read.table(textConnection(my_file), header=T, sep="\t")

#first column holds the gene identifiers; columns 2-8 are the counts
d <- data[,2:8]
rownames(d) <- data[,1]
group <- c(rep("Control",4),rep("DHT",3))
d <- DGEList(counts = d, group=group)

#no normalisation
no_norm <- estimateCommonDisp(d)
no_norm <- exactTest(no_norm)
table(p.adjust(no_norm$table$PValue, method="BH")<0.05)

FALSE  TRUE 
33404  4031

#TMM
TMM <- calcNormFactors(d, method="TMM")
TMM
An object of class "DGEList"
$counts
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000215696     0     0     0     0     0     0     0
ENSG00000215700     0     0     0     0     0     0     0
ENSG00000215699     0     0     0     0     0     0     0
ENSG00000215784     0     0     0     0     0     0     0
ENSG00000212914     0     0     0     0     0     0     0
37430 more rows ...

$samples
        group lib.size norm.factors
lane1 Control   978576    1.0350786
lane2 Control  1156844    1.0379515
lane3 Control  1442169    1.0287815
lane4 Control  1485604    1.0222095
lane5     DHT  1823460    0.9446243
lane6     DHT  1834335    0.9412769
lane8     DHT   681743    0.9954283

TMM <- estimateCommonDisp(TMM)
TMM <- exactTest(TMM)
table(p.adjust(TMM$table$PValue, method="BH")<0.05)

FALSE  TRUE 
33519  3916

#RLE
RLE <- calcNormFactors(d, method="RLE")
RLE
An object of class "DGEList"
$counts
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000215696     0     0     0     0     0     0     0
ENSG00000215700     0     0     0     0     0     0     0
ENSG00000215699     0     0     0     0     0     0     0
ENSG00000215784     0     0     0     0     0     0     0
ENSG00000212914     0     0     0     0     0     0     0
37430 more rows ...

$samples
        group lib.size norm.factors
lane1 Control   978576    1.0150010
lane2 Control  1156844    1.0236675
lane3 Control  1442169    1.0345426
lane4 Control  1485604    1.0399724
lane5     DHT  1823460    0.9706692
lane6     DHT  1834335    0.9734955
lane8     DHT   681743    0.9466713

RLE <- estimateCommonDisp(RLE)
RLE <- exactTest(RLE)
table(p.adjust(RLE$table$PValue, method="BH")<0.05)

FALSE  TRUE 
33465  3970

#upper-quartile
uq <- calcNormFactors(d, method="upperquartile")
uq
An object of class "DGEList"
$counts
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000215696     0     0     0     0     0     0     0
ENSG00000215700     0     0     0     0     0     0     0
ENSG00000215699     0     0     0     0     0     0     0
ENSG00000215784     0     0     0     0     0     0     0
ENSG00000212914     0     0     0     0     0     0     0
37430 more rows ...

$samples
        group lib.size norm.factors
lane1 Control   978576    1.0272514
lane2 Control  1156844    1.0222982
lane3 Control  1442169    1.0250528
lane4 Control  1485604    1.0348864
lane5     DHT  1823460    0.9728534
lane6     DHT  1834335    0.9670858
lane8     DHT   681743    0.9541011

uq <- estimateCommonDisp(uq)
uq <- exactTest(uq)
table(p.adjust(uq$table$PValue, method="BH")<0.05)

FALSE  TRUE 
33466  3969