goseq using pnas_expression_filtered.tsv

goseq is a Bioconductor package for performing a gene ontology analysis for RNA-seq and other length biased data. For more information on how to use the package, please refer to the vignette. For more information on the pnas_expression file please refer to my older post.

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("goseq")

library(goseq)
data <- read.table("pnas_expression_filtered.tsv",header=T,row.names=1)
nrow(data)
[1] 16494
head(data)
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000124208   478   619   628   744   483   716   240
ENSG00000182463    27    20    27    26    48    55    24
ENSG00000124201   180   218   293   275   373   301    88
ENSG00000124207    76    80    85    97    80    81    37
ENSG00000125835   132   200   200   228   280   204    52
ENSG00000125834    42    60    72    86   131    99    30

d <- data[,1:7]
rownames(d) <- row.names(data)

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("edgeR")

library(edgeR)
group <- c(rep("Control",4),rep("Test",3))
d <- DGEList(counts = d, group=group)
dim(d)
[1] 16494     7
d <- calcNormFactors(d)
d
An object of class "DGEList"
$counts
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000124208   478   619   628   744   483   716   240
ENSG00000182463    27    20    27    26    48    55    24
ENSG00000124201   180   218   293   275   373   301    88
ENSG00000124207    76    80    85    97    80    81    37
ENSG00000125835   132   200   200   228   280   204    52
16489 more rows ...

$samples
        group lib.size norm.factors
lane1 Control   976847    1.0296636
lane2 Control  1154746    1.0372521
lane3 Control  1439393    1.0362662
lane4 Control  1482652    1.0378383
lane5    Test  1820628    0.9537095
lane6    Test  1831553    0.9525624
lane8    Test   680798    0.9583181

d <- estimateCommonDisp(d)
d$common.dispersion
[1] 0.02001965
sqrt(d$common.dispersion)
[1] 0.1414908
de.com <- exactTest(d)

d <- estimateTagwiseDisp(d)
de.tgw <- exactTest(d)
summary(decideTestsDGE(de.tgw, p.value=0.01))
   [,1] 
-1  1404
0  13372
1   1718

genes=as.integer(p.adjust(de.tgw$table$PValue[de.tgw$table$logFC!=0],method="BH")<.01)
names(genes)=row.names(de.tgw$table[de.tgw$table$logFC!=0,])
table(genes)
genes
    0     1 
13372  3122
head(genes)
ENSG00000124208 ENSG00000182463 ENSG00000124201 ENSG00000124207 ENSG00000125835 
              0               0               0               0               0 
ENSG00000125834 
              0
pwf=nullp(genes,"hg19","ensGene")

The x-axis of the plot above are binned lengths of Ensembl genes and the y-axis is the ratio of differentially detected genes. We can see a clear bias in the detection of differential expression with longer genes.


GO.wall=goseq(pwf,"hg19","ensGene")
head(GO.wall)
       category over_represented_pvalue under_represented_pvalue
2269 GO:0005623            1.741568e-52                        1
9863 GO:0044464            1.741568e-52                        1
9830 GO:0044424            2.533098e-49                        1
2268 GO:0005622            2.088365e-48                        1
2238 GO:0005575            6.005558e-48                        1
2321 GO:0005737            2.845073e-47                        1

table(GO.wall$over_represented_pvalue<0.01)

FALSE  TRUE 
14092   844

GO.samp=goseq(pwf,"hg19","ensGene",method="Sampling",repcnt=1000)
head(GO.samp)
     category over_represented_pvalue under_represented_pvalue
36 GO:0000075             0.000999001                        1
41 GO:0000082             0.000999001                        1
46 GO:0000087             0.000999001                        1
67 GO:0000122             0.000999001                        1
77 GO:0000139             0.000999001                        1
92 GO:0000166             0.000999001                        1

table(GO.samp$over_represented_pvalue<0.05)

FALSE  TRUE 
13413  1523

head(GO.nobias)
       category over_represented_pvalue under_represented_pvalue
2269 GO:0005623            1.038052e-67                        1
9863 GO:0044464            1.038052e-67                        1
2238 GO:0005575            3.014341e-64                        1
9830 GO:0044424            3.255426e-60                        1
2268 GO:0005622            6.342680e-60                        1
3482 GO:0008150            1.916257e-59                        1

table(GO.nobias$over_represented_pvalue<0.01)

FALSE  TRUE 
13961   975

summary(GO.wall$category[p.adjust(GO.wall$over_represented_pvalue,method="BH")<.01])
   Length     Class      Mode 
      404 character character

library(GO.db)
for(go in wall.enriched.GO[1:5]){
   print(GOTERM[[go]])
   cat("--------------------------------------\n")
}

GOID: GO:0005623
Term: cell
Ontology: CC
Definition: The basic structural and functional unit of all organisms.
    Includes the plasma membrane and any external encapsulating
    structures such as the cell wall and cell envelope.
--------------------------------------
GOID: GO:0044464
Term: cell part
Ontology: CC
Definition: Any constituent part of a cell, the basic structural and
    functional unit of all organisms.
Synonym: protoplast
--------------------------------------
GOID: GO:0044424
Term: intracellular part
Ontology: CC
Definition: Any constituent part of the living contents of a cell; the
    matter contained within (but not including) the plasma membrane,
    usually taken to exclude large vacuoles and masses of secretory or
    ingested material. In eukaryotes it includes the nucleus and
    cytoplasm.
--------------------------------------
GOID: GO:0005622
Term: intracellular
Ontology: CC
Definition: The living contents of a cell; the matter contained within
    (but not including) the plasma membrane, usually taken to exclude
    large vacuoles and masses of secretory or ingested material. In
    eukaryotes it includes the nucleus and cytoplasm.
Synonym: internal to cell
Synonym: nucleocytoplasm
Synonym: protoplasm
Synonym: protoplast
--------------------------------------
GOID: GO:0005575
Term: cellular_component
Ontology: CC
Definition: The part of a cell or its extracellular environment in
    which a gene product is located. A gene product may be located in
    one or more parts of a cell and its location may be as specific as
    a particular macromolecular complex, that is, a stable, persistent
    association of macromolecules that function together.
Synonym: cellular component
Synonym: cellular component unknown
Synonym: GO:0008372
Secondary: GO:0008372
--------------------------------------

sessionInfo()
R version 2.15.2 (2012-10-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
[1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
[3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
[5] LC_TIME=English_Australia.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] GO.db_2.8.0             org.Hs.eg.db_2.8.0      RSQLite_0.11.2         
 [4] DBI_0.2-5               AnnotationDbi_1.20.2    Biobase_2.18.0         
 [7] BiocGenerics_0.4.0      edgeR_3.0.2             limma_3.14.1           
[10] goseq_1.10.0            geneLenDataBase_0.99.10 BiasedUrn_1.04         
[13] BiocInstaller_1.8.3    

loaded via a namespace (and not attached):
 [1] biomaRt_2.14.0         Biostrings_2.26.2      bitops_1.0-4.2        
 [4] BSgenome_1.26.1        GenomicFeatures_1.10.0 GenomicRanges_1.10.5  
 [7] grid_2.15.2            IRanges_1.16.4         lattice_0.20-10       
[10] Matrix_1.0-10          mgcv_1.7-22            nlme_3.1-105          
[13] parallel_2.15.2        RCurl_1.95-3           Rsamtools_1.10.2      
[16] rtracklayer_1.18.0     stats4_2.15.2          tools_2.15.2          
[19] XML_3.95-0.1           zlibbioc_1.4.0




Creative Commons License
This work is licensed under a Creative Commons
Attribution 4.0 International License
.
Posted in RTagged ,

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.