Bioconductor annotation packages

The Bioconductor annotation packages are an extensive collection of annotations. For this post I simply illustrate the basics of probing these annotation packages. For the first example I will use the org.Hs.eg.db package, which provides genome wide annotations for the human genome.

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("org.Hs.eg.db")
#load library
library(org.Hs.eg.db)

class(org.Hs.eg.db)
[1] "OrgDb"
attr(,"package")
[1] "AnnotationDbi"

We can query the package by using the select() function; to find out what we can select and return we can use the keys(), columns() and keytypes() functions:

head(keys(org.Hs.eg.db))
#[1] "1"         "10"        "100"       "1000"      "10000"     "100008586"

#store the first six keys
my_keys <- head(keys(org.Hs.eg.db))

keytypes(org.Hs.eg.db)
# [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"      "ACCNUM"      
# [6] "ALIAS"        "CHR"          "CHRLOC"       "CHRLOCEND"    "ENZYME"      
#[11] "MAP"          "PATH"         "PMID"         "REFSEQ"       "SYMBOL"      
#[16] "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"    
#[21] "UNIPROT"      "GO"           "EVIDENCE"     "ONTOLOGY"     "GOALL"       
#[26] "EVIDENCEALL"  "ONTOLOGYALL"  "OMIM"         "UCSCKG"

#same as above
columns(org.Hs.eg.db)
# [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"      "ACCNUM"      
# [6] "ALIAS"        "CHR"          "CHRLOC"       "CHRLOCEND"    "ENZYME"      
#[11] "MAP"          "PATH"         "PMID"         "REFSEQ"       "SYMBOL"      
#[16] "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"    
#[21] "UNIPROT"      "GO"           "EVIDENCE"     "ONTOLOGY"     "GOALL"       
#[26] "EVIDENCEALL"  "ONTOLOGYALL"  "OMIM"         "UCSCKG"

#selecting
select(org.Hs.eg.db,
       keys = my_keys,
       columns=c("ENTREZID","SYMBOL","GENENAME"),
       keytype="ENTREZID")

#   ENTREZID  SYMBOL                                              GENENAME
#1         1    A1BG                                alpha-1-B glycoprotein
#2        10    NAT2 N-acetyltransferase 2 (arylamine N-acetyltransferase)
#3       100     ADA                                   adenosine deaminase
#4      1000    CDH2             cadherin 2, type 1, N-cadherin (neuronal)
#5     10000    AKT3         v-akt murine thymoma viral oncogene homolog 3
#6 100008586 GAGE12F                                         G antigen 12F

Within the annotation package are many objects, which we can also probe:

ls("package:org.Hs.eg.db")
# [1] "org.Hs.eg"                "org.Hs.eg.db"            
# [3] "org.Hs.eg_dbconn"         "org.Hs.eg_dbfile"        
# [5] "org.Hs.eg_dbInfo"         "org.Hs.eg_dbschema"      
# [7] "org.Hs.egACCNUM"          "org.Hs.egACCNUM2EG"      
# [9] "org.Hs.egALIAS2EG"        "org.Hs.egCHR"            
#[11] "org.Hs.egCHRLENGTHS"      "org.Hs.egCHRLOC"         
#[13] "org.Hs.egCHRLOCEND"       "org.Hs.egENSEMBL"        
#[15] "org.Hs.egENSEMBL2EG"      "org.Hs.egENSEMBLPROT"    
#[17] "org.Hs.egENSEMBLPROT2EG"  "org.Hs.egENSEMBLTRANS"   
#[19] "org.Hs.egENSEMBLTRANS2EG" "org.Hs.egENZYME"         
#[21] "org.Hs.egENZYME2EG"       "org.Hs.egGENENAME"       
#[23] "org.Hs.egGO"              "org.Hs.egGO2ALLEGS"      
#[25] "org.Hs.egGO2EG"           "org.Hs.egMAP"            
#[27] "org.Hs.egMAP2EG"          "org.Hs.egMAPCOUNTS"      
#[29] "org.Hs.egOMIM"            "org.Hs.egOMIM2EG"        
#[31] "org.Hs.egORGANISM"        "org.Hs.egPATH"           
#[33] "org.Hs.egPATH2EG"         "org.Hs.egPFAM"           
#[35] "org.Hs.egPMID"            "org.Hs.egPMID2EG"        
#[37] "org.Hs.egPROSITE"         "org.Hs.egREFSEQ"         
#[39] "org.Hs.egREFSEQ2EG"       "org.Hs.egSYMBOL"         
#[41] "org.Hs.egSYMBOL2EG"       "org.Hs.egUCSCKG"         
#[43] "org.Hs.egUNIGENE"         "org.Hs.egUNIGENE2EG"     
#[45] "org.Hs.egUNIPROT"

class(org.Hs.egENSEMBL2EG)
#[1] "AnnDbBimap"
#attr(,"package")
#[1] "AnnotationDbi"

head(keys(org.Hs.egENSEMBL2EG))
#[1] "ENSG00000121410" "ENSG00000175899" "ENSG00000256069" "ENSG00000171428"
#[5] "ENSG00000156006" "ENSG00000196136"

#store some Ensembl Gene ids to probe
my_ensg_keys <- head(keys(org.Hs.egENSEMBL2EG))

#lookup between Ensembl Gene ids to Entrez Gene ids
select(org.Hs.eg.db,
       keys = my_ensg_keys,
       columns=c("ENSEMBL","ENTREZID","SYMBOL","GENENAME"),
       keytype="ENSEMBL")

#          ENSEMBL ENTREZID   SYMBOL
#1 ENSG00000121410        1     A1BG
#2 ENSG00000175899        2      A2M
#3 ENSG00000256069        3    A2MP1
#4 ENSG00000171428        9     NAT1
#5 ENSG00000156006       10     NAT2
#6 ENSG00000196136       12 SERPINA3
#                                                                             GENENAME
#1                                                              alpha-1-B glycoprotein
#2                                                               alpha-2-macroglobulin
#3                                                  alpha-2-macroglobulin pseudogene 1
#4                               N-acetyltransferase 1 (arylamine N-acetyltransferase)
#5                               N-acetyltransferase 2 (arylamine N-acetyltransferase)
#6 serpin peptidase inhibitor, clade A (alpha-1 antiproteinase, antitrypsin), member 3

Using the annotation package org.Hs.eg.db we can easily convert different gene identifiers, obtain their gene symbols and descriptions (as well as all other keytypes).

There are also other annotation packages, such as GO.db, which contains a set of annotation maps describing the entire Gene Ontology that we can probe in the same manner as above:

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("GO.db")
#load library
library(GO.db)

class(GO.db)
[1] "GODb"
attr(,"package")
[1] "AnnotationDbi"

keytypes(GO.db)
#[1] "GOID"       "TERM"       "ONTOLOGY"   "DEFINITION"

my_go_keys <- head(keys(GO.db))
my_go_keys
#[1] "GO:0000001" "GO:0000002" "GO:0000003" "GO:0000006" "GO:0000007" "GO:0000009"

select(GO.db,
       keys = my_go_keys,
       columns=c("GOID", "TERM", "ONTOLOGY"),
       keytype="GOID")

#        GOID                                                         TERM
#1 GO:0000001                                    mitochondrion inheritance
#2 GO:0000002                             mitochondrial genome maintenance
#3 GO:0000003                                                 reproduction
#4 GO:0000006 high affinity zinc uptake transmembrane transporter activity
#5 GO:0000007     low-affinity zinc ion transmembrane transporter activity
#6 GO:0000009                       alpha-1,6-mannosyltransferase activity
#  ONTOLOGY
#1       BP
#2       BP
#3       BP
#4       MF
#5       MF
#6       MF

For genome annotation purposes, one may be interested in the gene models, which are provided in the TxDb.Hsapiens.UCSC.hg19.knownGene package:

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
#load library
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

class(TxDb.Hsapiens.UCSC.hg19.knownGene)
#[1] "TranscriptDb"
#attr(,"package")
#[1] "GenomicFeatures"

#to see what columns can be returned
# use the columns() function
columns(TxDb.Hsapiens.UCSC.hg19.knownGene)
# [1] "CDSID"      "CDSNAME"    "CDSCHROM"   "CDSSTRAND"  "CDSSTART"  
# [6] "CDSEND"     "EXONID"     "EXONNAME"   "EXONCHROM"  "EXONSTRAND"
#[11] "EXONSTART"  "EXONEND"    "GENEID"     "TXID"       "EXONRANK"  
#[16] "TXNAME"     "TXCHROM"    "TXSTRAND"   "TXSTART"    "TXEND"

select(TxDb.Hsapiens.UCSC.hg19.knownGene,
       keys = head(keys(TxDb.Hsapiens.UCSC.hg19.knownGene)),
       columns=c('GENEID', 'TXCHROM', 'TXSTART', 'TXEND', 'TXID'),
       keytype="GENEID")

#      GENEID  TXID TXCHROM   TXSTART     TXEND
#1          1 70455   chr19  58858172  58864865
#2          1 70456   chr19  58859832  58874214
#3         10 31944    chr8  18248755  18258723
#4        100 72132   chr20  43248163  43280376
#5       1000 65378   chr18  25530930  25616539
#6       1000 65379   chr18  25530930  25757445
#7      10000  7895    chr1 243651535 244006584
#8      10000  7896    chr1 243663021 244006584
#9      10000  7897    chr1 243663021 244006886
#10 100008586 75890    chrX  49217763  49233491

We can combine different annotation packages too; say I have two genes LRRK2 and PINK1, and I want their genomic locations as well as OMIM ids:

my_genes <- c("LRRK2","PINK1")

#symbol and OMIM
a <- select(org.Hs.eg.db,
            keys = my_genes,
            columns=c("ENTREZID", "SYMBOL","OMIM"),
            keytype="SYMBOL")

a
#  SYMBOL ENTREZID   OMIM
#1  LRRK2   120892 607060
#2  LRRK2   120892 609007
#3  PINK1    65018 605909
#4  PINK1    65018 608309

#symbol and transcript location
b <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,
            keys = a$ENTREZID,
            columns=c('GENEID', 'TXCHROM', 'TXSTART', 'TXEND', 'TXID'),
            keytype="GENEID")

b
#  GENEID  TXID TXCHROM  TXSTART    TXEND
#1 120892 46046   chr12 40618813 40763086
#2 120892 46047   chr12 40645267 40698778
#3 120892 46048   chr12 40689229 40763086
#4 120892 46049   chr12 40696591 40763086
#5  65018   551    chr1 20959948 20978004
#6  65018   552    chr1 20972001 20978004

#change GENEID column to ENTREZID for consistency
#and for merging

names(b) <- c('ENTREZID', 'TXID', 'TXCHROM', 'TXSTART', 'TXEND')
c <- merge(a, b, 'ENTREZID')
c
#   ENTREZID SYMBOL   OMIM  TXID TXCHROM  TXSTART    TXEND
#1    120892  LRRK2 607060 46046   chr12 40618813 40763086
#2    120892  LRRK2 607060 46047   chr12 40645267 40698778
#3    120892  LRRK2 607060 46048   chr12 40689229 40763086
#4    120892  LRRK2 607060 46049   chr12 40696591 40763086
#5    120892  LRRK2 609007 46046   chr12 40618813 40763086
#6    120892  LRRK2 609007 46047   chr12 40645267 40698778
#7    120892  LRRK2 609007 46048   chr12 40689229 40763086
#8    120892  LRRK2 609007 46049   chr12 40696591 40763086
#9     65018  PINK1 605909   551    chr1 20959948 20978004
#10    65018  PINK1 605909   552    chr1 20972001 20978004
#11    65018  PINK1 608309   551    chr1 20959948 20978004
#12    65018  PINK1 608309   552    chr1 20972001 20978004

Conclusions

One important point to be aware of when using the Bioconductor annotation packages is to appreciate the lag between updates to the packages. From the FAQ on the Bioconductor website:

Different sources take different approaches to managing annotations. The annotation packages in Bioconductor are based on downloads obtained shortly before each Bioconductor release, and so can lag by six months (at the end of the release cycle) compared to on-line resources. The advantage of this approach is that the annotations do not change unexpectedly during development of an analysis, while the disadvantage is that the resource is not quite up-to-date with current understanding.

See also

My first post on using the Bioconductor annotation packages (I had forgotten that I’ve written about this already. And if I blog about it twice, it must be worth writing about!).

Introduction to the Annotation Packages.

Checking whether two genes belong to the same pathway using Bioconductor annotation packages.

Using biomaRt

Print Friendly, PDF & Email



Creative Commons License
This work is licensed under a Creative Commons
Attribution 4.0 International License
.
6 comments Add yours
  1. Very helpful.
    I used org.Hs.eg.db version 3.0.0

    How know which version of human genome is used?

    In org.Hs.eg.db manual (March25,2015) is write that “All mappings were based on data provided by: Gene Ontology ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/ With a date stamp from the source of: 20140913″, that is to say ftp://ftp.geneontology.org/pub/go/godatabase/archive/lite/2014-09-13/

    But I don’t find the file which make the association between gene ID and GO ID!!

      1. Thanks for your response:
        From org.Hs.eg.db_3.0.0, I obtain 47721 genes (length(keys(org.Hs.eg.db))). Thus, 18229 genes have a GO annotation (genes which have no annotation are tag ). It corresponds to 14714 GO Ids.

        select(org.Hs.eg.db,keys = my_keys,columns=c(“ENTREZID”,”GO”,”EVIDENCE”,”ONTOLOGY”),keytype=”ENTREZID”)

        In the file ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz file, I notice that there is 18585 genes with the taxon ID : 9606. It corresponds to 15318 GO IDs.

        Why there is a difference !! gene2go is it the file used in org.Hs.eg.db ?

  2. I’m having trouble installing org.Hs.eg.db and GO.db.

    source(“http://bioconductor.org/biocLite.R”)
    biocLite(“AnnotationDbi”)

    ** testing if installed package can be loaded
    Error : .onLoad failed in loadNamespace() for ‘org.Hs.eg.db’, details:
    call: match.arg(synchronous, c(“off”, “normal”, “full”))
    error: ‘arg’ must be NULL or a character vector
    Error: loading failed
    Execution halted
    ERROR: loading failed
    * removing ‘/home/apadron/R/x86_64-pc-linux-gnu-library/3.0/org.Hs.eg.db’

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.