Using the Bioconductor annotation packages

Another post related to this course I'm going through (I can't link it enough times). I have almost finished with the first day of the course and couldn't resist writing about this lecture on using the Bioconductor annotation packages. I had not realised that the annotation packages could be queried (pardon my ignorance) in the same manner as using SQL statements. Please see the lecture for more elaboration.

#install the Bioconductor annotation package if necessary
source("http://bioconductor.org/biocLite.R")
biocLite(org.Hs.eg.db)

#load the human Entrez gene identifiers package
library(org.Hs.eg.db)

#find out the columns of this annotation database
#imagine a SQL table, these are the columns
cols(org.Hs.eg.db)
 [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"      "ACCNUM"       "ALIAS"        "CHR"         
 [8] "CHRLOC"       "CHRLOCEND"    "ENZYME"       "MAP"          "PATH"         "PMID"         "REFSEQ"      
[15] "SYMBOL"       "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"     "UNIPROT"     
[22] "GO"           "EVIDENCE"     "ONTOLOGY"     "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL"  "OMIM"        
[29] "UCSCKG"

#find out the columns we can use to search the database
keytypes(org.Hs.eg.db)
 [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"      "ACCNUM"       "ALIAS"        "CHR"         
 [8] "CHRLOC"       "CHRLOCEND"    "ENZYME"       "MAP"          "PATH"         "PMID"         "REFSEQ"      
[15] "SYMBOL"       "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"     "UNIPROT"     
[22] "GO"           "EVIDENCE"     "ONTOLOGY"     "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL"  "OMIM"        
[29] "UCSCKG"

#which chromosome does the gene TP53 reside?
select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL", "CHR"), keytype="SYMBOL")
  SYMBOL CHR
1   TP53  17

#find out the keys for the keytype CHR
keys(org.Hs.eg.db, "CHR")
 [1] "19" "12" "8"  "14" "3"  "2"  "17" "16" "9"  "X"  "6"  "1"  "7"  "10" "11" "22" "5"  "18" "15" "Y"  "20"
[22] "21" "4"  "13" "MT" "Un"

#this question was asked in the lecture
#How many genes are there on chromosome 22 are in this annotation database?
#store all symbols
symbol <- keys(org.Hs.eg.db, "SYMBOL")
#how many gene symbols
length(symbol)
[1] 43819
#distribution of gene symbols along the chromosomes
symbol_chr <- select(org.Hs.eg.db, keys=symbol, cols=c("CHR","SYMBOL"), keytype="SYMBOL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
  'select' resulted in 1:many mapping between keys and return rows
#the above warning is for duplicated rows
#double check how many gene symbols are in symbol_chr
length(symbol_chr$SYMBOL)
[1] 43868
#unique ones
length(unique(symbol_chr$SYMBOL))
[1] 43819
#distribution of symbols on chromosomes
table(symbol_chr$CHR)

   1   10   11   12   13   14   15   16   17   18   19    2   20   21   22    3    4    5    6    7    8 
4173 1604 2524 1945 1124 1679 1475 1597 2079  674 2245 2812 1017  546 1029 2306 1702 1887 2432 2307 1586 
   9   MT   Un    X    Y 
1790   37  219 2087  535

plot(table(symbol_chr$CHR), xlab="Chromosomes", ylab="Number of genes")

org_hs_eg_db_chr_geneYes I should have sorted the chromosomes.


#Find the GO IDs for the TP53 gene
tp53_go <- select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL","GO"), keytype="SYMBOL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
  'select' resulted in 1:many mapping between keys and return rows
head(tp53_go)
  SYMBOL         GO EVIDENCE ONTOLOGY
1   TP53 GO:0000060      IEA       BP
2   TP53 GO:0000075      TAS       BP
3   TP53 GO:0000122      IBA       BP
4   TP53 GO:0000122      IDA       BP
5   TP53 GO:0000122      ISS       BP
6   TP53 GO:0000733      IDA       BP
dim(tp53_go)
[1] 136   4
#Hmm that warning message is worrying
#especially since each row is unique
dim(unique(tp53_go))
[1] 136   4

#let's see what the warning message is about
#which GO terms are duplicated?
dup <- tp53_go$GO[duplicated(tp53_go$GO)]
dup
 [1] "GO:0000122" "GO:0000122" "GO:0005654" "GO:0005829" "GO:0006915" "GO:0007050" "GO:0030330" "GO:0043066"
 [9] "GO:0045893" "GO:0045944" "GO:0045944"
#make dup a dataframe
dup_df <- data.frame(GO=dup)
#merge tp53_go with dup_df
merge(x=tp53_go, y=dup_df, by="GO")
           GO SYMBOL EVIDENCE ONTOLOGY
1  GO:0000122   TP53      ISS       BP
2  GO:0000122   TP53      ISS       BP
3  GO:0000122   TP53      IBA       BP
4  GO:0000122   TP53      IBA       BP
5  GO:0000122   TP53      IDA       BP
6  GO:0000122   TP53      IDA       BP
7  GO:0005654   TP53      IDA       CC
8  GO:0005654   TP53      TAS       CC
9  GO:0005829   TP53      IDA       CC
10 GO:0005829   TP53      IBA       CC
11 GO:0006915   TP53      IDA       BP
12 GO:0006915   TP53      IMP       BP
13 GO:0007050   TP53      IMP       BP
14 GO:0007050   TP53      IDA       BP
15 GO:0030330   TP53      IDA       BP
16 GO:0030330   TP53      IMP       BP
17 GO:0043066   TP53      IDA       BP
18 GO:0043066   TP53      IEA       BP
19 GO:0045893   TP53      IDA       BP
20 GO:0045893   TP53      IMP       BP
21 GO:0045944   TP53      IDA       BP
22 GO:0045944   TP53      IDA       BP
23 GO:0045944   TP53      IGI       BP
24 GO:0045944   TP53      IGI       BP
25 GO:0045944   TP53      IMP       BP
26 GO:0045944   TP53      IMP       BP

#dup_df has duplicated GO terms hence merge returned duplicated rows
#which you can see below
dup_df
           GO
1  GO:0000122
2  GO:0000122
3  GO:0005654
4  GO:0005829
5  GO:0006915
6  GO:0007050
7  GO:0030330
8  GO:0043066
9  GO:0045893
10 GO:0045944
11 GO:0045944

#So perhaps we got a warning because of the duplicated GO terms
#which are there because those GO terms were supported by different evidence

#let's try use biomaRt to compare results
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")

#for more informatoin on biomaRt see my older post http://davetang.org/muse/2012/04/27/learning-to-use-biomart/
library("biomaRt")
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
#find out the Ensembl id of TP53
select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL","ENSEMBL"), keytype="SYMBOL")
  SYMBOL         ENSEMBL
1   TP53 ENSG00000141510

#search GO terms associated with ENSG00000141510
getBM(attributes="go_id", filters="ensembl_gene_id", values = 'ENSG00000141510', mart = ensembl)
tp53_go_biomart <- getBM(attributes="go_id", filters="ensembl_gene_id", values = 'ENSG00000141510', mart = ensembl)

length(tp53_go_biomart)
[1] 135

#order the vector
tp53_go_biomart <- tp53_go_biomart[order(tp53_go_biomart)]

#create comparison vector of GO terms collected from org.Hs.eg.db
comparison <- tp53_go$GO
comparison <- comparison[order(comparison)]

#how many are exclusive of each other
setdiff(tp53_go_biomart,comparison)
 [1] "GO:0000739" "GO:0005626" "GO:0005678" "GO:0006351" "GO:0006979" "GO:0007049" "GO:0008629" "GO:0009411"
 [9] "GO:0009792" "GO:0042127" "GO:0042493" "GO:0043523" "GO:0044419" "GO:0046872" "GO:0051726" "GO:0070215"

#how many overlap
length(intersect(tp53_go_biomart,comparison))
[1] 119

#what if I search the annotation package
#using the Ensembl id

tp53_go_2 <- select(org.Hs.eg.db, keys="ENSG00000141510", cols=c("ENSEMBL","GO"), keytype="ENSEMBL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
  'select' resulted in 1:many mapping between keys and return rows

setdiff(tp53_go_2$GO,tp53_go$GO)
character(0)

I am not sure of the differences between the GO terms fetched using the Bioconductor annotation package and fetched using biomaRt. I initially thought it was because I was searching using the gene symbols but a search using the Ensembl ID returned the same result. As this post was a demonstration of how to query the Bioconductor annotation packages, I didn't delve into this inconsistency.

In conclusion, I think the Bioconductor annotation packages provide a very valuable resource with many useful annotation packages, especially if you're working with microarrays. And I just found out that the FANTOM4 set of promoters are also there.

What about microarray data?

Here's an example of going from Illumina array address ids (Array_Address_Id) to gene symbols for the illuminaMousev1p1 chip.

From the manual:

illuminaMousev1p1SYMBOL - Map between Manufacturer Identifiers and Gene Symbols

illuminaMousev1p1ARRAYADDRESS - Array Address code used to identify the probe at the bead-level

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("illuminaMousev1p1.db")
library("illuminaMousev1p1.db")

gs <- illuminaMousev1p1SYMBOL
aa <- illuminaMousev1p1ARRAYADDRESS
gs_probe <- mappedkeys(gs)
aa_probe <- mappedkeys(aa)

?mappedkeys
#Methods for manipulating the keys of a Bimap object

head(gs_probe)
[1] "ILMN_1212602" "ILMN_1212605" "ILMN_1212607" "ILMN_1212610" "ILMN_1212612" "ILMN_1212614"
head(aa_probe)
[1] "ILMN_1243094" "ILMN_2454720" "ILMN_1238674" "ILMN_1240307" "ILMN_1251139" "ILMN_2486639"

gs_probe_lookup <- as.list(gs[gs_probe])
aa_probe_lookup <- as.list(aa[aa_probe])
head(gs_probe_lookup)
$ILMN_1212602
[1] "Best1"

$ILMN_1212605
[1] "1500011K16Rik"

$ILMN_1212607
[1] "Cradd"

$ILMN_1212610
[1] "Zfp626"

$ILMN_1212612
[1] "Rcan2"

$ILMN_1212614
[1] "Med12l"

head(aa_probe_lookup)
$ILMN_1243094
[1] "102690609"

$ILMN_2454720
[1] "104050132"

$ILMN_1238674
[1] "1240433"

$ILMN_1240307
[1] "105420348"

$ILMN_1251139
[1] "4670687"

$ILMN_2486639
[1] "104210685"

probe_symbol <- data.frame(probe=names(gs_probe_lookup),symbol=unlist(gs_probe_lookup, use.names=F))
probe_array_address <- data.frame(probe=names(aa_probe_lookup),array_address=unlist(aa_probe_lookup, use.names=F))

head(probe_symbol)
         probe        symbol
1 ILMN_1212602         Best1
2 ILMN_1212605 1500011K16Rik
3 ILMN_1212607         Cradd
4 ILMN_1212610        Zfp626
5 ILMN_1212612         Rcan2
6 ILMN_1212614        Med12l

head(probe_array_address)
         probe array_address
1 ILMN_1243094     102690609
2 ILMN_2454720     104050132
3 ILMN_1238674       1240433
4 ILMN_1240307     105420348
5 ILMN_1251139       4670687
6 ILMN_2486639     104210685

symbol_array_address <- merge(x=probe_symbol, y=probe_array_address, by="probe")
head(symbol_array_address)
         probe        symbol array_address
1 ILMN_1212602         Best1       5270632
2 ILMN_1212605 1500011K16Rik       1850025
3 ILMN_1212607         Cradd       6770520
4 ILMN_1212610        Zfp626     106550193
5 ILMN_1212612         Rcan2        940438
6 ILMN_1212614        Med12l     102510647

my_list <- c('102690609','105420348','104210685','520372','6420059')

#remove the factoring
symbol_array_address$array_address <- levels(droplevels(symbol_array_address$array_address))
subset(symbol_array_address, array_address %in% my_list)
             probe  symbol array_address
4826  ILMN_1229689 Ammecr1     104210685
22287 ILMN_2664619 Skint10        520372
26920 ILMN_2733155   Fezf1       6420059

Print Friendly, PDF & Email



Creative Commons License
This work is licensed under a Creative Commons
Attribution 4.0 International License
.
3 comments Add yours
  1. I encountered this today and I think the problem is that the “EVIDENCE” column is different when really, we don’t care about that. Look at unique(tp53_go[,-3]).

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.