Another post related to this course I'm going through (I can't link it enough times). I have almost finished with the first day of the course and couldn't resist writing about this lecture on using the Bioconductor annotation packages. I had not realised that the annotation packages could be queried (pardon my ignorance) in the same manner as using SQL statements. Please see the lecture for more elaboration.
#install the Bioconductor annotation package if necessary
source("http://bioconductor.org/biocLite.R")
biocLite(org.Hs.eg.db)
#load the human Entrez gene identifiers package
library(org.Hs.eg.db)
#find out the columns of this annotation database
#imagine a SQL table, these are the columns
cols(org.Hs.eg.db)
[1] "ENTREZID" "PFAM" "IPI" "PROSITE" "ACCNUM" "ALIAS" "CHR"
[8] "CHRLOC" "CHRLOCEND" "ENZYME" "MAP" "PATH" "PMID" "REFSEQ"
[15] "SYMBOL" "UNIGENE" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "GENENAME" "UNIPROT"
[22] "GO" "EVIDENCE" "ONTOLOGY" "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "OMIM"
[29] "UCSCKG"
#find out the columns we can use to search the database
keytypes(org.Hs.eg.db)
[1] "ENTREZID" "PFAM" "IPI" "PROSITE" "ACCNUM" "ALIAS" "CHR"
[8] "CHRLOC" "CHRLOCEND" "ENZYME" "MAP" "PATH" "PMID" "REFSEQ"
[15] "SYMBOL" "UNIGENE" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "GENENAME" "UNIPROT"
[22] "GO" "EVIDENCE" "ONTOLOGY" "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "OMIM"
[29] "UCSCKG"
#which chromosome does the gene TP53 reside?
select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL", "CHR"), keytype="SYMBOL")
SYMBOL CHR
1 TP53 17
#find out the keys for the keytype CHR
keys(org.Hs.eg.db, "CHR")
[1] "19" "12" "8" "14" "3" "2" "17" "16" "9" "X" "6" "1" "7" "10" "11" "22" "5" "18" "15" "Y" "20"
[22] "21" "4" "13" "MT" "Un"
#this question was asked in the lecture
#How many genes are there on chromosome 22 are in this annotation database?
#store all symbols
symbol <- keys(org.Hs.eg.db, "SYMBOL")
#how many gene symbols
length(symbol)
[1] 43819
#distribution of gene symbols along the chromosomes
symbol_chr <- select(org.Hs.eg.db, keys=symbol, cols=c("CHR","SYMBOL"), keytype="SYMBOL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
'select' resulted in 1:many mapping between keys and return rows
#the above warning is for duplicated rows
#double check how many gene symbols are in symbol_chr
length(symbol_chr$SYMBOL)
[1] 43868
#unique ones
length(unique(symbol_chr$SYMBOL))
[1] 43819
#distribution of symbols on chromosomes
table(symbol_chr$CHR)
1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8
4173 1604 2524 1945 1124 1679 1475 1597 2079 674 2245 2812 1017 546 1029 2306 1702 1887 2432 2307 1586
9 MT Un X Y
1790 37 219 2087 535
plot(table(symbol_chr$CHR), xlab="Chromosomes", ylab="Number of genes")
Yes I should have sorted the chromosomes.
#Find the GO IDs for the TP53 gene
tp53_go <- select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL","GO"), keytype="SYMBOL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
'select' resulted in 1:many mapping between keys and return rows
head(tp53_go)
SYMBOL GO EVIDENCE ONTOLOGY
1 TP53 GO:0000060 IEA BP
2 TP53 GO:0000075 TAS BP
3 TP53 GO:0000122 IBA BP
4 TP53 GO:0000122 IDA BP
5 TP53 GO:0000122 ISS BP
6 TP53 GO:0000733 IDA BP
dim(tp53_go)
[1] 136 4
#Hmm that warning message is worrying
#especially since each row is unique
dim(unique(tp53_go))
[1] 136 4
#let's see what the warning message is about
#which GO terms are duplicated?
dup <- tp53_go$GO[duplicated(tp53_go$GO)]
dup
[1] "GO:0000122" "GO:0000122" "GO:0005654" "GO:0005829" "GO:0006915" "GO:0007050" "GO:0030330" "GO:0043066"
[9] "GO:0045893" "GO:0045944" "GO:0045944"
#make dup a dataframe
dup_df <- data.frame(GO=dup)
#merge tp53_go with dup_df
merge(x=tp53_go, y=dup_df, by="GO")
GO SYMBOL EVIDENCE ONTOLOGY
1 GO:0000122 TP53 ISS BP
2 GO:0000122 TP53 ISS BP
3 GO:0000122 TP53 IBA BP
4 GO:0000122 TP53 IBA BP
5 GO:0000122 TP53 IDA BP
6 GO:0000122 TP53 IDA BP
7 GO:0005654 TP53 IDA CC
8 GO:0005654 TP53 TAS CC
9 GO:0005829 TP53 IDA CC
10 GO:0005829 TP53 IBA CC
11 GO:0006915 TP53 IDA BP
12 GO:0006915 TP53 IMP BP
13 GO:0007050 TP53 IMP BP
14 GO:0007050 TP53 IDA BP
15 GO:0030330 TP53 IDA BP
16 GO:0030330 TP53 IMP BP
17 GO:0043066 TP53 IDA BP
18 GO:0043066 TP53 IEA BP
19 GO:0045893 TP53 IDA BP
20 GO:0045893 TP53 IMP BP
21 GO:0045944 TP53 IDA BP
22 GO:0045944 TP53 IDA BP
23 GO:0045944 TP53 IGI BP
24 GO:0045944 TP53 IGI BP
25 GO:0045944 TP53 IMP BP
26 GO:0045944 TP53 IMP BP
#dup_df has duplicated GO terms hence merge returned duplicated rows
#which you can see below
dup_df
GO
1 GO:0000122
2 GO:0000122
3 GO:0005654
4 GO:0005829
5 GO:0006915
6 GO:0007050
7 GO:0030330
8 GO:0043066
9 GO:0045893
10 GO:0045944
11 GO:0045944
#So perhaps we got a warning because of the duplicated GO terms
#which are there because those GO terms were supported by different evidence
#let's try use biomaRt to compare results
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")
#for more informatoin on biomaRt see my older post http://davetang.org/muse/2012/04/27/learning-to-use-biomart/
library("biomaRt")
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
#find out the Ensembl id of TP53
select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL","ENSEMBL"), keytype="SYMBOL")
SYMBOL ENSEMBL
1 TP53 ENSG00000141510
#search GO terms associated with ENSG00000141510
getBM(attributes="go_id", filters="ensembl_gene_id", values = 'ENSG00000141510', mart = ensembl)
tp53_go_biomart <- getBM(attributes="go_id", filters="ensembl_gene_id", values = 'ENSG00000141510', mart = ensembl)
length(tp53_go_biomart)
[1] 135
#order the vector
tp53_go_biomart <- tp53_go_biomart[order(tp53_go_biomart)]
#create comparison vector of GO terms collected from org.Hs.eg.db
comparison <- tp53_go$GO
comparison <- comparison[order(comparison)]
#how many are exclusive of each other
setdiff(tp53_go_biomart,comparison)
[1] "GO:0000739" "GO:0005626" "GO:0005678" "GO:0006351" "GO:0006979" "GO:0007049" "GO:0008629" "GO:0009411"
[9] "GO:0009792" "GO:0042127" "GO:0042493" "GO:0043523" "GO:0044419" "GO:0046872" "GO:0051726" "GO:0070215"
#how many overlap
length(intersect(tp53_go_biomart,comparison))
[1] 119
#what if I search the annotation package
#using the Ensembl id
tp53_go_2 <- select(org.Hs.eg.db, keys="ENSG00000141510", cols=c("ENSEMBL","GO"), keytype="ENSEMBL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
'select' resulted in 1:many mapping between keys and return rows
setdiff(tp53_go_2$GO,tp53_go$GO)
character(0)
I am not sure of the differences between the GO terms fetched using the Bioconductor annotation package and fetched using biomaRt. I initially thought it was because I was searching using the gene symbols but a search using the Ensembl ID returned the same result. As this post was a demonstration of how to query the Bioconductor annotation packages, I didn't delve into this inconsistency.
In conclusion, I think the Bioconductor annotation packages provide a very valuable resource with many useful annotation packages, especially if you're working with microarrays. And I just found out that the FANTOM4 set of promoters are also there.
What about microarray data?
Here's an example of going from Illumina array address ids (Array_Address_Id) to gene symbols for the illuminaMousev1p1 chip.
From the manual:
illuminaMousev1p1SYMBOL - Map between Manufacturer Identifiers and Gene Symbols
illuminaMousev1p1ARRAYADDRESS - Array Address code used to identify the probe at the bead-level
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("illuminaMousev1p1.db")
library("illuminaMousev1p1.db")
gs <- illuminaMousev1p1SYMBOL
aa <- illuminaMousev1p1ARRAYADDRESS
gs_probe <- mappedkeys(gs)
aa_probe <- mappedkeys(aa)
?mappedkeys
#Methods for manipulating the keys of a Bimap object
head(gs_probe)
[1] "ILMN_1212602" "ILMN_1212605" "ILMN_1212607" "ILMN_1212610" "ILMN_1212612" "ILMN_1212614"
head(aa_probe)
[1] "ILMN_1243094" "ILMN_2454720" "ILMN_1238674" "ILMN_1240307" "ILMN_1251139" "ILMN_2486639"
gs_probe_lookup <- as.list(gs[gs_probe])
aa_probe_lookup <- as.list(aa[aa_probe])
head(gs_probe_lookup)
$ILMN_1212602
[1] "Best1"
$ILMN_1212605
[1] "1500011K16Rik"
$ILMN_1212607
[1] "Cradd"
$ILMN_1212610
[1] "Zfp626"
$ILMN_1212612
[1] "Rcan2"
$ILMN_1212614
[1] "Med12l"
head(aa_probe_lookup)
$ILMN_1243094
[1] "102690609"
$ILMN_2454720
[1] "104050132"
$ILMN_1238674
[1] "1240433"
$ILMN_1240307
[1] "105420348"
$ILMN_1251139
[1] "4670687"
$ILMN_2486639
[1] "104210685"
probe_symbol <- data.frame(probe=names(gs_probe_lookup),symbol=unlist(gs_probe_lookup, use.names=F))
probe_array_address <- data.frame(probe=names(aa_probe_lookup),array_address=unlist(aa_probe_lookup, use.names=F))
head(probe_symbol)
probe symbol
1 ILMN_1212602 Best1
2 ILMN_1212605 1500011K16Rik
3 ILMN_1212607 Cradd
4 ILMN_1212610 Zfp626
5 ILMN_1212612 Rcan2
6 ILMN_1212614 Med12l
head(probe_array_address)
probe array_address
1 ILMN_1243094 102690609
2 ILMN_2454720 104050132
3 ILMN_1238674 1240433
4 ILMN_1240307 105420348
5 ILMN_1251139 4670687
6 ILMN_2486639 104210685
symbol_array_address <- merge(x=probe_symbol, y=probe_array_address, by="probe")
head(symbol_array_address)
probe symbol array_address
1 ILMN_1212602 Best1 5270632
2 ILMN_1212605 1500011K16Rik 1850025
3 ILMN_1212607 Cradd 6770520
4 ILMN_1212610 Zfp626 106550193
5 ILMN_1212612 Rcan2 940438
6 ILMN_1212614 Med12l 102510647
my_list <- c('102690609','105420348','104210685','520372','6420059')
#remove the factoring
symbol_array_address$array_address <- levels(droplevels(symbol_array_address$array_address))
subset(symbol_array_address, array_address %in% my_list)
probe symbol array_address
4826 ILMN_1229689 Ammecr1 104210685
22287 ILMN_2664619 Skint10 520372
26920 ILMN_2733155 Fezf1 6420059

This work is licensed under a Creative Commons
Attribution 4.0 International License.
I encountered this today and I think the problem is that the “EVIDENCE” column is different when really, we don’t care about that. Look at unique(tp53_go[,-3]).