Another post related to this course I'm going through (I can't link it enough times). I have almost finished with the first day of the course and couldn't resist writing about this lecture on using the Bioconductor annotation packages. I had not realised that the annotation packages could be queried (pardon my ignorance) in the same manner as using SQL statements. Please see the lecture for more elaboration.
#install the Bioconductor annotation package if necessary source("http://bioconductor.org/biocLite.R") biocLite(org.Hs.eg.db) #load the human Entrez gene identifiers package library(org.Hs.eg.db) #find out the columns of this annotation database #imagine a SQL table, these are the columns cols(org.Hs.eg.db) [1] "ENTREZID" "PFAM" "IPI" "PROSITE" "ACCNUM" "ALIAS" "CHR" [8] "CHRLOC" "CHRLOCEND" "ENZYME" "MAP" "PATH" "PMID" "REFSEQ" [15] "SYMBOL" "UNIGENE" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "GENENAME" "UNIPROT" [22] "GO" "EVIDENCE" "ONTOLOGY" "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "OMIM" [29] "UCSCKG" #find out the columns we can use to search the database keytypes(org.Hs.eg.db) [1] "ENTREZID" "PFAM" "IPI" "PROSITE" "ACCNUM" "ALIAS" "CHR" [8] "CHRLOC" "CHRLOCEND" "ENZYME" "MAP" "PATH" "PMID" "REFSEQ" [15] "SYMBOL" "UNIGENE" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "GENENAME" "UNIPROT" [22] "GO" "EVIDENCE" "ONTOLOGY" "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "OMIM" [29] "UCSCKG" #which chromosome does the gene TP53 reside? select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL", "CHR"), keytype="SYMBOL") SYMBOL CHR 1 TP53 17 #find out the keys for the keytype CHR keys(org.Hs.eg.db, "CHR") [1] "19" "12" "8" "14" "3" "2" "17" "16" "9" "X" "6" "1" "7" "10" "11" "22" "5" "18" "15" "Y" "20" [22] "21" "4" "13" "MT" "Un" #this question was asked in the lecture #How many genes are there on chromosome 22 are in this annotation database? #store all symbols symbol <- keys(org.Hs.eg.db, "SYMBOL") #how many gene symbols length(symbol) [1] 43819 #distribution of gene symbols along the chromosomes symbol_chr <- select(org.Hs.eg.db, keys=symbol, cols=c("CHR","SYMBOL"), keytype="SYMBOL") Warning message: In .generateExtraRows(tab, keys, jointype) : 'select' resulted in 1:many mapping between keys and return rows #the above warning is for duplicated rows #double check how many gene symbols are in symbol_chr length(symbol_chr$SYMBOL) [1] 43868 #unique ones length(unique(symbol_chr$SYMBOL)) [1] 43819 #distribution of symbols on chromosomes table(symbol_chr$CHR) 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8 4173 1604 2524 1945 1124 1679 1475 1597 2079 674 2245 2812 1017 546 1029 2306 1702 1887 2432 2307 1586 9 MT Un X Y 1790 37 219 2087 535 plot(table(symbol_chr$CHR), xlab="Chromosomes", ylab="Number of genes")
Yes I should have sorted the chromosomes.
#Find the GO IDs for the TP53 gene tp53_go <- select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL","GO"), keytype="SYMBOL") Warning message: In .generateExtraRows(tab, keys, jointype) : 'select' resulted in 1:many mapping between keys and return rows head(tp53_go) SYMBOL GO EVIDENCE ONTOLOGY 1 TP53 GO:0000060 IEA BP 2 TP53 GO:0000075 TAS BP 3 TP53 GO:0000122 IBA BP 4 TP53 GO:0000122 IDA BP 5 TP53 GO:0000122 ISS BP 6 TP53 GO:0000733 IDA BP dim(tp53_go) [1] 136 4 #Hmm that warning message is worrying #especially since each row is unique dim(unique(tp53_go)) [1] 136 4 #let's see what the warning message is about #which GO terms are duplicated? dup <- tp53_go$GO[duplicated(tp53_go$GO)] dup [1] "GO:0000122" "GO:0000122" "GO:0005654" "GO:0005829" "GO:0006915" "GO:0007050" "GO:0030330" "GO:0043066" [9] "GO:0045893" "GO:0045944" "GO:0045944" #make dup a dataframe dup_df <- data.frame(GO=dup) #merge tp53_go with dup_df merge(x=tp53_go, y=dup_df, by="GO") GO SYMBOL EVIDENCE ONTOLOGY 1 GO:0000122 TP53 ISS BP 2 GO:0000122 TP53 ISS BP 3 GO:0000122 TP53 IBA BP 4 GO:0000122 TP53 IBA BP 5 GO:0000122 TP53 IDA BP 6 GO:0000122 TP53 IDA BP 7 GO:0005654 TP53 IDA CC 8 GO:0005654 TP53 TAS CC 9 GO:0005829 TP53 IDA CC 10 GO:0005829 TP53 IBA CC 11 GO:0006915 TP53 IDA BP 12 GO:0006915 TP53 IMP BP 13 GO:0007050 TP53 IMP BP 14 GO:0007050 TP53 IDA BP 15 GO:0030330 TP53 IDA BP 16 GO:0030330 TP53 IMP BP 17 GO:0043066 TP53 IDA BP 18 GO:0043066 TP53 IEA BP 19 GO:0045893 TP53 IDA BP 20 GO:0045893 TP53 IMP BP 21 GO:0045944 TP53 IDA BP 22 GO:0045944 TP53 IDA BP 23 GO:0045944 TP53 IGI BP 24 GO:0045944 TP53 IGI BP 25 GO:0045944 TP53 IMP BP 26 GO:0045944 TP53 IMP BP #dup_df has duplicated GO terms hence merge returned duplicated rows #which you can see below dup_df GO 1 GO:0000122 2 GO:0000122 3 GO:0005654 4 GO:0005829 5 GO:0006915 6 GO:0007050 7 GO:0030330 8 GO:0043066 9 GO:0045893 10 GO:0045944 11 GO:0045944 #So perhaps we got a warning because of the duplicated GO terms #which are there because those GO terms were supported by different evidence #let's try use biomaRt to compare results #install if necessary source("http://bioconductor.org/biocLite.R") biocLite("biomaRt") #for more informatoin on biomaRt see my older post http://davetang.org/muse/2012/04/27/learning-to-use-biomart/ library("biomaRt") ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl") #find out the Ensembl id of TP53 select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL","ENSEMBL"), keytype="SYMBOL") SYMBOL ENSEMBL 1 TP53 ENSG00000141510 #search GO terms associated with ENSG00000141510 getBM(attributes="go_id", filters="ensembl_gene_id", values = 'ENSG00000141510', mart = ensembl) tp53_go_biomart <- getBM(attributes="go_id", filters="ensembl_gene_id", values = 'ENSG00000141510', mart = ensembl) length(tp53_go_biomart) [1] 135 #order the vector tp53_go_biomart <- tp53_go_biomart[order(tp53_go_biomart)] #create comparison vector of GO terms collected from org.Hs.eg.db comparison <- tp53_go$GO comparison <- comparison[order(comparison)] #how many are exclusive of each other setdiff(tp53_go_biomart,comparison) [1] "GO:0000739" "GO:0005626" "GO:0005678" "GO:0006351" "GO:0006979" "GO:0007049" "GO:0008629" "GO:0009411" [9] "GO:0009792" "GO:0042127" "GO:0042493" "GO:0043523" "GO:0044419" "GO:0046872" "GO:0051726" "GO:0070215" #how many overlap length(intersect(tp53_go_biomart,comparison)) [1] 119 #what if I search the annotation package #using the Ensembl id tp53_go_2 <- select(org.Hs.eg.db, keys="ENSG00000141510", cols=c("ENSEMBL","GO"), keytype="ENSEMBL") Warning message: In .generateExtraRows(tab, keys, jointype) : 'select' resulted in 1:many mapping between keys and return rows setdiff(tp53_go_2$GO,tp53_go$GO) character(0)
I am not sure of the differences between the GO terms fetched using the Bioconductor annotation package and fetched using biomaRt. I initially thought it was because I was searching using the gene symbols but a search using the Ensembl ID returned the same result. As this post was a demonstration of how to query the Bioconductor annotation packages, I didn't delve into this inconsistency.
In conclusion, I think the Bioconductor annotation packages provide a very valuable resource with many useful annotation packages, especially if you're working with microarrays. And I just found out that the FANTOM4 set of promoters are also there.
What about microarray data?
Here's an example of going from Illumina array address ids (Array_Address_Id) to gene symbols for the illuminaMousev1p1 chip.
From the manual:
illuminaMousev1p1SYMBOL - Map between Manufacturer Identifiers and Gene Symbols
illuminaMousev1p1ARRAYADDRESS - Array Address code used to identify the probe at the bead-level
#install if necessary source("http://bioconductor.org/biocLite.R") biocLite("illuminaMousev1p1.db") library("illuminaMousev1p1.db") gs <- illuminaMousev1p1SYMBOL aa <- illuminaMousev1p1ARRAYADDRESS gs_probe <- mappedkeys(gs) aa_probe <- mappedkeys(aa) ?mappedkeys #Methods for manipulating the keys of a Bimap object head(gs_probe) [1] "ILMN_1212602" "ILMN_1212605" "ILMN_1212607" "ILMN_1212610" "ILMN_1212612" "ILMN_1212614" head(aa_probe) [1] "ILMN_1243094" "ILMN_2454720" "ILMN_1238674" "ILMN_1240307" "ILMN_1251139" "ILMN_2486639" gs_probe_lookup <- as.list(gs[gs_probe]) aa_probe_lookup <- as.list(aa[aa_probe]) head(gs_probe_lookup) $ILMN_1212602 [1] "Best1" $ILMN_1212605 [1] "1500011K16Rik" $ILMN_1212607 [1] "Cradd" $ILMN_1212610 [1] "Zfp626" $ILMN_1212612 [1] "Rcan2" $ILMN_1212614 [1] "Med12l" head(aa_probe_lookup) $ILMN_1243094 [1] "102690609" $ILMN_2454720 [1] "104050132" $ILMN_1238674 [1] "1240433" $ILMN_1240307 [1] "105420348" $ILMN_1251139 [1] "4670687" $ILMN_2486639 [1] "104210685" probe_symbol <- data.frame(probe=names(gs_probe_lookup),symbol=unlist(gs_probe_lookup, use.names=F)) probe_array_address <- data.frame(probe=names(aa_probe_lookup),array_address=unlist(aa_probe_lookup, use.names=F)) head(probe_symbol) probe symbol 1 ILMN_1212602 Best1 2 ILMN_1212605 1500011K16Rik 3 ILMN_1212607 Cradd 4 ILMN_1212610 Zfp626 5 ILMN_1212612 Rcan2 6 ILMN_1212614 Med12l head(probe_array_address) probe array_address 1 ILMN_1243094 102690609 2 ILMN_2454720 104050132 3 ILMN_1238674 1240433 4 ILMN_1240307 105420348 5 ILMN_1251139 4670687 6 ILMN_2486639 104210685 symbol_array_address <- merge(x=probe_symbol, y=probe_array_address, by="probe") head(symbol_array_address) probe symbol array_address 1 ILMN_1212602 Best1 5270632 2 ILMN_1212605 1500011K16Rik 1850025 3 ILMN_1212607 Cradd 6770520 4 ILMN_1212610 Zfp626 106550193 5 ILMN_1212612 Rcan2 940438 6 ILMN_1212614 Med12l 102510647 my_list <- c('102690609','105420348','104210685','520372','6420059') #remove the factoring symbol_array_address$array_address <- levels(droplevels(symbol_array_address$array_address)) subset(symbol_array_address, array_address %in% my_list) probe symbol array_address 4826 ILMN_1229689 Ammecr1 104210685 22287 ILMN_2664619 Skint10 520372 26920 ILMN_2733155 Fezf1 6420059
This work is licensed under a Creative Commons
Attribution 4.0 International License.
I encountered this today and I think the problem is that the “EVIDENCE” column is different when really, we don’t care about that. Look at unique(tp53_go[,-3]).