A post on linking OMIM IDs to gene coordinates using biomaRt; this provides a way of representing OMIM IDs on the genome. For those unfamiliar with OMIM, here's the description from the OMIM FAQ:
Online Mendelian Inheritance in Man (OMIM) is a continuously updated catalog of human genes and genetic disorders and traits, with particular focus on the molecular relationship between genetic variation and phenotypic expression.
Firstly, register yourself for the downloads; you will get an email almost instantaneously, which provides a link where you can download these files:
ls -1 genemap genemap.key genemap2.txt mim2gene.txt morbidmap omim.txt.Z
The mim2gene.txt file associates OMIM IDs to HUGO gene symbols, Entrez and Ensembl gene IDs. Let's take a look:
#these are the column headers
cat mim2gene.txt | head -1
# MIM Number Type Gene ID Approved Gene Symbol Ensembl Gene ID
#how many OMIM IDs are in this file
cat mim2gene.txt | grep -v "^#" | cut -f1 | wc -l
23999
#how many unique OMIM IDs
cat mim2gene.txt | grep -v "^#" | cut -f1 | sort -u | wc -l
23999
#output lines where there is an Ensembl ID
cat mim2gene.txt | grep -v "^#" | awk '$5 !~ "-" {print}' | head -5
100640 gene 216 ALDH1A1 ENSG00000165092
100650 gene/phenotype 217 ALDH2 ENSG00000111275
100660 gene 218 ALDH3A1 ENSG00000108602
100670 gene 219 ALDH1B1 ENSG00000137124
100678 gene 39 ACAT2 ENSG00000120437
#how many lines have Ensembl IDs
cat mim2gene.txt | grep -v "^#" | awk '$5 !~ "-" {FS="\t"; print}' | wc -l
14552
#save
cat mim2gene.txt | grep -v "^#" | awk '$5 !~ "-" {FS="\t"; print}' | gzip > omim_to_ensembl.tsv.gz
To obtain the gene coordinates of the Ensembl gene IDs, we will use biomaRt. Firstly we need to load the data into R:
#load data into R
data <- read.table("omim_to_ensembl.tsv.gz",
header=F,
stringsAsFactors=F,
sep="\t")
#check the dimensions of data
dim(data)
[1] 14552 5
#name the columns
# MIM Number Type Gene ID Approved Gene Symbol Ensembl Gene ID
colnames(data) <- c('omim',
'type',
'entrez_gene_id',
'hgnc_symbol',
'ensembl_gene_id')
#check out data
head(data)
omim type entrez_gene_id hgnc_symbol ensembl_gene_id
1 100640 gene 216 ALDH1A1 ENSG00000165092
2 100650 gene/phenotype 217 ALDH2 ENSG00000111275
3 100660 gene 218 ALDH3A1 ENSG00000108602
4 100670 gene 219 ALDH1B1 ENSG00000137124
5 100678 gene 39 ACAT2 ENSG00000120437
6 100690 gene 1134 CHRNA1 ENSG00000138435
Now that the data is loaded into R, we can use biomaRt.
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")
#load the package
library("biomaRt")
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
filters <- listFilters(ensembl)
#these are the filters we can use
#to limit our biomaRt query
head(filters)
name description
1 chromosome_name Chromosome name
2 start Gene Start (bp)
3 end Gene End (bp)
4 band_start Band Start
5 band_end Band End
6 marker_start Marker Start
#search for the filters that contain
#the word "ensembl"
grep(pattern = 'ensembl',
x = filters$name,
ignore.case = T,
value = T)
[1] "with_ox_clone_based_ensembl_gene" "with_ox_clone_based_ensembl_transcript"
[3] "ensembl_gene_id" "ensembl_transcript_id"
[5] "ensembl_peptide_id" "ensembl_exon_id"
[7] "clone_based_ensembl_gene_name" "clone_based_ensembl_transcript_name"
#the attributes are the attributes
#that we want retured
attributes <- listAttributes(ensembl)
head(attributes, 10)
name description
1 ensembl_gene_id Ensembl Gene ID
2 ensembl_transcript_id Ensembl Transcript ID
3 ensembl_peptide_id Ensembl Protein ID
4 ensembl_exon_id Ensembl Exon ID
5 description Description
6 chromosome_name Chromosome Name
7 start_position Gene Start (bp)
8 end_position Gene End (bp)
9 strand Strand
10 band Band
#create a vector that contains the Ensembl IDs
my_value <- data$ensembl_gene_id
#how many entries?
length(my_value)
[1] 14552
#how many unique entries?
length(unique(my_value))
[1] 14419
#store only the unique entries
my_value <- unique(my_value)
#now we can perform the biomaRt query
gene_info <- getBM(attributes= c('chromosome_name',
'start_position',
'end_position',
'ensembl_gene_id',
'strand'),
filters = "ensembl_gene_id",
value = my_value,
mart = ensembl)
#here's the results from our query
head(gene_info)
chromosome_name start_position end_position ensembl_gene_id strand
1 X 100627109 100639991 ENSG00000000003 -1
2 X 100584802 100599885 ENSG00000000005 1
3 20 50934867 50958555 ENSG00000000419 -1
4 1 169849631 169894267 ENSG00000000457 -1
5 1 27612064 27635277 ENSG00000000938 -1
6 1 196651878 196747504 ENSG00000000971 1
#check the dimension of our results
#it matched the length of the my_value vector
#meaning that it could find coordinates for
#all the Ensembl gene IDs
dim(gene_info)
[1] 14419 5
Lastly, we need to connect the gene coordinates to our the OMIM to gene table.
#use the merge function to connect the data frames
merged_data <- merge(x = data,
y = gene_info,
by = 'ensembl_gene_id')
#check it out
head(merged_data)
ensembl_gene_id omim type entrez_gene_id hgnc_symbol chromosome_name start_position end_position strand
1 ENSG00000000003 300191 gene 7105 TSPAN6 X 100627109 100639991 -1
2 ENSG00000000005 300459 gene 64102 TNMD X 100584802 100599885 1
3 ENSG00000000419 603503 gene 8813 DPM1 20 50934867 50958555 -1
4 ENSG00000000457 608192 gene 57147 SCYL3 1 169849631 169894267 -1
5 ENSG00000000938 164940 gene 2268 FGR 1 27612064 27635277 -1
6 ENSG00000000971 134370 gene 3075 CFH 1 196651878 196747504 1
#create a table that resembles a BED file
my_bed <- data.frame(chr=merged_data$chromosome_name,
start=merged_data$start_position,
end=merged_data$end_position,
id=paste(merged_data$ensembl_gene_id, merged_data$omim,sep="/"),
score=rep(0,nrow(merged_data)),
strand=merged_data$strand)
head(my_bed)
chr start end id score strand
1 X 100627109 100639991 ENSG00000000003/300191 0 -1
2 X 100584802 100599885 ENSG00000000005/300459 0 1
3 20 50934867 50958555 ENSG00000000419/603503 0 -1
4 1 169849631 169894267 ENSG00000000457/608192 0 -1
5 1 27612064 27635277 ENSG00000000938/164940 0 -1
6 1 196651878 196747504 ENSG00000000971/134370 0 1
#perform some cosmetics
my_bed$strand <- gsub(pattern = "-1",replacement = "-",x = my_bed$strand)
my_bed$strand <- gsub(pattern = "1",replacement = "+",x = my_bed$strand)
my_bed <- my_bed[order(my_bed[,'chr'],my_bed[,'start']),]
my_bed$chr <- paste('chr', my_bed$chr, sep = '')
head(my_bed)
chr start end id score strand
12296 chr1 944204 959309 ENSG00000188976/610770 0 -
12156 chr1 975204 982093 ENSG00000187642/615921 0 -
12219 chr1 998962 1000172 ENSG00000188290/608060 0 -
12149 chr1 1001138 1014541 ENSG00000187608/147571 0 +
12206 chr1 1020123 1056118 ENSG00000188157/103320 0 +
13248 chr1 1167092 1167202 ENSG00000207730/612091 0 +
write.table(x = my_bed,
file = 'omim_to_ensembl.bed',
quote = F,
sep = "\t",
row.names = F,
col.names = F)
Check out the BED file
head omim_to_ensembl.bed chr1 944204 959309 ENSG00000188976/610770 0 - chr1 975204 982093 ENSG00000187642/615921 0 - chr1 998962 1000172 ENSG00000188290/608060 0 - chr1 1001138 1014541 ENSG00000187608/147571 0 + chr1 1020123 1056118 ENSG00000188157/103320 0 + chr1 1167092 1167202 ENSG00000207730/612091 0 + chr1 1167846 1167956 ENSG00000207607/612090 0 + chr1 1169005 1169087 ENSG00000198976/612094 0 + chr1 1203508 1206691 ENSG00000186891/603905 0 - chr1 1211326 1214138 ENSG00000186827/600315 0 - gzip omim_to_ensembl.bed

This work is licensed under a Creative Commons
Attribution 4.0 International License.
Excellent post! A couple of questions:
– Is there an automated way to connect OMIM IDs with disease names?
– And, from there, output sets of disease-specific genes into disease-specific files, like ‘migraine.bed’?
And, a couple of cosmetics:
– Code chunk “Now that the data is loaded into R, we can use biomaRt.”, line 32. Adding ‘attributes <- listAttributes(ensembl)' will make the following code working
– Code chunk "Lastly, we need to connect the gene coordinates to our the OMIM to gene table." – lines 24 and 25 are duplicated
Thanks for going through the code! I believe the answer may lie at the OMIM API (http://www.omim.org/help/api), which I haven’t gone through. My next post!