Fetching Illumina probe ID information

Check which annotation package you need for your microarray data at http://www.bioconductor.org/packages/release/data/annotation/

The results I'm looking at are from a Illumina HumanHT12v3 BeadChip microarray (illuminaHumanv3.db).

Make sure you're running the latest version of R (R version 2.13.0 (2011-04-13)) and Bioconductor (Bioconductor version 2.8).

source("http://bioconductor.org/biocLite.R")
biocLite("illuminaHumanv3.db")
library("illuminaHumanv3.db")
ids = c("ILMN_1762337", "ILMN_2055271", "ILMN_1736007", "ILMN_2383229", "ILMN_1806310")
mget(ids, illuminaHumanv3GENOMICLOCATION)
#$ILMN_1762337
#[1] "chr7:20180663:20180712:-"
#
#$ILMN_2055271
#[1] "chr19:58856730:58856779:-"
#
#$ILMN_1736007
#[1] "chr19:58857369:58857418:-"
#
#$ILMN_2383229
#[1] "chr10:52566587:52566636:-"
#
#$ILMN_1806310
#[1] "chr10:52566496:52566545:-"

mget(ids, illuminaHumanv3PROBESEQUENCE)
#$ILMN_1762337
#[1] "GTGTTACAAGACCTTCAGTCAGCTTTGGACAGAATGAAAAACCCTGTGAC"
#
#$ILMN_2055271
#[1] "GGGATTACAGGGGTGAGCCACCACGCCCAGCCCCAGCTTAGTTTTTTAAA"
#
#$ILMN_1736007
#[1] "GCAGAGCTGGACGCTGTGGAAATGGCTGGATTCCTCTGTGTTCTTTCCCA"
#
#$ILMN_2383229
#[1] "TGCTGTCCCTAATGCAACTGCACCCGTGTCTGCAGCCCAGCTCAAGCAAG"
#
#$ILMN_1806310
#[1] "GAGGTCTACCCAACTTTTGCAGTGACTGCCCGAGGGGATGGATATGGCAC"
mget(ids, illuminaHumanv3GENENAME)
#$ILMN_1762337
#[1] "metastasis associated in colon cancer 1"
#
#$ILMN_2055271
#[1] NA
#
#$ILMN_1736007
#[1] NA
#
#$ILMN_2383229
#[1] "APOBEC1 complementation factor"
#
#$ILMN_1806310
#[1] "APOBEC1 complementation factor"

Sourced from Bioconductor mailing list thread -> [BioC] [Fwd: help in annotation ILLUMINA probes]

Check whether two genes belong to the same pathway

Updated: 2013 December 16th to include the Reactome database

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("org.Hs.eg.db")

#load library
library("org.Hs.eg.db")
 
twoGenes <- c("LRRK2","PINK1")
#convert them to Entrez Gene ids
twoGenesEG <- unlist(mget(twoGenes, org.Hs.egSYMBOL2EG))
twoGenesEGkegg <- mget(twoGenesEG, org.Hs.egPATH)
commonKegg <- intersect(twoGenesEGkegg[[1]], twoGenesEGkegg[[2]])
print(commonKegg)
#[1] "05012"
#what are these pathways

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("KEGG.db")

library("KEGG.db")
mget(commonKegg, KEGGPATHID2NAME)
#$`05012`
#[1] "Parkinson's disease"

Source: Bioconductor mailing list: thread [BioC] How to quikly know whether two genes are in the same pathway or not?

Using reactome

If you ran the above code, you should get this warning:

KEGG.db contains mappings based on older data because the original
resource was removed from the the public domain before the most
recent update was produced. This package should now be considered
deprecated and future versions of Bioconductor may not have it
available. Users who want more current data are encouraged to look
at the KEGGREST or reactome.db packages

Here I use the Reactome database instead:

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("reactome.db")

#load library
library("reactome.db")

#You can learn what objects this package supports
#with the following command:
ls("package:reactome.db")
# [1] "reactome"              "reactome.db"           "reactome_dbconn"      
# [4] "reactome_dbfile"       "reactome_dbInfo"       "reactome_dbschema"    
# [7] "reactomeEXTID2PATHID"  "reactomeGO2REACTOMEID" "reactomeMAPCOUNTS"    
#[10] "reactomePATHID2EXTID"  "reactomePATHID2NAME"   "reactomePATHNAME2ID"  
#[13] "reactomeREACTOMEID2GO"

#reactomeEXTID2PATHID
#an annotation data object that maps Entrez Gene identifiers
#to Reactome pathway identifiers

#convert to list
xx <- as.list(reactomeEXTID2PATHID)
#check contents of xx
head(xx, 1)
#$`10`
#[1] "211859"  "1430728" "156580"  "156582"

#another way of selecting the Reactome IDs
#find which keytypes can be selected
keytypes(reactome.db)
#[1] "ENTREZID"   "GO"         "PATHNAME"   "PATHID"     "REACTOMEID"

AnnotationDbi::select(reactome.db,
                      keys=10, 
                      keytype="ENTREZID",
                      columns=c("ENTREZID","REACTOMEID")
                     )
#  ENTREZID REACTOMEID
#1       10    1430728
#2       10     156580
#3       10     156582
#4       10     211859

#obtain pathway names
AnnotationDbi::select(reactome.db,
                      keys=10, 
                      keytype="ENTREZID",
                      columns=c("ENTREZID","REACTOMEID","PATHNAME")
                     )
#  ENTREZID REACTOMEID                            PATHNAME
#1       10    1430728            Homo sapiens: Metabolism
#2       10     156580  Homo sapiens: Phase II conjugation
#3       10     156582           Homo sapiens: Acetylation
#4       10     211859 Homo sapiens: Biological oxidations

#two keys
AnnotationDbi::select(reactome.db,
                      keys=c(10, 100), 
                      keytype="ENTREZID",
                      columns=c("ENTREZID","REACTOMEID","PATHNAME")
                     )

#  ENTREZID REACTOMEID                                PATHNAME
#1       10    1430728                Homo sapiens: Metabolism
#2       10     156580      Homo sapiens: Phase II conjugation
#3       10     156582               Homo sapiens: Acetylation
#4       10     211859     Homo sapiens: Biological oxidations
#5      100    1430728                Homo sapiens: Metabolism
#6      100      15869 Homo sapiens: Metabolism of nucleotides
#7      100      73847         Homo sapiens: Purine metabolism
#8      100      74217            Homo sapiens: Purine salvage

#common pathway between Entrez Gene id 10 and 100
first <- xx[grep("^10$", names(xx), perl=T)]
second <- xx[grep("^100$", names(xx), perl=T)]
intersect(first[[1]], second[[1]])
[1] "1430728"

#reactomePATHID2NAME
#An annotation data object that maps
#Reactome pathway identifiers to Reactome pathway names

yy <- as.list(reactomePATHID2NAME)
yy[grep("^1430728$", names(yy), perl=T)]
$`1430728`
[1] "Homo sapiens: Metabolism"

#If you have gene symbols convert them to Entrez Gene ids like so:

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("org.Hs.eg.db")
#load library
library("org.Hs.eg.db")
 
twoGenes <- c("LRRK2","PINK1")
#convert gene symbol to Entrez Gene ids
twoGenesEG <- unlist(mget(twoGenes, org.Hs.egSYMBOL2EG))
twoGenesEG
#   LRRK2    PINK1 
#"120892"  "65018"

Visualising FANTOM3 human CAGE data

First download the relevant tables from here. Download tss_library_expr_summary.tsv.bz2, tss_summary.tsv.bz2 and rna_lib_summary.tsv.bz2 and bunzip2 the files.

According to information derived from rna_lib_summary.tsv there are 13,897 CAGE tags from the prostate gland (rna_lib_id = HBA).

Double check this:

cat tss_library_expr_summary.tsv | awk '$3=="HBA" {print}' | cut -f5 | perl -e 'while(<>){ chomp; $i+=$_; } print "$i\n";'
13897

cat tss_library_expr_summary.tsv | awk '$3=="HBA" {print}' | cut -f2 | sort -u | wc -l
7029

Genomic coordinates of TSSs can be obtained from tss_summary.tsv. Here's a script which generates bedgraph files of all the CAGE tags for each tissue. Bugs may be present in the code; use at your own risk. Note I commented out strictures because I wanted to use scalars as filehandles (which is ambiguous code).

#!/usr/bin/perl

#use strict;
use warnings;

my $mapping = 'tss_summary.tsv';
my $tss = 'tss_library_expr_summary.tsv';
my $library = 'rna_lib_summary.tsv';

my %library = ();
open(IN,'<',$library) || die "Could not open $library: $!\n";
while(){
   chomp;
   my ($lib,$junk,$junk2,$junk3,$desc,@junk) = split(/\t/);
   next unless $lib =~ /^H/;
   $library{$lib} = $desc;
}
close(IN);

my %mapping = ();

open(IN,'<',$mapping) || die "Could not open $mapping: $!\n";
while(){
   chomp;
   my ($tss_id,$junk,$junk2,$chr,$junk3,$strand,$start,$end,@junk) = split(/\t/);
   if ($strand eq 'F'){
      $strand = '+';
   } else {
      $strand = '-';
   }
   $mapping{$tss_id}->{'CHR'} = $chr;
   $mapping{$tss_id}->{'STRAND'} = $strand;
   $mapping{$tss_id}->{'START'} = $start;
   $mapping{$tss_id}->{'END'} = $end;
}
close(IN);

foreach my $lib (keys %library){

   my $library = $lib;
   open($lib,'>',"${lib}.bed") || die "Could not open $lib.bed for writing: $!\n";

print $lib <track type=bedGraph name="$library{$library}" description="FANTOM3 CAGE $library{$library}" visibility=full color=0,0,255 altColor=255,0,0 priority=20
EOF

}

open(IN,'<',$tss) || die "Could not open $tss: $!\n";
while(){
   chomp;
   my ($tss_rna_lib_id,$tss_id,$rna_lib_id,$priming,$tss_tags,$tpm) = split(/\t/);
   next unless $rna_lib_id =~ /^H/;
   if (exists $mapping{$tss_id}){
      my $chr = $mapping{$tss_id}->{'CHR'};
      my $strand = $mapping{$tss_id}->{'STRAND'};
      my $start = $mapping{$tss_id}->{'START'};
      my $end = $mapping{$tss_id}->{'END'};
      if ($strand eq '-'){
         $tss_tags = '-' . $tss_tags;
      }
      print $rna_lib_id join("\t",$chr,$start,$end,$tss_tags),"\n";
   } else {
      warn "Missing information for $tss_id\n";
   }
}
close(IN);

exit(0);

This will generate 42 bedgraph files.

Double check:

cat HBA.bed | cut -f4 | sed 's/-//' | perl -e 'while(<>){ chomp; $i+=$_; } print "$i\n";'
13897

Loaded two of the bedgraphs into the UCSC Genome Browser.

The Normal Distribution

Updated 2014 December 19th

The normal or Gaussian distribution is commonly occurring continuous probability distribution. The skewness, which is a measure of symmetry (or there lackof), of a normal distribution is zero since the distribution is symmetrical, i.e. it looks the same to the left and right of the centre. Kurtosis can be used to measure the shape of a normal distribution; a high kurtosis indicates that the normal distribution has a distinct peak near the mean and a low kurtosis indicates a flat distribution.

Continue reading

Extract gene names according to GO terms

Find genes that are associated with a known gene ontology term:

#install if you don't have org.Hs.eg.db and GO.db
source("http://bioconductor.org/biocLite.R")
biocLite("org.Hs.eg.db")
biocLite("GO.db")
library(org.Hs.eg.db)
library(GO.db)

go_id = GOID( GOTERM[ Term(GOTERM) == "chromatin remodeling"])

get(go_id, org.Hs.egGO2ALLEGS)

     IEA      IEA      IDA      IEA      IEA      IDA      TAS       IC
    "86"    "473"    "664"    "676"   "1105"   "2186"   "2648"   "3065"
      IC      ISS      IEA      ISS      TAS      IDA      IEA      TAS
  "3066"   "3070"   "4221"   "4678"   "5925"   "5928"   "6594"   "6595"
     IDA      IEA      IDA      IDA      IMP      TAS      NAS      NAS
  "6597"   "6598"   "6599"   "6601"   "6602"   "6827"   "6829"   "6830"
     IEA      IDA      TAS      IEA      TAS      IEA      NAS      IEA
  "6927"   "7141"   "7141"   "7270"   "8289"   "8467"   "8850"   "9031"
     IDA      NAS      IDA      TAS      IDA      TAS      IEA      NAS
  "9557"   "9757"   "9759"  "10014"  "10361"  "10629"  "10661"  "11176"
     NAS      IMP      IEA      IDA      IMP      NAS      IDA      NAS
 "11335"  "22893"  "23314"  "23411"  "50511"  "50943"  "51773"  "54108"
     IDA      TAS      IMP      NAS      IMP      IDA      NAS      IEA
 "54617"  "55193"  "55355"  "57492"  "57680"  "79723"  "84181" "150572"
     IMP      NAS
"201161" "373861"

allegs = get(go_id, org.Hs.egGO2ALLEGS)

genes = unlist(mget(allegs,org.Hs.egSYMBOL))

genes
       86       473       664       676      1105      2186      2648      3065
 "ACTL6A"    "RERE"   "BNIP3"    "BRDT"    "CHD1"    "BPTF"   "KAT2A"   "HDAC1"
     3066      3070      4221      4678      5925      5928      6594      6595
  "HDAC2"   "HELLS"    "MEN1"    "NASP"     "RB1"   "RBBP4" "SMARCA1" "SMARCA2"
     6597      6598      6599      6601      6602      6827      6829      6830
"SMARCA4" "SMARCB1" "SMARCC1" "SMARCC2" "SMARCD1" "SUPT4H1"  "SUPT5H"  "SUPT6H"
     6927      7141      7141      7270      8289      8467      8850      9031
  "HNF1A"    "TNP1"    "TNP1"    "TTF1"  "ARID1A" "SMARCA5"   "KAT2B"   "BAZ1B"
     9557      9757      9759     10014     10361     10629     10661     11176
  "CHD1L"    "MLL4"   "HDAC4"   "HDAC5"    "NPM2"   "TAF6L"    "KLF1"   "BAZ2A"
    11335     22893     23314     23411     50511     50943     51773     54108
   "CBX3"   "BAHD1"   "SATB2"   "SIRT1"   "SYCP3"   "FOXP3"    "RSF1"  "CHRAC1"
    54617     55193     55355     57492     57680     79723     84181    150572
  "INO80"   "PBRM1"   "HJURP"  "ARID1B"    "CHD8" "SUV39H2"    "CHD6"   "SMYD1"
   201161    373861
  "CENPV"   "HILS1"

See also: org.Hs.eg.db

Source: bioconductor mailing list thread -> [BioC] Query Gene Ontology