Using the Bioconductor annotation packages

Another post related to this course I'm going through (I can't link it enough times). I have almost finished with the first day of the course and couldn't resist writing about this lecture on using the Bioconductor annotation packages. I had not realised that the annotation packages could be queried (pardon my ignorance) in the same manner as using SQL statements. Please see the lecture for more elaboration.

#install the Bioconductor annotation package if necessary
source("http://bioconductor.org/biocLite.R")
biocLite(org.Hs.eg.db)

#load the human Entrez gene identifiers package
library(org.Hs.eg.db)

#find out the columns of this annotation database
#imagine a SQL table, these are the columns
cols(org.Hs.eg.db)
 [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"      "ACCNUM"       "ALIAS"        "CHR"         
 [8] "CHRLOC"       "CHRLOCEND"    "ENZYME"       "MAP"          "PATH"         "PMID"         "REFSEQ"      
[15] "SYMBOL"       "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"     "UNIPROT"     
[22] "GO"           "EVIDENCE"     "ONTOLOGY"     "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL"  "OMIM"        
[29] "UCSCKG"

#find out the columns we can use to search the database
keytypes(org.Hs.eg.db)
 [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"      "ACCNUM"       "ALIAS"        "CHR"         
 [8] "CHRLOC"       "CHRLOCEND"    "ENZYME"       "MAP"          "PATH"         "PMID"         "REFSEQ"      
[15] "SYMBOL"       "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"     "UNIPROT"     
[22] "GO"           "EVIDENCE"     "ONTOLOGY"     "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL"  "OMIM"        
[29] "UCSCKG"

#which chromosome does the gene TP53 reside?
select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL", "CHR"), keytype="SYMBOL")
  SYMBOL CHR
1   TP53  17

#find out the keys for the keytype CHR
keys(org.Hs.eg.db, "CHR")
 [1] "19" "12" "8"  "14" "3"  "2"  "17" "16" "9"  "X"  "6"  "1"  "7"  "10" "11" "22" "5"  "18" "15" "Y"  "20"
[22] "21" "4"  "13" "MT" "Un"

#this question was asked in the lecture
#How many genes are there on chromosome 22 are in this annotation database?
#store all symbols
symbol <- keys(org.Hs.eg.db, "SYMBOL")
#how many gene symbols
length(symbol)
[1] 43819
#distribution of gene symbols along the chromosomes
symbol_chr <- select(org.Hs.eg.db, keys=symbol, cols=c("CHR","SYMBOL"), keytype="SYMBOL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
  'select' resulted in 1:many mapping between keys and return rows
#the above warning is for duplicated rows
#double check how many gene symbols are in symbol_chr
length(symbol_chr$SYMBOL)
[1] 43868
#unique ones
length(unique(symbol_chr$SYMBOL))
[1] 43819
#distribution of symbols on chromosomes
table(symbol_chr$CHR)

   1   10   11   12   13   14   15   16   17   18   19    2   20   21   22    3    4    5    6    7    8 
4173 1604 2524 1945 1124 1679 1475 1597 2079  674 2245 2812 1017  546 1029 2306 1702 1887 2432 2307 1586 
   9   MT   Un    X    Y 
1790   37  219 2087  535

plot(table(symbol_chr$CHR), xlab="Chromosomes", ylab="Number of genes")

org_hs_eg_db_chr_geneYes I should have sorted the chromosomes.


#Find the GO IDs for the TP53 gene
tp53_go <- select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL","GO"), keytype="SYMBOL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
  'select' resulted in 1:many mapping between keys and return rows
head(tp53_go)
  SYMBOL         GO EVIDENCE ONTOLOGY
1   TP53 GO:0000060      IEA       BP
2   TP53 GO:0000075      TAS       BP
3   TP53 GO:0000122      IBA       BP
4   TP53 GO:0000122      IDA       BP
5   TP53 GO:0000122      ISS       BP
6   TP53 GO:0000733      IDA       BP
dim(tp53_go)
[1] 136   4
#Hmm that warning message is worrying
#especially since each row is unique
dim(unique(tp53_go))
[1] 136   4

#let's see what the warning message is about
#which GO terms are duplicated?
dup <- tp53_go$GO[duplicated(tp53_go$GO)]
dup
 [1] "GO:0000122" "GO:0000122" "GO:0005654" "GO:0005829" "GO:0006915" "GO:0007050" "GO:0030330" "GO:0043066"
 [9] "GO:0045893" "GO:0045944" "GO:0045944"
#make dup a dataframe
dup_df <- data.frame(GO=dup)
#merge tp53_go with dup_df
merge(x=tp53_go, y=dup_df, by="GO")
           GO SYMBOL EVIDENCE ONTOLOGY
1  GO:0000122   TP53      ISS       BP
2  GO:0000122   TP53      ISS       BP
3  GO:0000122   TP53      IBA       BP
4  GO:0000122   TP53      IBA       BP
5  GO:0000122   TP53      IDA       BP
6  GO:0000122   TP53      IDA       BP
7  GO:0005654   TP53      IDA       CC
8  GO:0005654   TP53      TAS       CC
9  GO:0005829   TP53      IDA       CC
10 GO:0005829   TP53      IBA       CC
11 GO:0006915   TP53      IDA       BP
12 GO:0006915   TP53      IMP       BP
13 GO:0007050   TP53      IMP       BP
14 GO:0007050   TP53      IDA       BP
15 GO:0030330   TP53      IDA       BP
16 GO:0030330   TP53      IMP       BP
17 GO:0043066   TP53      IDA       BP
18 GO:0043066   TP53      IEA       BP
19 GO:0045893   TP53      IDA       BP
20 GO:0045893   TP53      IMP       BP
21 GO:0045944   TP53      IDA       BP
22 GO:0045944   TP53      IDA       BP
23 GO:0045944   TP53      IGI       BP
24 GO:0045944   TP53      IGI       BP
25 GO:0045944   TP53      IMP       BP
26 GO:0045944   TP53      IMP       BP

#dup_df has duplicated GO terms hence merge returned duplicated rows
#which you can see below
dup_df
           GO
1  GO:0000122
2  GO:0000122
3  GO:0005654
4  GO:0005829
5  GO:0006915
6  GO:0007050
7  GO:0030330
8  GO:0043066
9  GO:0045893
10 GO:0045944
11 GO:0045944

#So perhaps we got a warning because of the duplicated GO terms
#which are there because those GO terms were supported by different evidence

#let's try use biomaRt to compare results
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")

#for more informatoin on biomaRt see my older post http://davetang.org/muse/2012/04/27/learning-to-use-biomart/
library("biomaRt")
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
#find out the Ensembl id of TP53
select(org.Hs.eg.db, keys="TP53", cols=c("SYMBOL","ENSEMBL"), keytype="SYMBOL")
  SYMBOL         ENSEMBL
1   TP53 ENSG00000141510

#search GO terms associated with ENSG00000141510
getBM(attributes="go_id", filters="ensembl_gene_id", values = 'ENSG00000141510', mart = ensembl)
tp53_go_biomart <- getBM(attributes="go_id", filters="ensembl_gene_id", values = 'ENSG00000141510', mart = ensembl)

length(tp53_go_biomart)
[1] 135

#order the vector
tp53_go_biomart <- tp53_go_biomart[order(tp53_go_biomart)]

#create comparison vector of GO terms collected from org.Hs.eg.db
comparison <- tp53_go$GO
comparison <- comparison[order(comparison)]

#how many are exclusive of each other
setdiff(tp53_go_biomart,comparison)
 [1] "GO:0000739" "GO:0005626" "GO:0005678" "GO:0006351" "GO:0006979" "GO:0007049" "GO:0008629" "GO:0009411"
 [9] "GO:0009792" "GO:0042127" "GO:0042493" "GO:0043523" "GO:0044419" "GO:0046872" "GO:0051726" "GO:0070215"

#how many overlap
length(intersect(tp53_go_biomart,comparison))
[1] 119

#what if I search the annotation package
#using the Ensembl id

tp53_go_2 <- select(org.Hs.eg.db, keys="ENSG00000141510", cols=c("ENSEMBL","GO"), keytype="ENSEMBL")
Warning message:
In .generateExtraRows(tab, keys, jointype) :
  'select' resulted in 1:many mapping between keys and return rows

setdiff(tp53_go_2$GO,tp53_go$GO)
character(0)

I am not sure of the differences between the GO terms fetched using the Bioconductor annotation package and fetched using biomaRt. I initially thought it was because I was searching using the gene symbols but a search using the Ensembl ID returned the same result. As this post was a demonstration of how to query the Bioconductor annotation packages, I didn't delve into this inconsistency.

In conclusion, I think the Bioconductor annotation packages provide a very valuable resource with many useful annotation packages, especially if you're working with microarrays. And I just found out that the FANTOM4 set of promoters are also there.

Posted in R | Tagged , | Leave a comment

Using aggregate and apply in R

I recently came across a course on data analysis and visualisation and now I'm gradually going through each lecture. I just finished following the second lecture and the section "Working with dataframes and vectors efficiently" introduced to me the function called aggregate, which I can see as being extremely useful. In this post, I will write about aggregate, apply, lapply and sapply, which were also introduced in the lecture.

To get started, first download the chicken weights comma delimited file. This dataset is actually already available in the R datasets (see ?ChickWeight) and preformatted. Since we don't always (almost never) get data conveniently formatted for us, I will work with the csv file available at the above link.

#load data
data <- read.table("chick_weight.csv", header=T, sep=",")
head(data)
  weight Time Chick Diet
1     42    0     1    1
2     51    2     1    1
3     59    4     1    1
4     64    6     1    1
5     76    8     1    1
6     93   10     1    1
#dimension of the data
dim(data)
[1] 578   4
#how many chickens
unique(data$Chick)
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
[36] 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#how many diets
unique(data$Diet)
[1] 1 2 3 4
#how many time points
unique(data$Time)
 [1]  0  2  4  6  8 10 12 14 16 18 20 21
library(ggplot2)
ggplot(data=data, aes(x=Time, y=weight, group=Chick, colour=Chick)) + geom_line() + geom_point()

chick_time_weight_lineOver time the chickens got heavier.

#actually the line plot above
#treated the individual chickens as an interger
#whereas the numbers were used as identifiers
#coerce the intergers into characters
data$Chick <- as.character(data$Chick)
head(data$Chick)
[1] "1" "1" "1" "1" "1" "1"
ggplot(data=data, aes(x=Time, y=weight, group=Chick, colour=Chick)) + geom_line() + geom_point()

chick_time_weight_line_as_char

Now to use aggregate

#find the mean weight depending on diet
#use aggregate
aggregate(data$weight, list(diet = data$Diet), mean)
  diet        x
1    1 102.6455
2    2 122.6167
3    3 142.9500
4    4 135.2627

#aggregate on time
aggregate(data$weight, list(time=data$Time), mean)
   time         x
1     0  41.06000
2     2  49.22000
3     4  59.95918
4     6  74.30612
5     8  91.24490
6    10 107.83673
7    12 129.24490
8    14 143.81250
9    16 168.08511
10   18 190.19149
11   20 209.71739
12   21 218.68889

#use a different function
aggregate(data$weight, list(time=data$Time), sd)
   time         x
1     0  1.132272
2     2  3.688316
3     4  4.495179
4     6  9.012038
5     8 16.239780
6    10 23.987277
7    12 34.119600
8    14 38.300412
9    16 46.904079
10   18 57.394757
11   20 66.511708
12   21 71.510273

#we could also aggregate on time and diet
head(aggregate(data$weight, list(time = data$Time, diet = data$Diet), mean))
  time diet        x
1    0    1 41.40000
2    2    1 47.25000
3    4    1 56.47368
4    6    1 66.78947
5    8    1 79.68421
6   10    1 93.05263
tail(aggregate(data$weight, list(time = data$Time, diet = data$Diet), mean))
   time diet        x
43   12    4 151.4000
44   14    4 161.8000
45   16    4 182.0000
46   18    4 202.9000
47   20    4 233.8889
48   21    4 238.5556
#to see the weights over time across different diets
ggplot(data) + geom_line(aes(x=Time, y=weight, colour=Chick)) + facet_wrap(~Diet) + guides(col=guide_legend(ncol=3))

weight_time_diet

Now there's this very informative post on using apply in R. However, I tend to forget which specific apply function to use. In lecture 2 of the course, apply was introduced, and to reinforce my own understanding I'll provide the examples here.


?apply
#apply functions over array margins
#apply(X, MARGIN, FUN, ...)
#make up some dataframe
df <- data.frame(first = c(1:10), second = c(11:20))
df
   first second
1      1     11
2      2     12
3      3     13
4      4     14
5      5     15
6      6     16
7      7     17
8      8     18
9      9     19
10    10     20
#2 is for columns
apply(df, 2, mean)
 first second 
   5.5   15.5
#1 is for rows
apply(df, 1, mean)
 [1]  6  7  8  9 10 11 12 13 14 15

#write function to sample 10 numbers
#from a Poisson distribution according to lambda
f <- function(l){
   rpois(10, l)
}
f(10)
 [1] 10  3 14 13 13 12  8  8 13 12

#lapply = apply a function over a list
#for reproducibility
set.seed(123)
#save into draws
draws <- lapply(1:5,f)
draws
[[1]]
 [1] 0 2 1 2 3 0 1 2 1 1

[[2]]
 [1] 5 2 3 2 0 4 1 0 1 5

[[3]]
 [1] 5 4 3 8 4 4 3 3 2 1

[[4]]
 [1] 8 7 5 6 1 4 5 2 3 2

[[5]]
 [1] 3 4 4 4 3 3 3 5 4 7

sapply(draws, mean)
[1] 1.3 2.3 3.7 4.3 4.0
#difference with lapply?
#lapply always returns a list. sapply (if it can) simplifies the results
lapply(draws,mean)
[[1]]
[1] 1.3

[[2]]
[1] 2.3

[[3]]
[1] 3.7

[[4]]
[1] 4.3

[[5]]
[1] 4
#get same result as sapply
unlist(lapply(draws,mean))
[1] 1.3 2.3 3.7 4.3 4.0

I'm only onto the third lecture and have already picked up some cool tricks. Here is the link to the course again if you want to follow it.

Posted in R | Tagged | Leave a comment

Creating plots using the xkcd package in R

xkcd styled graphs using the xkcd package in R. Steps done on R version 3.0.1 (2013-05-16) and on Windows, i386-w64-mingw32/i386 (32-bit). Steps followed are from the xkcd-intro.pdf file i.e. the xkcd vignette.

install.packages("xkcd")
library(xkcd)
library(extrafont)
library(ggplot2)

#do you have xkcd fonts?
if( "xkcd" %in% fonts()) {
   p <- ggplot() + geom_point(aes(x=mpg, y=wt), data=mtcars) +
   theme(text = element_text(size = 16, family = "xkcd"))
} else {
   warning("Not xkcd fonts installed!")
   p <- ggplot() + geom_point(aes(x=mpg, y=wt), data=mtcars)
}
#Warning message:
#Not xkcd fonts installed!

#download the font and install it
#http://simonsoftware.se/other/xkcd.ttf
#then import all fonts
font_import()
#load all fonts
loadfonts()
#Despite loading all fonts
#I had to restart R to get the fonts loaded properly

#opens the xkcd-intro.pdf
vignette("xkcd-intro")

xrange <- range(mtcars$mpg)
yrange <- range(mtcars$wt)
set.seed(123) # for reproducibility
p <- ggplot() + geom_point(aes(mpg, wt), data=mtcars) + xkcdaxis(xrange,yrange)
p

mtcars_scatterplot

Cartoon characters

To include cartoon characters in the graph, use the xkcdman() function.

ratioxy <- diff(xrange)/diff(yrange)
mapping <- aes(x, y, scale, ratioxy, angleofspine ,anglerighthumerus, anglelefthumerus, anglerightradius, angleleftradius, anglerightleg, angleleftleg, angleofneck, linetype=city)
dataman <- data.frame(x= c(15,30), y=c(3, 4), scale = c(0.3,0.51), ratioxy = ratioxy, angleofspine = -pi/2, anglerighthumerus = c(pi/4, -pi/6), anglelefthumerus = c(pi/2 + pi/4, pi +pi/6), anglerightradius = c(pi/3, -pi/3), angleleftradius = c(pi/3, -pi/3), anglerightleg = 3*pi/2 - pi / 12, angleleftleg = 3*pi/2 + pi / 12, angleofneck = runif(1, 3*pi/2-pi/10, 3*pi/2+pi/10), city=c("Liliput","Brobdingnag"))
q <- ggplot() + geom_point(aes(mpg, wt, colour=as.character(vs)), data=mtcars) + xkcdaxis(xrange,yrange) + xkcdman(mapping, dataman)

xkcdman

Scatterplot


volunteers <- data.frame(year=c(2007:2011), number=c(56470, 56998, 59686, 61783, 64251))
xrange <- range(volunteers$year)
yrange <- range(volunteers$number)
ratioxy <- diff(xrange) / diff(yrange)
datalines <- data.frame(xbegin=c(2008.3,2010.5),ybegin=c(63000,59600), xend=c(2008.5,2010.3), yend=c(63400,59000))
p <- ggplot() + geom_smooth(mapping=aes(x=year, y =number), data =volunteers, method="loess") + xkcdaxis(xrange,yrange) + ylab("Volunteers at Caritas Spain")
p

volunteer

Bar charts


data <- data.frame(year=c(2007:2011), number=c(56470, 56998, 59686, 61783, 64251))
data$xmin <- data$year - 0.1
data$xmax <- data$year + 0.1
data$ymin <- 50000
data$ymax <- data$number
xrange <- range(min(data$xmin)-0.1, max(data$xmax) + 0.1)
yrange <- range(min(data$ymin)+500, max(data$ymax) + 1000)
mapping <- aes(xmin=xmin,ymin=ymin,xmax=xmax,ymax=ymax)
p <- ggplot() + xkcdrect(mapping,data) + xkcdaxis(xrange,yrange) + xlab("Year") + ylab("Volunteers at Caritas Spain")
p

barchart

Barchart of piRNA lengths from NONCODE

Previously I had used the SeqinR package in R to get the length distribution of all human piRNAs in the NONCODE database. Here a produce a xkcd styled bar chart of the piRNA lengths.


data <- data.frame(size = c(26,27,28,29,30,31,32), freq = c(3552, 3540, 4127, 7166, 8216, 4410, 1141))
data
  size freq
1   26 3552
2   27 3540
3   28 4127
4   29 7166
5   30 8216
6   31 4410
7   32 1141

data$xmin <- data$size - 0.1
data$xmax <- data$size + 0.1
data$ymin <- 0
data$ymax <- data$freq
xrange <- range(min(data$xmin)-0.1, max(data$xmax) + 0.1)
yrange <- range(min(data$ymin)+500, max(data$ymax) + 1000)
mapping <- aes(xmin=xmin,ymin=ymin,xmax=xmax,ymax=ymax)
p <- ggplot() + xkcdrect(mapping,data) + xkcdaxis(xrange,yrange) + xlab("piRNA size") + ylab("Frequency")
p

pirna_size

See more

More information on the xkcd package in R

Posted in fun | Tagged , | Leave a comment

Singular Vector Decomposition using R

In linear algebra terms, a Singular Vector Decomposition (SVD) is the decomposition of a matrix X into three matrices, each having special properties. If X is a matrix with each variable in a column and each observation in a row then the SVD is

X = UDV^T

where the columns of U are orthogonal (left singular vectors), the columns of V are orthogonal (right singluar vectors) and D is a diagonal matrix (singular values). Here I perform a SVD on the iris dataset in R.

#use the iris dataset
#for more info type ?iris
names(iris)
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa

#perform hierarchical clustering
h <- hclust(dist(iris[,c(1:4)]))

#inside the h object
names(h)
[1] "merge"       "height"      "order"       "labels"      "method"      "call"       
[7] "dist.method"

#order corresponds to the hierarchical clustering order on iris rows
h$order
  [1] 108 131 103 126 130 119 106 123 118 132 110 136 141 145 125 121 144 101 137 149 116 111 148 113 140
 [26] 142 146 109 104 117 138 105 129 133 150  71 128 139 115 122 114 102 143 135 112 147 124 127  73  84
 [51] 134 120  69  88  66  76  77  55  59  78  87  51  53  86  52  57  75  98  74  79  64  92  61  99  58
 [76]  94 107  67  85  56  91  62  72  68  83  93  95 100  89  96  97  63  65  80  60  54  90  70  81  82
[101]  42  30  31  26  10  35  13   2  46  36   5  38  28  29  41   1  18  50   8  40  23   7  43   3   4
[126]  48  14   9  39  17  33  34  15  16   6  19  21  32  37  11  49  45  47  20  22  44  24  27  12  25

#order dataset by hierarchical clustering
data_ordered <- iris[h$order,]

head(data_ordered)
    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
108          7.3         2.9          6.3         1.8 virginica
131          7.4         2.8          6.1         1.9 virginica
103          7.1         3.0          5.9         2.1 virginica
126          7.2         3.2          6.0         1.8 virginica
130          7.2         3.0          5.8         1.6 virginica
119          7.7         2.6          6.9         2.3 virginica

#perform the SVD
svd1 <- svd(data_ordered[,c(1:4)])

#inside the svd1 object are the 3 separate matrices
names(svd1)
[1] "d" "u" "v"

#left singular vectors corresponding to the rows
dim(svd1$u)
[1] 150   4

#right singular vectors corresponding to the columns
dim(svd1$v)
[1] 4 4

#singular vectors
svd1$d
[1] 95.959914 17.761034  3.460931  1.884826

head(svd1$u)
           [,1]       [,2]        [,3]         [,4]
[1,] -0.1054558 0.08012812 -0.10637239 -0.134454550
[2,] -0.1049482 0.07556145 -0.12829465 -0.009698646
[3,] -0.1026729 0.07009470 -0.01813207  0.036368121
[4,] -0.1042575 0.06052312 -0.03846043 -0.125453575
[5,] -0.1020462 0.05482986 -0.11193348 -0.120558750
[6,] -0.1114810 0.11657800 -0.13510080  0.030542780

#plot all left singular vectors
par(mfrow=c(1,4))
plot(svd1$u[,1],1:150,pch=19)
plot(svd1$u[,2],1:150,pch=19)
plot(svd1$u[,3],1:150,pch=19)
plot(svd1$u[,4],1:150,pch=19)
#reset graphical parameter
par(mfrow=c(1,1))

svd_left_vector_irisThe first, second, third and fourth left singular vectors.

How do the original data points look?

#the hierarchical clustering order ordered the species nicely
table(data_ordered[ c(1:50),5])

    setosa versicolor  virginica 
         0          3         47

table(data_ordered[ c(51:100),5])

    setosa versicolor  virginica 
         0         47          3

table(data_ordered[ c(101:150),5])

    setosa versicolor  virginica 
        50          0          0

plot(data_ordered$Sepal.Length,c(1:150),pch=19,xlim=c(0,8.1),xlab="Length")
points(data_ordered$Sepal.Width,c(1:150),col=2, pch=19)
points(data_ordered$Petal.Length,c(1:150),col=3, pch=19)
points(data_ordered$Petal.Width,c(1:150),col=4, pch=19)
abline(h=50)
abline(h=100)

iris_scatterplot_all_variable_ablineEach measurement is coloured in a different colour. The hierarchical clustering order makes it easier to see different properties of different species.

Plotting the first and second left singular vectors and colouring by species.

first <- svd1$u[,1]
second <- svd1$u[,2]
species <- data_ordered$Species
species <- as.numeric(species)
first <- data.frame(first,species)
second <- data.frame(second,species)
plot(first$first, second$second, pch=19, col=first$species, xlab="First left singular vector", ylab="Second left singular vector")

first_second_left_singular_vectorIn green are virginica, red are versicolor and black are setosa.

Variance explained by the number of singular vectors

svd1$d^2/sum(svd1$d^2)
[1] 0.9653029807 0.0330689513 0.0012556535 0.0003724145
plot(svd1$d^2/sum(svd1$d^2), pch=19, xlab="Singluar vector", ylab="Variance explained")

variance_explained_irisOne singular vector is enough to explain 96.5% of the variance.

Obtaining the original matrix

#multiply each singular matrix to obtain the original
original <- svd1$u[,1:4] %*% diag(svd1$d[1:4]) %*% t(svd1$v[,1:4])
head(original)
     [,1] [,2] [,3] [,4]
[1,]  7.3  2.9  6.3  1.8
[2,]  7.4  2.8  6.1  1.9
[3,]  7.1  3.0  5.9  2.1
[4,]  7.2  3.2  6.0  1.8
[5,]  7.2  3.0  5.8  1.6
[6,]  7.7  2.6  6.9  2.3
head(data_ordered)
    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
108          7.3         2.9          6.3         1.8 virginica
131          7.4         2.8          6.1         1.9 virginica
103          7.1         3.0          5.9         2.1 virginica
126          7.2         3.2          6.0         1.8 virginica
130          7.2         3.0          5.8         1.6 virginica
119          7.7         2.6          6.9         2.3 virginica

Summary

Singular Vector Decomposition can reduce a large matrix of values into 3 separate matrices, each having special properties. The right singular vectors are actually the same as the principal components in a PCA (see this article for more information). The left singular vectors correspond to the rows of the matrix and showed that setosa are characteristic different from the other two species. The diagonal matrix D provides the amount of variance explained by the number of singular vectors.

SVD was taught in week 3 of the Data Analysis course provided by coursera. Some of the code in this post was adapted from the dimension reduction lecture; please refer to the lecture for more information.

Posted in Statistics | Tagged | Leave a comment

Fitting a Michaelis-Menten curve using R

Many biological phenomena follow four different types of relationships that include sigmoid, exponential, linear and Michaelis-Menten type relationships. This is just a short post on fitting data to a Michaelis-Menten curve using the drc package for R available in CRAN.

#install if necessary
install.packages("drc")
library(drc)

x <- seq(from=0,to=10e6,length=11)
x
 [1] 0e+00 1e+06 2e+06 3e+06 4e+06 5e+06 6e+06 7e+06 8e+06 9e+06 1e+07
y <- c(0, 61200.5, 102707.0, 138467.5, 171614.0, 202326.5, 230900.5, 258481.5, 284502.6, 309394.2, 333246.3)
df <- data.frame(x,y)
df
       x        y
1  0e+00      0.0
2  1e+06  61200.5
3  2e+06 102707.0
4  3e+06 138467.5
5  4e+06 171614.0
6  5e+06 202326.5
7  6e+06 230900.5
8  7e+06 258481.5
9  8e+06 284502.6
10 9e+06 309394.2
11 1e+07 333246.3

m1 <- drm(y ~ x, data = df, fct = MM.2())
summary(m1)

Model fitted: Michaelis-Menten (2 parms)

Parameter estimates:

                Estimate Std. Error    t-value p-value
d:(Intercept) 8.1177e+05 5.0017e+04 1.6230e+01       0
e:(Intercept) 1.4721e+07 1.3435e+06 1.0957e+01       0

Residual standard error:

 4643.671 (9 degrees of freedom)

plot(m1,log='',xlim=c(0,5e8), ylim=c(0,1e6), xlab="Reads", ylab="Transcripts")

michaelis_mentens_curveThe fitted Michaelis-Menten curve.

The values I used above for constructing the Michaelis-Menten curve were values predicted using a software tool called preseq. Here I show the two curves:

plot(m1,log='',xlim=c(0,5e8), ylim=c(0,1.5e6), xlab="Reads", ylab="Transcripts")
lines(p2$TOTAL_READS1,p2$EXPECTED_DISTINCT1,col=2)

michaelis_mentens_vs_preseqPreseq predicts more detectable transcripts as we sequence deeper.

Conclusions

Last week I was looking at curve fitting in R, and didn't include Michaelis-Menten type curves. Even before that I was looking at discovery plots, which is a plot that shows the number of discovered transcripts/genes as you perform deeper sequencing in a high throughput sequencing experiment. As a silly idea, I thought about fitting a Michaelis-Mentens curve to a discovery plot. Note that the data I used above were projected values using the preseq software I linked above. I will write about preseq in a later post.

See also

A well written manual/tutorial on using the drc (dose-response curves) package in R.

Posted in biology | Tagged | Leave a comment

Coding potential of non-coding RNA

The Coding Potential Assessment Tool (CPAT) was developed to assess the coding potential of RNA sequences without the need of sequence alignment. More information on the tool can be found at their website and corresponding publication.

In this post, I test CPAT by inputting a set of known protein coding RNAs (RefSeqs) and a set of putative non-coding RNAs taken from GENCODE. I use the logit model file and the hexamer frequency table files that were provided with CPAT.

wget ftp://ftp.sanger.ac.uk/pub/gencode/release_16/gencode.v16.lncRNA_transcripts.fa.gz
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz
python ~/bin/CPAT/usr/local/bin/cpat.py -d ~/src/CPAT-1.2.1/dat/Human_train.RData -x ~/src/CPAT-1.2.1/dat/Human_Hexamer.tab -g gencode.v16.lncRNA_transcripts.fa -o lnc_rna_cpat.out
python ~/bin/CPAT/usr/local/bin/cpat.py -d ~/src/CPAT-1.2.1/dat/Human_train.RData -x ~/src/CPAT-1.2.1/dat/Human_Hexamer.tab -g human.rna.fna.gz -o refseq
#stats is available here https://github.com/arq5x/filo
#coding potential of lncRNA
cat lnc_rna_cpat.out | cut -f6 | stats
Total lines:            22444
Sum of lines:           1773.01404739716
Ari. Mean:              0.0789972396808571
Geo. Mean:              0.0157839580319542
Median:                 0.0157583998480091
Mode:                   1 (N=22)
Anti-Mode:              2.22144653496105e-13 (N=1)
Minimum:                2.22144653496105e-13
Maximum:                1
Variance:               0.0332286952048533
StdDev:                 0.182287397273792
#coding potential of mRNA
cat refseq | cut -f6 | stats
Total lines:            47251
Sum of lines:           36677.7025936008
Ari. Mean:              0.776231245764128
Geo. Mean:              0.405030720938695
Median:                 0.999573383225558
Mode:                   1 (N=3820)
Anti-Mode:              2.26544376749313e-13 (N=1)
Minimum:                2.26544376749313e-13
Maximum:                1
Variance:               0.141651908600363
StdDev:                 0.376366720899129

Input into R

refseq <- read.table("refseq",header=T)
lncrna <- read.table("lnc_rna_cpat.out",header=T)
head(refseq)
                              mRNA_size ORF_size Fickett_score Hexamer_score
GI|239744030|REF|XR_017086.3|      3159     1521        1.1446    0.34745117
GI|239745142|REF|XR_040656.2|       747      747        0.9367    0.25149694
GI|239746044|REF|XR_078542.1|      1338      681        0.8234    0.05853078
GI|239746948|REF|XR_037268.2|      1057      429        0.7305    0.00207399
GI|239749771|REF|XR_038087.2|      3159     1521        1.1446    0.34918451
GI|239751533|REF|XR_079389.1|      1338      681        0.8234    0.05091209
                              coding_prob
GI|239744030|REF|XR_017086.3|   0.9999992
GI|239745142|REF|XR_040656.2|   0.9894146
GI|239746044|REF|XR_078542.1|   0.8964281
GI|239746948|REF|XR_037268.2|   0.2289991
GI|239749771|REF|XR_038087.2|   0.9999992
GI|239751533|REF|XR_079389.1|   0.8916638
head(lncrna)
                                                                                                                  mRNA_size
ENST00000473358.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002840.1|MIR1302-11-001|MIR1302-11|712|            712
ENST00000469289.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002841.2|MIR1302-11-002|MIR1302-11|535|            535
ENST00000466430.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003225.1|RP11-34P13.7-001|RP11-34P13.7|2748|      2748
ENST00000477740.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003688.1|RP11-34P13.7-003|RP11-34P13.7|491|        491
ENST00000471248.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003687.1|RP11-34P13.7-002|RP11-34P13.7|629|        629
ENST00000453576.2|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003689.1|RP11-34P13.7-004|RP11-34P13.7|336|        336
                                                                                                                  ORF_size
ENST00000473358.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002840.1|MIR1302-11-001|MIR1302-11|712|           228
ENST00000469289.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002841.2|MIR1302-11-002|MIR1302-11|535|           111
ENST00000466430.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003225.1|RP11-34P13.7-001|RP11-34P13.7|2748|      240
ENST00000477740.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003688.1|RP11-34P13.7-003|RP11-34P13.7|491|       258
ENST00000471248.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003687.1|RP11-34P13.7-002|RP11-34P13.7|629|       258
ENST00000453576.2|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003689.1|RP11-34P13.7-004|RP11-34P13.7|336|       114
                                                                                                                  Fickett_score
ENST00000473358.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002840.1|MIR1302-11-001|MIR1302-11|712|             0.8192
ENST00000469289.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002841.2|MIR1302-11-002|MIR1302-11|535|             0.6819
ENST00000466430.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003225.1|RP11-34P13.7-001|RP11-34P13.7|2748|        1.1129
ENST00000477740.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003688.1|RP11-34P13.7-003|RP11-34P13.7|491|         0.9340
ENST00000471248.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003687.1|RP11-34P13.7-002|RP11-34P13.7|629|         0.6732
ENST00000453576.2|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003689.1|RP11-34P13.7-004|RP11-34P13.7|336|         1.1303
                                                                                                                  Hexamer_score
ENST00000473358.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002840.1|MIR1302-11-001|MIR1302-11|712|        0.089700116
ENST00000469289.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002841.2|MIR1302-11-002|MIR1302-11|535|        0.004253738
ENST00000466430.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003225.1|RP11-34P13.7-001|RP11-34P13.7|2748|  -0.036719025
ENST00000477740.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003688.1|RP11-34P13.7-003|RP11-34P13.7|491|    0.066306017
ENST00000471248.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003687.1|RP11-34P13.7-002|RP11-34P13.7|629|   -0.262129058
ENST00000453576.2|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003689.1|RP11-34P13.7-004|RP11-34P13.7|336|    0.248847902
                                                                                                                  coding_prob
ENST00000473358.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002840.1|MIR1302-11-001|MIR1302-11|712|      0.075234173
ENST00000469289.1|ENSG00000243485.1|OTTHUMG00000000959.2|OTTHUMT00000002841.2|MIR1302-11-002|MIR1302-11|535|      0.008503811
ENST00000466430.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003225.1|RP11-34P13.7-001|RP11-34P13.7|2748| 0.082058427
ENST00000477740.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003688.1|RP11-34P13.7-003|RP11-34P13.7|491|  0.122934737
ENST00000471248.1|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003687.1|RP11-34P13.7-002|RP11-34P13.7|629|  0.006955770
ENST00000453576.2|ENSG00000238009.2|OTTHUMG00000001096.2|OTTHUMT00000003689.1|RP11-34P13.7-004|RP11-34P13.7|336|  0.155721804
boxplot(refseq$coding_prob, lncrna$coding_prob, main="Coding Probability", names=c("RefSeq","lncRNA"))

cpat_refseq_lncrnaThe coding probability of RefSeq RNAs are very high compared to lncRNAs, as expected.

install.packages("rgl")
library(rgl)
dim(refseq)
#[1] 47251     5
refseq$colour <- rep(2,47251)
dim(lncrna)
#[1] 22444     5
lncrna$colour <- rep(3,22444)
combined <- rbind(refseq, lncrna)
plot3d(combined[,c(2:4)],col=combined$colour)
rgl.postscript("cpat.pdf","pdf")
#install if necessary
sudo apt-get install imagemagick
convert cpat.pdf cpat.png

cpatIn red are RefSeq mRNAs and in green are lncRNAs. The several long ORFs in the RefSeq mRNAs spreads out the plot.

Remove the long ORFs

summary(combined$ORF_size)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
     0.0    192.0    510.0    994.6   1356.0 108000.0
table(combined$ORF_size>40000)

FALSE  TRUE 
69688     7
new <- combined[combined$ORF_size<40000,]
dim(new)
[1] 69688     6
dim(combined)
[1] 69695     6
plot3d(new[,c(2:4)],col=new$colour)
rgl.postscript("new_cpat.pdf","pdf")

new_cpatMost of the lncRNA have very short or no ORFs.

FANTOM3 clones

One of the main findings of FANTOM3 was the appreciation that many transcribed RNAs were not protein coding or had very low coding potential. As part of the FANTOM project, a collection of full-length cDNAs were made; there are a total of 102,801 full-length cDNAs. I wonder how many of them have a very low coding potential. As per above:

python ~/bin/CPAT/usr/local/bin/cpat.py -d ~/src/CPAT-1.2.1/dat/Human_train.RData -x ~/src/CPAT-1.2.1/dat/Human_Hexamer.tab -g fantom3_cdna.fa -o fantom3.out
fantom<- read.table("fantom3.out",header=T)
summary(fantom$coding_prob)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0179  0.8294  0.5538  0.9998  1.0000
summary(lncrna$coding_prob)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000000 0.004789 0.015760 0.079000 0.053620 1.000000
#how many of the FANTOM cDNAs have a coding potential
#as low as most lncRNAs
table(fantom$coding_prob<0.016)

FALSE  TRUE 
78163 24638
png("fantom_coding_potential.png",width=800,height=500)
hist(fantom$coding_prob,breaks=100)
dev.off()

fantom_coding_potentialAbout 25% of the FANTOM3 full-length cDNA clones seem to be non-coding.

Conclusion

The Coding Potential Assessment Tool (CPAT) is a very fast tool for assessing the coding potential of many RNA transcripts. Inputting a set of known protein coding RNAs versus a set of non-coding RNAs returned results as expected, which was that the former had a much higher coding potential than the latter class of RNAs. Lastly, using CPAT on FANTOM3 full-length cDNAs seemed to confirm that a relatively large portion of transcribed RNAs are non-coding.

Posted in bioinformatics, genomics | Tagged | Leave a comment

Using the R SeqinR package

Just a quick demonstration of the SeqinR package in R using sequences available from the NONCODE database version 3. First download the fasta file, which is available at http://noncode.org/NONCODERv3/download.htm, then install the necessary packages in R and load the fasta file (note I have extracted and renamed the fasta file into ncrna_noncode_v3.fa):

install.packages("seqinr")
library("seqinr")
ncrna <- read.fasta(file = "ncrna_noncode_v3.fa")
#how many fasta sequences
length(ncrna)
[1] 411553

n1 <- ncrna[[2]]
n1
  [1] "a" "c" "c" "t" "c" "g" "a" "c" "c" "a" "c" "c" "c" "t" "t" "a" "a" "c" "t" "t" "g" "g" "g" "t" "g" "c" "a" "g" "g"
 [30] "t" "a" "t" "t" "c" "a" "a" "c" "a" "a" "a" "a" "g" "c" "a" "a" "t" "g" "a" "a" "t" "c" "a" "a" "g" "g" "a" "a" "t"
 [59] "g" "a" "a" "t" "c" "a" "a" "t" "g" "g" "a" "t" "t" "t" "t" "c" "a" "a" "t" "g" "g" "a" "t" "t" "t" "a" "t" "g" "g"
 [88] "a" "t" "t" "t" "t" "a" "a" "a" "a" "a" "c" "a" "g" "a" "g" "a" "a" "c" "t" "c" "a" "g" "a" "a" "a" "t" "c" "t" "a"
[117] "a" "c" "a" "g" "a" "a" "a" "t" "t" "t" "a" "a" "c" "a" "g" "a" "a" "a" "t" "t" "t" "a" "a" "a" "t" "t" "t" "g" "t"
[146] "c" "g" "a" "t" "c" "t" "a" "c" "a" "a" "a" "t" "t" "g" "c" "c" "c" "t" "t" "a" "t" "c" "t" "t" "t" "t" "t" "c" "c"
[175] "a" "t" "c" "t" "t" "a" "a" "a" "c" "t" "a" "a" "a" "c" "g" "t" "t" "a" "a" "t" "a" "a" "c" "t" "t" "a" "t" "t" "g"
[204] "t" "t" "g" "t" "t" "g" "a" "a" "t" "a" "c" "a" "g" "c" "t" "t" "g" "t" "g" "g" "a" "a" "t" "g" "t" "c" "g" "g" "g"
[233] "g" "t" "a" "c" "a" "a" "t" "g" "t" "c" "g" "g" "g" "g"
attr(,"name")
[1] "n1"
attr(,"Annot")
[1] ">n1 | AB002583 | tmRNA | chloroplast Cyanidioschyzon merolae | ssrA | NONCODE v2.0 | NULL | NULL | -1.0577600 | -0.3460597"
attr(,"class")
[1] "SeqFastadna"

#count number of nucleotides
count(n1,1)

 a  c  g  t 
86 40 44 76

#count number of dinucleotides
count(n1,2)
aa ac ag at ca cc cg ct ga gc gg gt ta tc tg tt 
38 15  9 24 16  7  5 12 15  4 14 10 16 14 16 30

#GC content
GC(n1)
[1] 0.3414634

#Store the fasta header
annotation <- getAnnot(ncrna)
#count the number of piRNAs
pirna_index <- grep("piRNA",annotation,ignore.case=T)
length(pirna_index)
[1] 174724
pirna <- ncrna[pirna_index]
#count the number of human piRNAs
pirna_annotation <- getAnnot(pirna)
pirna_human_index <- grep("Homo",pirna_annotation,ignore.case=T)
length(pirna_human_index)
[1] 32152
pirna_human <- ncrna[pirna_human_index]
length(pirna_human)
[1] 32152
pirna_human_sequence <- getSequence(pirna_human)
#write a function to return the tenth base
tenth_base <- function(x){
   return(x[10])
}

table(mapply(tenth_base,pirna_human_sequence))

    a     c     g     n     t
10154  6294  7958     9  7737
#write a function to return the first base
first_base <- function(x){
   return(x[1])
}
table(mapply(first_base,pirna_human_sequence))

    a     c     g     t
  795  2884  3051 25422
#length distribution of all human piRNAs
table(getLength(pirna_human))

  26   27   28   29   30   31   32
3552 3540 4127 7166 8216 4410 1141

lengths <- table(getLength(pirna_human))
barplot(lengths, xlab="piRNA lengths", ylab="Frequency")

human_pirna_length_noncode_v3

Conclusions

Using the SeqinR package in R and piRNA sequences provided by the NONCODE database, we could obtain basic sequence statistics on a particular class of noncoding RNA called piwi-interacting RNAs (piRNAs). piRNAs are a class of noncoding RNAs that are roughly 26-31nt long, which we could observe in our subset of human piRNAs. They are also known to have a U/T at the first base, which we observed, and an A at the tenth base, which we could observe slightly. We could perform some statistics to see if there were an enrichment.

For further information on the capabilities of this package, you should refer to the reference manual, which is linked below. The package provides three pages worth of functions.

More info

The SeqinR man page.

Posted in R | Tagged , | Leave a comment