R data visualisation

Once upon a time, I made my graphs using Excel because it was the only software that I was aware of for making graphs. Now one can do amazing things with Excel and produce fairly good looking graphs, but after looking at some examples of R graphs, I wanted to learn a bit more about R data visualisation. This post mentions some new plotting functions I recently discovered and have found useful.

Using identify()

This lecture on R data visualisation provides a very nice introduction to producing plots in R, which I highly recommend, so check it out. One really cool trick I learned from the course was the use of identify(), which let's you identify points in a scatter plot. Imagine you were comparing the expression of genes from two different libraries and you wanted to know which gene a particular dot represented. Here's how:

Continue reading

Making a heatmap with R

Update 28th March 2017: I recommend using the superheat package for creating heatmaps.

Heatmaps are great for visualising large tables of data; they are definitely popular in many transcriptome papers. Here are the basic commands for making your own heatmap:

data <- read.table("test.txt",sep="\t",header=TRUE,row.names=1)
data_matrix <- data.matrix(data)
heatmap(data_matrix)
#For more information see help(heatmap)
#If you want cooler colours than the default
install.packages("RColorBrewer")
library("RColorBrewer")
#display all colour schemes
display.brewer.all()
heatmap(data_matrix,col=brewer.pal(9,"Blues"))
#if you want to preserve the column order
#since the order may be informative
heatmap(data_matrix,Colv=NA,col=brewer.pal(9,"Blues"))

display_brewer_allThe colour schemes from the function display.brewer.all()

Returning the values used for the heatmap

Taken from the email thread: [BioC] heatmap.2() get matrix after hierarchical clustering


## Some input sample matrix
y <- matrix(rnorm(50), 10, 5, dimnames=list(paste("g", 1:10, sep=""), paste("t", 1:5, sep="")))

y
             t1          t2          t3           t4         t5
g1   0.09010160  0.18099063  0.06491397  0.976286072 -0.1365132
g2   1.46478627  0.07811639  0.92728521 -0.006493938 -0.4911211
g3   0.31297844 -1.56484613 -0.85887941  0.353068737 -1.0221706
g4   0.09407695  0.43792430  0.46242189  1.491145523 -0.7022191
g5   0.69112967  0.36843583  1.19093386 -0.032736910  1.8298866
g6  -1.02819430 -0.43686436  0.49728848 -0.120586171 -2.9141145
g7   0.47593269 -1.87135877  1.02256261 -1.036563429  0.8059392
g8  -0.10274882 -0.50991134  1.38989532  0.416684239 -0.4231279
g9  -1.17358163 -0.93794222 -0.83479604 -0.872205800  0.1767881
g10  0.23001732  0.10358520  0.69702767 -1.797501815 -0.4973721

## Run heatmap.2 on this matrix
install.packages("gplots")
library(gplots)
test <- heatmap.2(y)
y[rev(test$rowInd), test$colInd]

            t5          t2           t4          t1          t3
g5   1.8298866  0.36843583 -0.032736910  0.69112967  1.19093386
g7   0.8059392 -1.87135877 -1.036563429  0.47593269  1.02256261
g10 -0.4973721  0.10358520 -1.797501815  0.23001732  0.69702767
g4  -0.7022191  0.43792430  1.491145523  0.09407695  0.46242189
g1  -0.1365132  0.18099063  0.976286072  0.09010160  0.06491397
g8  -0.4231279 -0.50991134  0.416684239 -0.10274882  1.38989532
g2  -0.4911211  0.07811639 -0.006493938  1.46478627  0.92728521
g6  -2.9141145 -0.43686436 -0.120586171 -1.02819430  0.49728848
g3  -1.0221706 -1.56484613  0.353068737  0.31297844 -0.85887941
g9   0.1767881 -0.93794222 -0.872205800 -1.17358163 -0.83479604

## Row clustering (adjust here distance/linkage methods to what you need!)
hr <- hclust(as.dist(1-cor(t(y), method="pearson")), method="complete")

## Column clustering (adjust here distance/linkage methods to what you need!)
hc <- hclust(as.dist(1-cor(y, method="spearman")), method="complete")

## Plot heatmap
heatmap.2(y, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hc), scale="row", density.info="none", trace="none")

## Return matrix with row/column sorting as in heatmap
y[rev(hr$labels[hr$order]), hc$labels[hc$order]]

             t2           t4          t1          t3         t5
g7  -1.87135877 -1.036563429  0.47593269  1.02256261  0.8059392
g5   0.36843583 -0.032736910  0.69112967  1.19093386  1.8298866
g9  -0.93794222 -0.872205800 -1.17358163 -0.83479604  0.1767881
g10  0.10358520 -1.797501815  0.23001732  0.69702767 -0.4973721
g2   0.07811639 -0.006493938  1.46478627  0.92728521 -0.4911211
g4   0.43792430  1.491145523  0.09407695  0.46242189 -0.7022191
g1   0.18099063  0.976286072  0.09010160  0.06491397 -0.1365132
g3  -1.56484613  0.353068737  0.31297844 -0.85887941 -1.0221706
g8  -0.50991134  0.416684239 -0.10274882  1.38989532 -0.4231279
g6  -0.43686436 -0.120586171 -1.02819430  0.49728848 -2.9141145

test <- heatmap.2(y)

heatmap.2(y, Rowv=as.dendrogram(hr), Colv=as.dendrogram(hc), scale="row", density.info="none", trace="none")

An example of creating a heatmap using transcriptome data

I will use the TagSeqExample.tab file from DESeq.

#install DESeq if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("DESeq")
#load DESeq
library("DESeq")
example_file <- system.file ("extra/TagSeqExample.tab", package="DESeq")
data <- read.delim(example_file, header=T, row.names="gene")
head(data)
           T1a T1b  T2  T3  N1  N2
Gene_00001   0   0   2   0   0   1
Gene_00002  20   8  12   5  19  26
Gene_00003   3   0   2   0   0   0
Gene_00004  75  84 241 149 271 257
Gene_00005  10  16   4   0   4  10
Gene_00006 129 126 451 223 243 149
nrow(data)
[1] 18760
#check distribution of row sums
quantile(rowSums(data))
    0%    25%    50%    75%   100% 
     0     31    228    883 234506
#create a workable set
data_subset <- data[rowSums(data)>50000,]
nrow(data_subset)
[1] 49
data_matrix <- data.matrix(data_subset)
heatmap(data_matrix)

Now using heatmap.2():

#install if necessary
install.packages("gplots")
library(gplots)
heatmap.2(data_matrix)

heatmap.2(data_matrix,scale="row")

#using a red and blue colour scheme without traces and scaling each row
library("RColorBrewer")
heatmap.2(data_matrix,col=brewer.pal(11,"RdBu"),scale="row", trace="none")

deseq_heatmap2_rdbu

#picking your own colours
#black and red
colfunc <- colorRampPalette(c("black", "red"))
heatmap.2(data_matrix,col=colfunc(15),scale="row", trace="none")

heatmap_2_black_red

#picking your own colours
#black, white and red
#to see all colours type colours()
colfunc <- colorRampPalette(c("black", "white", "red"))
heatmap.2(data_matrix,col=colfunc(15),scale="row", trace="none")

heatmap_2_black_white_redPerhaps you can pick better colours than me!.

Manually checking some of the values in heatmap.2()

#The first row in the heatmap
data_matrix["Gene_08743",]
  T1a   T1b    T2    T3    N1    N2 
10404  4626 32103 14288 19626 16079 
summary(data_matrix["Gene_08743",])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   4626   11380   15180   16190   18740   32100
test <- heatmap.2(data_matrix,scale="row")
#create function to calculate z-score
z_score <- function(x){
+    (x-mean(x))/sd(x)
+ }
z_score(data_matrix["Gene_08743",])
        T1a         T1b          T2          T3          N1          N2 
-0.61945966 -1.23831240  1.70461190 -0.20346381  0.36826272 -0.01163874
#compare the manually calculating values with the ones used for the heatmap
#they are the same, except the ordering
test$carpet[,"Gene_08743"]
         N2         T1b         T1a          T2          T3          N1 
-0.01163874 -1.23831240 -0.61945966  1.70461190 -0.20346381  0.36826272