Parallel distance calculation in R

A quick and short post on parallel distance calculation in R using the mclapply() function from the parallel package. I'll use data from the Biobase and datamicroarray packages to illustrate.

Install the Biobase package and load the library.

source("http://bioconductor.org/biocLite.R")
biocLite("Biobase")
library(Biobase)

We'll use the geneData data set first.

data(geneData, package = "Biobase")
dim(geneData)
[1] 500  26

How many row by row comparisons?

choose(500,2)
[1] 124750

Use the combn() function to generate all pairwise combinations.

my_pair <- combn(1:nrow(geneData), 2, simplify = FALSE)

length(my_pair)
[1] 124750

head(my_pair)
[[1]]
[1] 1 2

[[2]]
[1] 1 3

[[3]]
[1] 1 4

[[4]]
[1] 1 5

[[5]]
[1] 1 6

[[6]]
[1] 1 7

Function to calculate Euclidean distance using a list of indexes.

my_dist <- function(pair, data){
  sqrt(sum( (data[pair[1],] - data[pair[2],])^2 ))
}

Calculate distances and create distance matrix.

system.time(out <- lapply(my_pair, my_dist, data = geneData))
   user  system elapsed 
  0.684   0.000   0.683

# install if necessary
# install.packages('parallel')
library(parallel)

# doesn't save you that much time, but it's a small data set
system.time(out2 <- mclapply(my_pair, my_dist, data = geneData, mc.cores = 4))
   user  system elapsed 
  1.296   0.752   0.490

length(out2)
[1] 124750

# check if they are identical
identical(out, out2)
[1] TRUE

# now to store as a matrix
my_matrix <- diag(nrow(geneData))
diag(my_matrix) <- 0
my_matrix[lower.tri(my_matrix)] <- unlist(out2)
# this ugly looking bit is due to the matrix filling up by columns instead of rows
my_matrix[upper.tri(my_matrix)] <- t(my_matrix)[upper.tri(t(my_matrix))]
row.names(my_matrix) <- row.names(geneData)
colnames(my_matrix)  <- row.names(geneData)

# easy way
using_dist <- as.matrix(dist(geneData))

my_matrix[1:6, 1:6]
                AFFX-MurIL2_at AFFX-MurIL10_at AFFX-MurIL4_at AFFX-MurFAS_at AFFX-BioB-5_at AFFX-BioB-M_at
AFFX-MurIL2_at          0.0000        296.0283      508.18259      568.03033      351.10147      339.01319
AFFX-MurIL10_at       296.0283          0.0000      333.56006      384.80776      216.84989      204.14356
AFFX-MurIL4_at        508.1826        333.5601        0.00000       80.26229      195.23304      200.74868
AFFX-MurFAS_at        568.0303        384.8078       80.26229        0.00000      248.00470      258.12776
AFFX-BioB-5_at        351.1015        216.8499      195.23304      248.00470        0.00000       87.06816
AFFX-BioB-M_at        339.0132        204.1436      200.74868      258.12776       87.06816        0.00000

using_dist[1:6,1:6]
                AFFX-MurIL2_at AFFX-MurIL10_at AFFX-MurIL4_at AFFX-MurFAS_at AFFX-BioB-5_at AFFX-BioB-M_at
AFFX-MurIL2_at          0.0000        296.0283      508.18259      568.03033      351.10147      339.01319
AFFX-MurIL10_at       296.0283          0.0000      333.56006      384.80776      216.84989      204.14356
AFFX-MurIL4_at        508.1826        333.5601        0.00000       80.26229      195.23304      200.74868
AFFX-MurFAS_at        568.0303        384.8078       80.26229        0.00000      248.00470      258.12776
AFFX-BioB-5_at        351.1015        216.8499      195.23304      248.00470        0.00000       87.06816
AFFX-BioB-M_at        339.0132        204.1436      200.74868      258.12776       87.06816        0.00000

# what!?
identical(my_matrix, using_dist)
[1] FALSE

# read http://stackoverflow.com/questions/9508518/why-are-these-numbers-not-equal
all.equal(my_matrix, using_dist)
[1] TRUE

Now using a bigger data set.

library(devtools)
install_github('ramhiser/datamicroarray')

library(datamicroarray)
describe_data()
        author year   n     p  K              Disease
1         alon 1999  62  2000  2         Colon Cancer
2    borovecki 2005  31 22283  2 Huntington's Disease
3   burczynski 2006 127 22283  3      Crohn's Disease
4    chiaretti 2004 111 12625  2             Leukemia
5         chin 2006 118 22215  2        Breast Cancer
6     chowdary 2006 104 22283  2        Breast Cancer
7  christensen 2009 217  1413  3                  N/A
8        golub 1999  72  7129  3             Leukemia
9       gordon 2002 181 12533  2          Lung Cancer
10     gravier 2010 168  2905  2        Breast Cancer
11        khan 2001  63  2308  4                SRBCT
12    nakayama 2001 105 22283 10              Sarcoma
13     pomeroy 2002  60  7128  2            CNS Tumor
14       shipp 2002  58  6817  2             Lymphoma
15       singh 2002 102 12600  2      Prostate Cancer
16      sorlie 2001  85   456  5        Breast Cancer
17          su 2002 102  5565  4                  N/A
18 subramanian 2005  50 10100  2                  N/A
19         sun 2006 180 54613  4               Glioma
20        tian 2003 173 12625  2              Myeloma
21        west 2001  49  7129  2        Breast Cancer
22        yeoh 2002 248 12625  6             Leukemia

data('gravier', package = 'datamicroarray')
dim(gravier$x)
[1]  168 2905

data <- t(gravier$x)

choose(nrow(data), 2)
[1] 4218060

library(parallel)
my_pair <- combn(1:nrow(data), 2, simplify = FALSE)
my_dist <- function(pair, data){
  sqrt(sum( (data[pair[1],] - data[pair[2],])^2 ))
}

# this is using my 5 year old desktop
system.time(out <- lapply(my_pair, my_dist, data = data))
   user  system elapsed 
  63.00    0.00   62.99

system.time(out2 <- mclapply(my_pair, my_dist, data = data, mc.cores = 4))
   user  system elapsed 
  4.928   0.412  38.759

identical(out, out2)
[1] TRUE

my_matrix <- diag(nrow(data))
diag(my_matrix) <- 0
my_matrix[lower.tri(my_matrix)] <- unlist(out2)
my_matrix[upper.tri(my_matrix)] <- t(my_matrix)[upper.tri(t(my_matrix))]
row.names(my_matrix) <- row.names(data)
colnames(my_matrix)  <- row.names(data)

using_dist <- as.matrix(dist(data))
all.equal(my_matrix, using_dist)
[1] TRUE

Summary

It was actually much quicker to use the dist() function.

system.time(using_dist <- as.matrix(dist(data)))
   user  system elapsed 
  5.476   0.028   5.502

Try a bigger data set.

# this entire section was done on the server; my desktop would struggle
data('shipp', package = 'datamicroarray')
data <- t(shipp$x)
system.time(using_dist <- as.matrix(dist(data)))
   user  system elapsed 
 23.132   2.972  26.132

my_dist <- function(pair, data){
  sqrt(sum( (data[pair[1],] - data[pair[2],])^2 ))
}
system.time(my_pair <- combn(1:nrow(data), 2, simplify = FALSE))
   user  system elapsed 
 31.516   4.540  36.097

system.time(out2 <- mclapply(my_pair, my_dist, data = data, mc.cores = 64))
    user   system  elapsed
1042.256 1298.740  188.634

system.time(out2 <- mclapply(my_pair, my_dist, data = data, mc.cores = 16))
    user   system  elapsed
1361.696 2260.428  111.345

my_matrix <- diag(nrow(data))
diag(my_matrix) <- 0
my_matrix[lower.tri(my_matrix)] <- unlist(out2)
my_matrix[upper.tri(my_matrix)] <- t(my_matrix)[upper.tri(t(my_matrix))]
row.names(my_matrix) <- row.names(data)
colnames(my_matrix)  <- row.names(data)
all.equal(my_matrix, using_dist)
[1] TRUE

In conclusion, if you want to calculate all pairwise Euclidean distances, just use the dist() function. The mclapply() function from the parallel package is faster than lapply(). Using more cores does not necessarily mean faster results; it depends on how big your data is, since time is spent splitting and joining your data set.

Print Friendly



Creative Commons License
This work is licensed under a Creative Commons
Attribution 4.0 International License
.
Posted in RTagged
2 comments Add yours
    1. Try using foreach loop construct, it could be faster than mclapply because mclapply can only use cores of 1 machine. Foreach uses doMPI and Rmpi as parallel backend to solve this problem.

Leave a Reply

Your email address will not be published. Required fields are marked *

Time limit is exhausted. Please reload CAPTCHA.