Here I calculate Spearman’s rank correlation between every pair of rows in R, storing the results in a correlation matrix.
# Create a random 10 x 10 matrix with values drawn uniformly from [1, 100].
data <- matrix(runif(100, 1, 100), nrow = 10, ncol = 10, byrow = TRUE)
n_rows <- nrow(data)

# Loop through all possible pairwise rows; each unordered pair (i, j) with
# i < j is visited exactly once. seq_len() keeps the loop empty when there
# are fewer than two rows (1:0 would iterate over c(1, 0)).
for (i in seq_len(max(n_rows - 1L, 0L))) {
  for (j in (i + 1):n_rows) {
    print(c(i, j))
  }
}

# Spearman correlation between row 1 and row 2. data[1, ] is already a plain
# numeric vector, so no as.vector()/as.matrix() wrapping is needed.
cor(data[1, ], data[2, ], method = "spearman")

# Initialise the result as a numeric identity matrix. Starting from a matrix
# of "0" strings would coerce every stored correlation to text, and the
# diagonal must be 1 for every row, including the last one, which the pair
# loop below never visits as `i`.
final_matrix <- diag(n_rows)

# Calculate all pairwise correlations and mirror each value across the
# diagonal, since the correlation matrix is symmetric.
# Note: cor(t(data), method = "spearman") computes the same pairwise
# correlations in a single vectorised call.
for (i in seq_len(max(n_rows - 1L, 0L))) {
  for (j in (i + 1):n_rows) {
    rho <- cor(data[i, ], data[j, ], method = "spearman")
    final_matrix[i, j] <- rho
    final_matrix[j, i] <- rho
  }
}
# Write the correlation matrix to disk with MASS::write.matrix, then end the
# R session.
library("MASS")
write.matrix(x = final_matrix, file = "some.file")
quit()
I have a comma-delimited file with 10,006 lines (a header plus 10,005 rows) containing expression values for 40 samples. The first column is the identifier and the last two columns are the pre-calculated means and variances. For this many rows there are ((10005 − 1)² + (10005 − 1)) / 2 = 50,045,010 pairwise correlations to calculate. Let’s see how long this takes on one core of an Intel(R) Xeon(R) CPU X7560 @ 2.27GHz.
# Load the expression table; the first CSV column supplies the row names.
table <- read.table("level2_rename_mean_5.csv", sep = ",", header = TRUE,
                    row.names = 1)
dim(table)
#[1] 10005 42

# Keep only the first 40 columns (the sample values); the remaining two
# columns are not used for the correlations.
data <- as.matrix(table[, 1:40])
n_rows <- nrow(data)

# Numeric identity matrix: the diagonal is 1 for every row (including the
# last, which the pair loop never visits as `i`) and the off-diagonal cells
# are filled in below. A matrix of "0" strings would turn every stored
# correlation into text.
final_matrix <- diag(n_rows)
dim(final_matrix)
#[1] 10005 10005

# Time the full pairwise computation. Each unordered pair of rows is
# correlated once and the value mirrored across the diagonal.
system.time(
  for (i in seq_len(max(n_rows - 1L, 0L))) {
    for (j in (i + 1):n_rows) {
      rho <- cor(data[i, ], data[j, ], method = "spearman")
      final_matrix[i, j] <- rho
      final_matrix[j, i] <- rho
    }
  }
)
# Timing recorded for the original run of this loop:
# user system elapsed
#100781.438 24.017 120825.286
#~33 hours

library(MASS)
write.matrix(final_matrix, file = "level2_rename_mean_5.matrix")
# Size recorded for the original run, which stored the values as text; the
# character count will likely differ for a numeric matrix.
#cat level2_rename_mean_5.matrix | wc
# 10005 100100025 2202200550
To see how the number of comparisons grows with the number of rows:
#' Number of pairwise comparisons among `n` rows
#'
#' Evaluates ((n - 1)^2 + (n - 1)) / 2, i.e. the count of unordered pairs
#' that can be formed from `n` items.
#'
#' @param n Number of rows; may be a vector, in which case the result is
#'   computed element-wise.
#' @return A double vector the same length as `n`.
number_of_comparison <- function(n) {
  one_less <- n - 1
  (one_less^2 + one_less) / 2
}
# Print the comparison count for 1 to 10 rows.
for (row_count in seq_len(10)) {
  print(c(row_count, number_of_comparison(row_count)))
}
#[1] 1 0
#[1] 2 1
#[1] 3 3
#[1] 4 6
#[1] 5 10
#[1] 6 15
#[1] 7 21
#[1] 8 28
#[1] 9 36
#[1] 10 45

# Tabulate the comparison count for 1 to 10000 rows in a one-column array;
# number_of_comparison() is applied to the whole sequence at once.
x <- array(0, dim = c(10000, 1))
x[, 1] <- number_of_comparison(seq_len(10000))
plot(x)
tail(x)
# [,1]
#[9995,] 49945015
#[9996,] 49955010
#[9997,] 49965006
#[9998,] 49975003
#[9999,] 49985001
#[10000,] 49995000

So now what can you do with the level2_rename_mean_5.matrix file? My original intention was to create a network based on the correlations. Here’s one way of achieving this:
#!/usr/bin/perl
#
# Convert a square correlation matrix into a tab-separated edge list
# (node, 'xx', node) for loading into Cytoscape as a SIF file.
#
# Row identifiers are read from the first column of the CSV file; the matrix
# file is expected to hold one whitespace-separated row per line, in the same
# order as the CSV data rows. Only the upper triangle (column index greater
# than row index) is scanned, so each pair is reported at most once.
use strict;
use warnings;

# Minimum correlation for two rows to be linked by an edge.
my $threshold = 0.9;
my $infile    = 'level2_rename_mean_5.matrix';
my $csv       = 'level2_rename_mean_5.csv';

# Identifiers in file order: $name[k] labels matrix row/column k (zero-based).
my @name = ();

open(my $csv_fh, '<', $csv) or die "Could not open $csv: $!\n";
while (my $line = <$csv_fh>) {
    chomp $line;
    # Skip the header by position rather than by matching its text, so a data
    # row that happens to contain the same text cannot shift the names.
    next if $. == 1;
    next if $line eq '';
    my ($id) = split(/,/, $line);
    $id =~ s/"//g;
    push(@name, $id);
}
close($csv_fh);

open(my $matrix_fh, '<', $infile) or die "Could not open $infile: $!\n";
while (my $line = <$matrix_fh>) {
    chomp $line;
    # Zero-based index of the matrix row held by this line.
    my $current = $. - 1;
    my @cor = split(' ', $line);
    # Start one column past the diagonal: the matrix is symmetric, so the
    # lower triangle and the diagonal carry no extra information.
    for (my $i = $current + 1; $i < scalar(@cor); ++$i) {
        if ($cor[$i] >= $threshold) {
            #print join("\t",$name[$current],$name[$i],$current,$i,$cor[$i]),"\n";
            print join("\t", $name[$current], 'xx', $name[$i]), "\n";
        }
    }
}
close($matrix_fh);

exit(0);
The Perl script outputs a sif file which can then be loaded into Cytoscape.

And just for reference, here’s how to calculate the correlations between every pair of columns:
# Column-wise version: Spearman correlation between every pair of columns of
# `data`.
n_cols <- ncol(data)

# Start from a numeric identity matrix so the diagonal is 1 and the stored
# correlations stay numeric (a matrix of "1" strings would coerce them to
# text).
final_matrix <- diag(n_cols)

# Calculate all correlations and store them in final_matrix, mirroring each
# value across the diagonal. seq_len() keeps the loop empty when there are
# fewer than two columns.
# Note: cor(data, method = "spearman") computes the same pairwise
# correlations in a single vectorised call.
for (i in seq_len(max(n_cols - 1L, 0L))) {
  for (j in (i + 1):n_cols) {
    rho <- cor(data[, i], data[, j], method = "spearman")
    final_matrix[i, j] <- rho
    final_matrix[j, i] <- rho
  }
}
# Write the column correlation matrix to disk with MASS::write.matrix, then
# end the R session.
library("MASS")
write.matrix(x = final_matrix, file = "some_file.tsv")
quit()