Sequence composition and random forests

Updated: 2013 November 28th

The sequence composition or the nucleotide composition at transcriptional starting sites (TSSs) of mRNAs are biased, i.e. certain nucleotides are preferred. Here I examine the sequence composition at the TSS of the NCBI Reference Sequence Database also known as RefSeq and use random forests to see if it's possible to train a classifier that can identify TSSs from random sequences. This work is done entirely using R and all code hosted on GitHub Gist.

Firstly, let's download the entire collection of RefSeqs using the R Bioconductor package biomaRt and create a data frame with the TSSs. Note that when working with transcripts, use the attributes 'transcript_start' and 'transcript_end', and not the attributes 'start_position' and 'end_position'; using those will give you the Ensembl gene coordinates.


#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")
#load library
library("biomaRt")
#use ensembl mart
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
#store the filters
filters <- listFilters(ensembl)
#look for filters with the string refseq
grep('refseq', filters$name, ignore.case=T, value=T)
#[1] "with_refseq_peptide" "with_refseq_peptide_predicted" "with_ox_refseq_mrna"
#[4] "with_ox_refseq_mrna_predicted" "with_ox_refseq_ncrna" "with_ox_refseq_ncrna_predicted"
#[7] "refseq_mrna" "refseq_mrna_predicted" "refseq_ncrna"
#[10] "refseq_ncrna_predicted" "refseq_peptide" "refseq_peptide_predicted"
#store the attributes we can fetch
attributes <- listAttributes(ensembl)
#check out the first 10 attributes
head(attributes,10)
# name description
#1 ensembl_gene_id Ensembl Gene ID
#2 ensembl_transcript_id Ensembl Transcript ID
#3 ensembl_peptide_id Ensembl Protein ID
#4 ensembl_exon_id Ensembl Exon ID
#5 description Description
#6 chromosome_name Chromosome Name
#7 start_position Gene Start (bp)
#8 end_position Gene End (bp)
#9 strand Strand
#10 band Band
#create vector of chromosomes
my_chr <- c(1:22,'X','Y')
#fetch refseqs
my_refseq <- getBM(attributes='refseq_mrna',
filters = 'chromosome_name',
values = my_chr,
mart = ensembl)
#how many entries 2013 November 28th
length(my_refseq)
[1] 35423
#check out the first few
head(my_refseq)
[1] "NM_001084392" "NM_001355" "NR_036221" "NM_138450" "NM_022840"
[6] "NM_173505"
#I only want entries starting with a NM (curated mRNA)
my_refseq <- my_refseq[grep(pattern="^NM",x=my_refseq,perl=T)]
length(my_refseq)
[1] 33584
#check to see if only NM
head(my_refseq)
[1] "NM_001084392" "NM_001355" "NM_138450" "NM_022840" "NM_173505"
[6] "NM_033453"
#build attribute vector
my_attribute <- c('refseq_mrna',
'chromosome_name',
'transcript_start',
'transcript_end',
'strand')
#fetch refseqs and their chromosomal locations
my_refseq_loci <- getBM(attributes=my_attribute,
filters = c('refseq_mrna', 'chromosome_name'),
values = list(refseq_mrna=my_refseq, chromosome_name=my_chr),
mart = ensembl)
dim(my_refseq_loci)
#[1] 33657 5
#how many refseq ids are listed in multiple places?
table(duplicated(my_refseq_loci$refseq_mrna))
#FALSE TRUE
#33584 73
#get rid of the duplicated entry
my_refseq_loci <- my_refseq_loci[!duplicated(my_refseq_loci$refseq_mrna),]
dim(my_refseq_loci)
#[1] 33584 5
#convert the strand into '-' and '+'
my_refseq_loci$strand <- gsub(pattern='-1', replacement='-', my_refseq_loci$strand)
my_refseq_loci$strand <- gsub(pattern='1', replacement='+', my_refseq_loci$strand)
#add a 'chr' into the chromosome_name
my_refseq_loci$chromosome_name <- gsub(pattern="^",
replacement='chr',
my_refseq_loci$chromosome_name)
#we now have a data frame of all human mRNA refseqs and their chromosomal locations
head(my_refseq_loci)
# refseq_mrna chromosome_name transcript_start transcript_end strand
#1 NM_001084392 chr22 24313554 24316773 -
#2 NM_001355 chr22 24313554 24322019 -
#3 NM_138450 chr13 50202435 50208008 +
#4 NM_022840 chr18 2538452 2571485 -
#5 NM_033453 chr20 3190006 3204516 +
#6 NM_001267623 chr20 3190171 3204516 +
#I want locations of the region spanning the start of a refSeq
span <- 2
#store as another object
my_refseq_tss <- my_refseq_loci
#positive strand
#adjust the end position first, because we need the start position in our calculations
my_refseq_tss[my_refseq_tss$strand=='+','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']+span
my_refseq_tss[my_refseq_tss$strand=='+','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']-span
#negative strand
my_refseq_tss[my_refseq_tss$strand=='-','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']-span
my_refseq_tss[my_refseq_tss$strand=='-','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']+span
head(my_refseq_tss)
# refseq_mrna chromosome_name transcript_start transcript_end strand
#1 NM_001084392 chr22 24316771 24316775 -
#2 NM_001355 chr22 24322017 24322021 -
#3 NM_138450 chr13 50202433 50202437 +
#4 NM_022840 chr18 2571483 2571487 -
#5 NM_033453 chr20 3190004 3190008 +
#6 NM_001267623 chr20 3190169 3190173 +

Now let's sample a bunch of random sequences in the hg19 genome:


#how many regions to sample?
number <- 50000
#how many bp to add to start
span <- 4
#some necessary packages
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("BSgenome")
#load library
library(BSgenome)
#find available genomes
available.genomes()
#I'm interested in hg19
biocLite("BSgenome.Hsapiens.UCSC.hg19")
library(BSgenome.Hsapiens.UCSC.hg19)
#chromosomes of interest
my_chr <- c(1:22,'X','Y')
my_chr <- gsub(pattern="^", replacement='chr', my_chr)
#initialise list to store chromosome sizes
my_chr_size <- list()
for (i in my_chr){
my_chr_size[[i]] <- length(BSgenome.Hsapiens.UCSC.hg19[[i]])
}
#checkout my_chr_size
head(my_chr_size,2)
#$chr1
#[1] 249250621
#
#$chr2
#[1] 243199373
#initialise some vectors for storing random coordinates
my_random_start <- vector()
my_random_end <- vector()
my_random_chr <- vector()
my_random_strand <- vector()
set.seed(12345)
#loop through number of regions
for(i in 1:number){
my_random_chr[i] <- sample(x=my_chr,size=1)
my_random_strand[i] <- sample(x=c('-','+'),size=1)
my_max <- my_chr_size[[my_random_chr[i]]]-span
my_random_start[i] <- runif(n=1, min=1, max=my_max)
my_random_end[i] <- my_random_start[i] + span
}
my_random_loci <- data.frame(chr=my_random_chr,
start=my_random_start,
end=my_random_end,
strand=my_random_strand)
head(my_random_loci)
# chr start end strand
#1 chr18 59415403 59415407 +
#2 chr22 8535632 8535636 -
#3 chr8 106509865 106509869 +
#4 chrY 9046958 9046962 -
#5 chr18 30544079 30544083 -
#6 chr12 53873398 53873402 -

view raw

random_region.R

hosted with ❤ by GitHub

Since some of the randomly sampled regions may overlap/intersect with TSSs, I used the GenomicRanges package to remove these overlapping regions:


#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("GenomicRanges")
#load library
library(GenomicRanges)
#create a GRanges object given an object, my_refseq_loci
head(my_refseq_loci,2)
# refseq_mrna chromosome_name transcript_start transcript_end strand
#1 NM_001084392 chr22 24313554 24316773 -
#2 NM_001355 chr22 24313554 24322019 -
my_refseq_gr <- with(my_refseq_loci,
GRanges(chromosome_name,
IRanges(transcript_start, transcript_end,names=refseq_mrna),
strand
)
)
my_refseq_gr
#GRanges with 33584 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# NM_001084392 chr22 [24313554, 24316773] -
# NM_001355 chr22 [24313554, 24322019] -
# NM_138450 chr13 [50202435, 50208008] +
# NM_022840 chr18 [ 2538452, 2571485] -
# NM_033453 chr20 [ 3190006, 3204516] +
# ... ... ... ...
# NM_170783 chr6 [ 30029031, 30032686] +
# NM_014596 chr6 [ 30029036, 30032686] +
# NM_144600 chr16 [ 15959577, 15982472] -
# NM_173689 chr9 [126118539, 126141032] +
# NM_021959 chr6 [ 30034865, 30038110] +
# ---
# seqlengths:
# chr1 chr10 chr11 chr12 chr13 chr14 chr15 ... chr6 chr7 chr8 chr9 chrX chrY
# NA NA NA NA NA NA NA ... NA NA NA NA NA NA
#create another GRanges object
my_random_gr <- with(my_random_loci,
GRanges(chr,
IRanges(start, end, names=row.names(my_random_loci)),
strand
)
)
#how many overlap?
table(countOverlaps(my_random_gr, my_refseq_gr))
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13
#41077 4955 2014 965 414 218 138 68 43 30 25 7 14 11
# 14 15 16 17 20 21 22 23 28 31
# 3 1 1 3 4 1 1 5 1 1
#store overlap
my_overlap <- countOverlaps(my_random_gr, my_refseq_gr)
#how many overlaps
table(my_overlap>0)
#FALSE TRUE
#41077 8923
#remove the overlapping ids from my_random_loci
my_random_loci <- my_random_loci[my_overlap==0,]
#check the dimensions
dim(my_random_loci)
#[1] 41077 4

Now to fetch the sequences and calculate the dinucleotide frequencies:


#I want to fetch sequences from
#my_random_loci and my_refseq_tss
head(my_random_loci,2)
chr start end strand
1 chr18 59415403 59415407 +
2 chr22 8535632 8535636 -
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("BSgenome.Hsapiens.UCSC.hg19")
#load if necessary
library(BSgenome.Hsapiens.UCSC.hg19)
#obtain the sequences
my_random_loci_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19,
names=my_random_loci$chr,
start=my_random_loci$start,
end=my_random_loci$end,
strand=my_random_loci$strand)
head(my_random_loci_seq)
# A DNAStringSet instance of length 6
# width seq
#[1] 5 CTCAG
#[2] 5 NNNNN
#[3] 5 CCTCA
#[4] 5 AGAAA
#[5] 5 TAATT
#[6] 5 CCCTC
#I don't want entries with N's
#how many are there?
length(grep(pattern="N",x=my_random_loci_seq))
#[1] 5150
#remove them
my_random_loci_seq <- my_random_loci_seq[-grep(pattern="N",x=my_random_loci_seq)]
#do the same for my_refseq_tss
my_refseq_tss_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19,
names=my_refseq_tss$chromosome_name,
start=my_refseq_tss$transcript_start,
end=my_refseq_tss$transcript_end,
strand=my_refseq_tss$strand)
head(my_refseq_tss_seq)
# A DNAStringSet instance of length 6
# width seq
#[1] 5 TTGTT
#[2] 5 GGGAT
#[3] 5 GGCAC
#[4] 5 CGCTG
#[5] 5 CAGCA
#[6] 5 GTGGC
#calculate the dinucleotide frequency
my_random_loci_seq_di <- dinucleotideFrequency(my_random_loci_seq)
colSums(my_random_loci_seq_di)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC
#14142 7386 9968 11071 10667 7813 1401 10098 8640 6150 7471 7147 9097 8622
# TG TT
#10424 13611
my_refseq_tss_seq_di <- dinucleotideFrequency(my_refseq_tss_seq)
colSums(my_refseq_tss_seq_di)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC
# 5868 5941 10382 5275 10694 11895 9787 8117 8649 13169 13423 6469 3493 7238
# TG TT
# 8390 5546
#how about the number of combinations?
my_random_loci_seq_di_comb <- table(apply(my_random_loci_seq_di, 1, function(x) paste(x, collapse='')))
head(my_random_loci_seq_di_comb)
#
#0000000000000004 0000000000000013 0000000000000103 0000000000001003 0000000000010003
# 234 74 81 110 70
#0000000000010012
# 231
head(my_random_loci_seq_di_comb[order(my_random_loci_seq_di_comb, decreasing=T)])
#
#2001000000001000 0001000000001002 2010000010000000 0000000100000102 2100100000000000
# 395 393 371 341 287
#1001000000001001
# 275
length(unique(my_random_loci_seq_di_comb))
#[1] 158
#combinations for RefSeq
my_refseq_tss_seq_di_comb <- table(apply(my_refseq_tss_seq_di, 1, function(x) paste(x, collapse='')))
head(my_refseq_tss_seq_di_comb[order(my_refseq_tss_seq_di_comb, decreasing=T)])
#
#0000001001200000 0000021001000000 0010000010200000 0000011001100000 0010110001000000
# 627 400 397 363 338
#0001110000000010
# 334
length(unique(my_refseq_tss_seq_di_comb))
[1] 156
#not run
#par(mfrow=c(2,1))
#plot(density(my_refseq_tss_seq_di_comb))
#plot(density(my_random_loci_seq_di_comb))
#par(mfrow=c(1,1))

view raw

get_sequence.R

hosted with ❤ by GitHub

Lastly, let's use random forests to train a predictor using the TSSs and random dinucleotide frequencies:


#install if necessary
install.packages("randomForest")
#load library
library(randomForest)
#I have two sets of dinucleotide counts stored in
#my_random_loci_seq_di and my_refseq_tss_seq_di
head(my_refseq_tss_seq_di,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
#[1,] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2
#[2,] 0 0 0 1 0 0 0 0 1 0 2 0 0 0 0 0
head(my_random_loci_seq_di,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
#[1,] 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0
#[2,] 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0
#sample a set for training from the refseq set
set.seed(23456)
my_refseq_sample_index <- sample(
x=1:nrow(my_refseq_tss_seq_di),
size=round(nrow(my_refseq_tss_seq_di)/2),
replace=F)
head(my_refseq_sample_index)
#[1] 15687 25731 23143 30129 26312 15763
#store the sample
my_refseq_sample <- as.data.frame(my_refseq_tss_seq_di[my_refseq_sample_index,])
#create tss class
my_refseq_sample$class <- rep('tss', round(nrow(my_refseq_tss_seq_di)/2))
head(my_refseq_sample,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
#1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss
#2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss
#sample a set for training from the random set
set.seed(23456)
my_random_sample_index <- sample(
x=1:nrow(my_random_loci_seq_di),
size=round(nrow(my_random_loci_seq_di)/2),
replace=F)
#store the sample
my_random_sample <- as.data.frame(my_random_loci_seq_di[my_random_sample_index,])
#create random class
my_random_sample$class <- rep('random',round(nrow(my_random_loci_seq_di)/2))
#combine the two samples
train <- rbind(my_refseq_sample,my_random_sample)
#set the class as a factor
train$class <- as.factor(train$class)
#check out train
dim(train)
#[1] 34756 17
head(train,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
#1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss
#2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss
tail(train,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
#34755 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 random
#34756 0 0 0 0 0 0 0 0 0 1 0 1 0 0 2 0 random
#training
r <- randomForest(class ~ ., data=train, importance=TRUE, do.trace=100)
#ntree OOB 1 2
# 100: 30.64% 25.14% 36.52%
# 200: 30.51% 24.96% 36.46%
# 300: 30.40% 24.85% 36.33%
# 400: 30.47% 24.98% 36.34%
# 500: 30.47% 24.89% 36.44%
r
#Call:
# randomForest(formula = class ~ ., data = train, importance = TRUE, do.trace = 100)
# Type of random forest: classification
# Number of trees: 500
#No. of variables tried at each split: 4
#
# OOB estimate of error rate: 30.47%
#Confusion matrix:
# random tss class.error
#random 13493 4471 0.2488867
#tss 6119 10673 0.3643997
#install if necessary
install.packages('pROC')
#load library
library(pROC)
#check roc
roc(r$y, r$votes)
#Call:
#roc.default(response = r$y, predictor = r$votes)
#
#Data: r$votes in 35928 controls (r$y random) < 33584 cases (r$y tss).
#Area under the curve: 0.5
#check ci
ci.auc(r$y, r$votes)
#check importance
rn <- round(importance(r), 2)
rn[order(rn[,3], decreasing=TRUE),]
# random tss MeanDecreaseAccuracy MeanDecreaseGini
#CG 96.82 32.57 76.12 1143.27
#TG 15.52 43.58 69.24 186.87
#GG -0.22 51.59 61.87 211.40
#GC 39.09 24.54 59.69 391.97
#CT 19.01 28.02 58.98 127.87
#CA -27.72 51.29 58.50 152.47
#CC 6.32 45.02 57.33 191.00
#AC 8.66 33.77 55.31 102.66
#GT 13.95 28.26 55.19 111.89
#AT -16.42 55.89 54.25 206.15
#TC 23.27 17.44 53.59 105.38
#TA -16.51 58.66 53.50 261.05
#AA 23.41 34.41 52.59 311.58
#AG -14.48 38.80 51.53 119.44
#GA -14.70 36.49 46.55 84.82
#TT 12.32 38.70 43.29 274.56
#plot ROC
#install if necessary
install.packages('verification')
#load library
library(verification)
aucc <- roc.area(as.integer(train$class=='tss'), r$votes[,2])$A
roc.plot(as.integer(train$class=='tss'), r$votes[,2], main="")
legend("bottomright", bty="n", sprintf("Area Under the Curve (AUC) = %1.3f", aucc))
title(main="OOB ROC Curve Random Forest for predicting TSS")
#creating the test set
my_random_test <- as.data.frame(my_random_loci_seq_di[-my_random_sample_index,])
my_random_test$class <- rep('random',nrow(my_random_test))
my_refseq_test <- as.data.frame(my_refseq_tss_seq_di[-my_refseq_sample_index,])
my_refseq_test$class <- rep('tss',nrow(my_refseq_test))
test <- rbind(my_refseq_test,my_random_test)
test$class <- as.factor(test$class)
dim(test)
#[1] 34755 17
data.predict <- predict(r, test)
t = table(observed=test[,'class'], predict=data.predict)
prop.table(t,1)
# predict
#observed random tss
# random 0.7639592 0.2360408
# tss 0.3632087 0.6367913

view raw

random_forest.R

hosted with ❤ by GitHub

oob_roc

Conclusions

When I first calculated the dinucleotide frequencies, the distributions were clearly different between the random regions and the TSSes:

colSums(my_random_loci_seq_di)
#   AA    AC    AG    AT    CA    CC    CG    CT    GA    GC    GG    GT    TA    TC 
#14142  7386  9968 11071 10667  7813  1401 10098  8640  6150  7471  7147  9097  8622 
#   TG    TT 
#10424 13611 
 
my_refseq_tss_seq_di <- dinucleotideFrequency(my_refseq_tss_seq)
colSums(my_refseq_tss_seq_di)
#   AA    AC    AG    AT    CA    CC    CG    CT    GA    GC    GG    GT    TA    TC 
# 5868  5941 10382  5275 10694 11895  9787  8117  8649 13169 13423  6469  3493  7238 
#   TG    TT 
# 8390  5546 

The CG difference pops out immediately, with a much lower frequency in the random regions. Indeed when calculating the importance of the variables, the CG dinucleotide was at the top.

Also the predictor was much better at calling random sites random, than TSS sites TSSes:

prop.table(t,1)
        predict
observed    random       tss
  random 0.7639592 0.2360408
  tss    0.3632087 0.6367913

The RefSeq collection of mRNA sequences is quite small when compared to UCSC genes or GENCODE. So it may be that I happened to sample regions that were the start of mRNAs missing in RefSeq.

I actually made the effort to write the code so that I could easily adjust some parameters and redo the entire analysis. So, I thought, what if I increased the region I used to train the classifier? I used 5 bp windows and had an error rate of ~31%. I increased this region to a 9 bp window and redid the entire analysis; this was the result:

prop.table(t,1)
        predict
observed    random       tss
  random 0.8470255 0.1529745
  tss    0.3877674 0.6122326

I increased the accuracy of calling a random region, random. However, the false positive rate of TSSes was almost the same and I wonder if the main reason is that I sampled mRNAs not included in RefSeq.

Positional information

One piece of information regarding the sequence composition that is lost, is the positional dinucleotide frequency, i.e. the dinucleotide frequency at each position of the string. The Biostrings package did have a nucleotideFrequencyAt() function, however it would only return the single nucleotide frequency at specific sites. So below is my implementation of calculating positional dinucleotide frequencies:

#quick demo of the nucleotideFrequencyAt() function
#get the nucleotide frequency at -1
nucleotideFrequencyAt(my_refseq_tss_seq,10)
    A     C     G     T 
 4543 14755  6628  7657
#at the TSS
nucleotideFrequencyAt(my_refseq_tss_seq,11)
    A     C     G     T 
12094  6621 11744  3124
#at +1
nucleotideFrequencyAt(my_refseq_tss_seq,12)
    A     C     G     T 
 6650  8376 10502  8055

#for this example I increased the span to 10
#so I have strings of length 21 that represent the TSS of refSeqs
#follow the code above to get my_refseq_tss_seq
head(my_refseq_tss_seq)
  A DNAStringSet instance of length 6
    width seq
[1]    21 AGAGGCTGTGATCCCTTCAAG
[2]    21 AGAGGCTGTGATCCCTTCAAG
[3]    21 CTCCTTCAGGCACCCAAGTCT
[4]    21 GCGCGCAGTAACTTCCATGCG
[5]    21 AGAGGTTCAGCAGACCCACCT
[6]    21 AGAGGTTCAGCAGACCCACCT
#remove any entries with N
my_refseq_tss_seq <- my_refseq_tss_seq[-grep(pattern="N",x=my_refseq_tss_seq)]

#initialise data frame
my_refseq_tss_dinuc_pos <- data.frame()

#calculate dinucleotide at position 2-3
tmp <- table(substr(my_refseq_tss_seq, 2, 3))
tmp

  AA   AC   AG   AT   CA   CC   CG   CT   GA   GC   GG   GT   TA   TC   TG   TT 
1480 1449 2420  766 1919 3648 2697 2219 1889 3378 3846 1478  689 1968 2013 1724

#calculate max_length for loop
max_length <- length(my_refseq_tss_seq[[1]]) - 1

for (i in 1:max_length){
  my_refseq_tss_dinuc_pos[1:16,i] <- as.vector(round(table(substr(my_refseq_tss_seq, i, i+1))/sum(tmp),3))
}

#create column names
relative_tss <- vector()
left <- -10
for (i in 0:19){
  first <- left + i
  second <- left + i + 1
  name <- paste(c(first, second),collapse='/')
  relative_tss[i+1] <- name
}

relative_tss
 [1] "-10/-9" "-9/-8"  "-8/-7"  "-7/-6"  "-6/-5"  "-5/-4"  "-4/-3"  "-3/-2"  "-2/-1"  "-1/0"   "0/1"    "1/2"    "2/3"   
[14] "3/4"    "4/5"    "5/6"    "6/7"    "7/8"    "8/9"    "9/10"

#rename columns
colnames(my_refseq_tss_dinuc_pos)  <- relative_tss
#rename row names to the dinucleotides
row.names(my_refseq_tss_dinuc_pos) <- names(tmp)
my_refseq_tss_dinuc_pos
   -10/-9 -9/-8 -8/-7 -7/-6 -6/-5 -5/-4 -4/-3 -3/-2 -2/-1  -1/0   0/1   1/2   2/3   3/4   4/5   5/6   6/7   7/8   8/9  9/10
AA  0.044 0.044 0.043 0.041 0.043 0.046 0.048 0.037 0.031 0.030 0.078 0.057 0.055 0.054 0.051 0.045 0.044 0.043 0.048 0.048
AC  0.042 0.043 0.041 0.042 0.036 0.045 0.042 0.041 0.052 0.027 0.071 0.041 0.045 0.046 0.047 0.044 0.039 0.044 0.038 0.043
AG  0.074 0.072 0.068 0.075 0.065 0.063 0.078 0.068 0.048 0.061 0.128 0.064 0.081 0.075 0.077 0.070 0.073 0.071 0.078 0.082
AT  0.025 0.023 0.026 0.027 0.026 0.026 0.028 0.029 0.031 0.018 0.082 0.035 0.036 0.035 0.032 0.034 0.028 0.026 0.025 0.030
CA  0.055 0.057 0.056 0.055 0.052 0.064 0.052 0.049 0.043 0.207 0.033 0.059 0.059 0.058 0.059 0.058 0.063 0.061 0.066 0.058
CC  0.109 0.109 0.107 0.107 0.107 0.101 0.108 0.111 0.160 0.083 0.054 0.063 0.070 0.081 0.098 0.095 0.099 0.104 0.095 0.089
CG  0.076 0.080 0.078 0.079 0.071 0.067 0.083 0.070 0.037 0.113 0.049 0.060 0.067 0.064 0.070 0.088 0.074 0.079 0.080 0.077
CT  0.071 0.066 0.070 0.067 0.077 0.066 0.078 0.073 0.083 0.037 0.061 0.068 0.065 0.063 0.066 0.069 0.067 0.069 0.070 0.066
GA  0.060 0.056 0.063 0.054 0.062 0.064 0.055 0.055 0.043 0.057 0.076 0.074 0.065 0.065 0.061 0.059 0.054 0.062 0.068 0.063
GC  0.100 0.101 0.104 0.104 0.100 0.103 0.092 0.105 0.133 0.047 0.098 0.094 0.089 0.106 0.104 0.099 0.110 0.107 0.100 0.111
GG  0.110 0.115 0.115 0.121 0.121 0.104 0.106 0.113 0.079 0.076 0.111 0.093 0.097 0.092 0.089 0.099 0.112 0.108 0.107 0.110
GT  0.044 0.044 0.045 0.042 0.053 0.043 0.042 0.050 0.053 0.018 0.065 0.051 0.042 0.047 0.040 0.041 0.041 0.046 0.041 0.044
TA  0.023 0.021 0.023 0.021 0.023 0.023 0.020 0.021 0.018 0.067 0.011 0.027 0.031 0.028 0.023 0.023 0.023 0.022 0.022 0.019
TC  0.061 0.059 0.057 0.054 0.055 0.072 0.060 0.066 0.094 0.041 0.026 0.063 0.063 0.060 0.062 0.064 0.064 0.056 0.059 0.058
TG  0.055 0.060 0.059 0.061 0.057 0.062 0.056 0.057 0.033 0.100 0.025 0.075 0.066 0.063 0.062 0.061 0.063 0.058 0.062 0.061
TT  0.050 0.051 0.046 0.051 0.052 0.052 0.051 0.055 0.061 0.020 0.031 0.076 0.070 0.062 0.062 0.051 0.045 0.045 0.042 0.041

#sanity check
colSums(my_refseq_tss_dinuc_pos)
-10/-9  -9/-8  -8/-7  -7/-6  -6/-5  -5/-4  -4/-3  -3/-2  -2/-1   -1/0    0/1    1/2    2/3    3/4    4/5    5/6    6/7 
 0.999  1.001  1.001  1.001  1.000  1.001  0.999  1.000  0.999  1.002  0.999  1.000  1.001  0.999  1.003  1.000  0.999 
   7/8    8/9   9/10 
 1.001  1.001  1.000

The position dinucleotide biases from RefSeqs are comparable to those calculated dinucleotide frequencies around TSSs from CAGE data.

Visualising

We can use the nucleotideFrequencyAt() function to calculate the nucleotide frequencies and to create a sequence logo:

#initialise data frame
my_refseq_tss_nuc_pos <- data.frame()

tmp <- nucleotideFrequencyAt(my_refseq_tss_seq,1)
tmp
    A     C     G     T 
 6193 10469 10563  6358

for (i in 1:21){
  my_refseq_tss_nuc_pos[1:4,i] <- nucleotideFrequencyAt(my_refseq_tss_seq,i)
}

row.names(my_refseq_tss_nuc_pos) <- names(tmp)
colnames(my_refseq_tss_nuc_pos) <- c(1:21)
my_refseq_tss_nuc_pos
      1     2     3     4     5     6     7     8     9    10    11    12   13    14   15    16    17    18    19    20
A  6193  6115  5977  6207  5731  6044  6612  5872  5440  4543 12094  6650 7295  7037 6901  6465  6204  6176  6340  6835
C 10469 10483 10443 10347 10286  9999 10769 10162 10867 14755  6621  8376 8749  8954 9851 10427 10173 10483 10438  9763
G 10563 10591 10976 10735 11280 10542  9941 10852 10345  6628 11744 10502 9821 10415 9859 10000 10679 10833 10599 10994
T  6358  6394  6187  6294  6286  6998  6261  6697  6931  7657  3124  8055 7718  7177 6972  6691  6527  6091  6206  5991
     21
A  6291
C 10099
G 11106
T  6087

df <- t(my_refseq_tss_nuc_pos)
df
       A     C     G    T
1   6193 10469 10563 6358
2   6115 10483 10591 6394
3   5977 10443 10976 6187
4   6207 10347 10735 6294
5   5731 10286 11280 6286
6   6044  9999 10542 6998
7   6612 10769  9941 6261
8   5872 10162 10852 6697
9   5440 10867 10345 6931
10  4543 14755  6628 7657
11 12094  6621 11744 3124
12  6650  8376 10502 8055
13  7295  8749  9821 7718
14  7037  8954 10415 7177
15  6901  9851  9859 6972
16  6465 10427 10000 6691
17  6204 10173 10679 6527
18  6176 10483 10833 6091
19  6340 10438 10599 6206
20  6835  9763 10994 5991
21  6291 10099 11106 6087

#define function that divides the frequency by the row sum i.e. proportions
proportion <- function(x){
   rs <- sum(x);
   return(x / rs);
}

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("seqLogo")
 
library(seqLogo)

pwm <- apply(df, 1, proportion)
pwm <- makePWM(pwm)
seqLogo(pwm=pwm, ic.scale=F)

nucleotide_biasEach column is NOT proportional to its information content, but just the distribution of nucleotides.

Print Friendly, PDF & Email



Creative Commons License
This work is licensed under a Creative Commons
Attribution 4.0 International License
.

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.