Updated: 2013 November 28th
The sequence composition or the nucleotide composition at transcriptional starting sites (TSSs) of mRNAs are biased, i.e. certain nucleotides are preferred. Here I examine the sequence composition at the TSS of the NCBI Reference Sequence Database also known as RefSeq and use random forests to see if it's possible to train a classifier that can identify TSSs from random sequences. This work is done entirely using R and all code hosted on GitHub Gist.
Firstly, let's download the entire collection of RefSeqs using the R Bioconductor package biomaRt and create a data frame with the TSSs. Note that when working with transcripts, use the attributes 'transcript_start' and 'transcript_end', and not the attributes 'start_position' and 'end_position'; using those will give you the Ensembl gene coordinates.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #install if necessary | |
| source("http://bioconductor.org/biocLite.R") | |
| biocLite("biomaRt") | |
| #load library | |
| library("biomaRt") | |
| #use ensembl mart | |
| ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl") | |
| #store the filters | |
| filters <- listFilters(ensembl) | |
| #look for filters with the string refseq | |
| grep('refseq', filters$name, ignore.case=T, value=T) | |
| #[1] "with_refseq_peptide" "with_refseq_peptide_predicted" "with_ox_refseq_mrna" | |
| #[4] "with_ox_refseq_mrna_predicted" "with_ox_refseq_ncrna" "with_ox_refseq_ncrna_predicted" | |
| #[7] "refseq_mrna" "refseq_mrna_predicted" "refseq_ncrna" | |
| #[10] "refseq_ncrna_predicted" "refseq_peptide" "refseq_peptide_predicted" | |
| #store the attributes we can fetch | |
| attributes <- listAttributes(ensembl) | |
| #check out the first 10 attributes | |
| head(attributes,10) | |
| # name description | |
| #1 ensembl_gene_id Ensembl Gene ID | |
| #2 ensembl_transcript_id Ensembl Transcript ID | |
| #3 ensembl_peptide_id Ensembl Protein ID | |
| #4 ensembl_exon_id Ensembl Exon ID | |
| #5 description Description | |
| #6 chromosome_name Chromosome Name | |
| #7 start_position Gene Start (bp) | |
| #8 end_position Gene End (bp) | |
| #9 strand Strand | |
| #10 band Band | |
| #create vector of chromosomes | |
| my_chr <- c(1:22,'X','Y') | |
| #fetch refseqs | |
| my_refseq <- getBM(attributes='refseq_mrna', | |
| filters = 'chromosome_name', | |
| values = my_chr, | |
| mart = ensembl) | |
| #how many entries 2013 November 28th | |
| length(my_refseq) | |
| [1] 35423 | |
| #check out the first few | |
| head(my_refseq) | |
| [1] "NM_001084392" "NM_001355" "NR_036221" "NM_138450" "NM_022840" | |
| [6] "NM_173505" | |
| #I only want entries starting with a NM (curated mRNA) | |
| my_refseq <- my_refseq[grep(pattern="^NM",x=my_refseq,perl=T)] | |
| length(my_refseq) | |
| [1] 33584 | |
| #check to see if only NM | |
| head(my_refseq) | |
| [1] "NM_001084392" "NM_001355" "NM_138450" "NM_022840" "NM_173505" | |
| [6] "NM_033453" | |
| #build attribute vector | |
| my_attribute <- c('refseq_mrna', | |
| 'chromosome_name', | |
| 'transcript_start', | |
| 'transcript_end', | |
| 'strand') | |
| #fetch refseqs and their chromosomal locations | |
| my_refseq_loci <- getBM(attributes=my_attribute, | |
| filters = c('refseq_mrna', 'chromosome_name'), | |
| values = list(refseq_mrna=my_refseq, chromosome_name=my_chr), | |
| mart = ensembl) | |
| dim(my_refseq_loci) | |
| #[1] 33657 5 | |
| #how many refseq ids are listed in multiple places? | |
| table(duplicated(my_refseq_loci$refseq_mrna)) | |
| #FALSE TRUE | |
| #33584 73 | |
| #get rid of the duplicated entry | |
| my_refseq_loci <- my_refseq_loci[!duplicated(my_refseq_loci$refseq_mrna),] | |
| dim(my_refseq_loci) | |
| #[1] 33584 5 | |
| #convert the strand into '-' and '+' | |
| my_refseq_loci$strand <- gsub(pattern='-1', replacement='-', my_refseq_loci$strand) | |
| my_refseq_loci$strand <- gsub(pattern='1', replacement='+', my_refseq_loci$strand) | |
| #add a 'chr' into the chromosome_name | |
| my_refseq_loci$chromosome_name <- gsub(pattern="^", | |
| replacement='chr', | |
| my_refseq_loci$chromosome_name) | |
| #we now have a data frame of all human mRNA refseqs and their chromosomal locations | |
| head(my_refseq_loci) | |
| # refseq_mrna chromosome_name transcript_start transcript_end strand | |
| #1 NM_001084392 chr22 24313554 24316773 - | |
| #2 NM_001355 chr22 24313554 24322019 - | |
| #3 NM_138450 chr13 50202435 50208008 + | |
| #4 NM_022840 chr18 2538452 2571485 - | |
| #5 NM_033453 chr20 3190006 3204516 + | |
| #6 NM_001267623 chr20 3190171 3204516 + | |
| #I want locations of the region spanning the start of a refSeq | |
| span <- 2 | |
| #store as another object | |
| my_refseq_tss <- my_refseq_loci | |
| #positive strand | |
| #adjust the end position first, because we need the start position in our calculations | |
| my_refseq_tss[my_refseq_tss$strand=='+','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']+span | |
| my_refseq_tss[my_refseq_tss$strand=='+','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']-span | |
| #negative strand | |
| my_refseq_tss[my_refseq_tss$strand=='-','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']-span | |
| my_refseq_tss[my_refseq_tss$strand=='-','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']+span | |
| head(my_refseq_tss) | |
| # refseq_mrna chromosome_name transcript_start transcript_end strand | |
| #1 NM_001084392 chr22 24316771 24316775 - | |
| #2 NM_001355 chr22 24322017 24322021 - | |
| #3 NM_138450 chr13 50202433 50202437 + | |
| #4 NM_022840 chr18 2571483 2571487 - | |
| #5 NM_033453 chr20 3190004 3190008 + | |
| #6 NM_001267623 chr20 3190169 3190173 + |
Now let's sample a bunch of random sequences in the hg19 genome:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #how many regions to sample? | |
| number <- 50000 | |
| #how many bp to add to start | |
| span <- 4 | |
| #some necessary packages | |
| #install if necessary | |
| source("http://bioconductor.org/biocLite.R") | |
| biocLite("BSgenome") | |
| #load library | |
| library(BSgenome) | |
| #find available genomes | |
| available.genomes() | |
| #I'm interested in hg19 | |
| biocLite("BSgenome.Hsapiens.UCSC.hg19") | |
| library(BSgenome.Hsapiens.UCSC.hg19) | |
| #chromosomes of interest | |
| my_chr <- c(1:22,'X','Y') | |
| my_chr <- gsub(pattern="^", replacement='chr', my_chr) | |
| #initialise list to store chromosome sizes | |
| my_chr_size <- list() | |
| for (i in my_chr){ | |
| my_chr_size[[i]] <- length(BSgenome.Hsapiens.UCSC.hg19[[i]]) | |
| } | |
| #checkout my_chr_size | |
| head(my_chr_size,2) | |
| #$chr1 | |
| #[1] 249250621 | |
| # | |
| #$chr2 | |
| #[1] 243199373 | |
| #initialise some vectors for storing random coordinates | |
| my_random_start <- vector() | |
| my_random_end <- vector() | |
| my_random_chr <- vector() | |
| my_random_strand <- vector() | |
| set.seed(12345) | |
| #loop through number of regions | |
| for(i in 1:number){ | |
| my_random_chr[i] <- sample(x=my_chr,size=1) | |
| my_random_strand[i] <- sample(x=c('-','+'),size=1) | |
| my_max <- my_chr_size[[my_random_chr[i]]]-span | |
| my_random_start[i] <- runif(n=1, min=1, max=my_max) | |
| my_random_end[i] <- my_random_start[i] + span | |
| } | |
| my_random_loci <- data.frame(chr=my_random_chr, | |
| start=my_random_start, | |
| end=my_random_end, | |
| strand=my_random_strand) | |
| head(my_random_loci) | |
| # chr start end strand | |
| #1 chr18 59415403 59415407 + | |
| #2 chr22 8535632 8535636 - | |
| #3 chr8 106509865 106509869 + | |
| #4 chrY 9046958 9046962 - | |
| #5 chr18 30544079 30544083 - | |
| #6 chr12 53873398 53873402 - |
Since some of the randomly sampled regions may overlap/intersect with TSSs, I used the GenomicRanges package to remove these overlapping regions:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #install if necessary | |
| source("http://bioconductor.org/biocLite.R") | |
| biocLite("GenomicRanges") | |
| #load library | |
| library(GenomicRanges) | |
| #create a GRanges object given an object, my_refseq_loci | |
| head(my_refseq_loci,2) | |
| # refseq_mrna chromosome_name transcript_start transcript_end strand | |
| #1 NM_001084392 chr22 24313554 24316773 - | |
| #2 NM_001355 chr22 24313554 24322019 - | |
| my_refseq_gr <- with(my_refseq_loci, | |
| GRanges(chromosome_name, | |
| IRanges(transcript_start, transcript_end,names=refseq_mrna), | |
| strand | |
| ) | |
| ) | |
| my_refseq_gr | |
| #GRanges with 33584 ranges and 0 metadata columns: | |
| # seqnames ranges strand | |
| # <Rle> <IRanges> <Rle> | |
| # NM_001084392 chr22 [24313554, 24316773] - | |
| # NM_001355 chr22 [24313554, 24322019] - | |
| # NM_138450 chr13 [50202435, 50208008] + | |
| # NM_022840 chr18 [ 2538452, 2571485] - | |
| # NM_033453 chr20 [ 3190006, 3204516] + | |
| # ... ... ... ... | |
| # NM_170783 chr6 [ 30029031, 30032686] + | |
| # NM_014596 chr6 [ 30029036, 30032686] + | |
| # NM_144600 chr16 [ 15959577, 15982472] - | |
| # NM_173689 chr9 [126118539, 126141032] + | |
| # NM_021959 chr6 [ 30034865, 30038110] + | |
| # --- | |
| # seqlengths: | |
| # chr1 chr10 chr11 chr12 chr13 chr14 chr15 ... chr6 chr7 chr8 chr9 chrX chrY | |
| # NA NA NA NA NA NA NA ... NA NA NA NA NA NA | |
| #create another GRanges object | |
| my_random_gr <- with(my_random_loci, | |
| GRanges(chr, | |
| IRanges(start, end, names=row.names(my_random_loci)), | |
| strand | |
| ) | |
| ) | |
| #how many overlap? | |
| table(countOverlaps(my_random_gr, my_refseq_gr)) | |
| # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 | |
| #41077 4955 2014 965 414 218 138 68 43 30 25 7 14 11 | |
| # 14 15 16 17 20 21 22 23 28 31 | |
| # 3 1 1 3 4 1 1 5 1 1 | |
| #store overlap | |
| my_overlap <- countOverlaps(my_random_gr, my_refseq_gr) | |
| #how many overlaps | |
| table(my_overlap>0) | |
| #FALSE TRUE | |
| #41077 8923 | |
| #remove the overlapping ids from my_random_loci | |
| my_random_loci <- my_random_loci[my_overlap==0,] | |
| #check the dimensions | |
| dim(my_random_loci) | |
| #[1] 41077 4 |
Now to fetch the sequences and calculate the dinucleotide frequencies:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #I want to fetch sequences from | |
| #my_random_loci and my_refseq_tss | |
| head(my_random_loci,2) | |
| chr start end strand | |
| 1 chr18 59415403 59415407 + | |
| 2 chr22 8535632 8535636 - | |
| #install if necessary | |
| source("http://bioconductor.org/biocLite.R") | |
| biocLite("BSgenome.Hsapiens.UCSC.hg19") | |
| #load if necessary | |
| library(BSgenome.Hsapiens.UCSC.hg19) | |
| #obtain the sequences | |
| my_random_loci_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, | |
| names=my_random_loci$chr, | |
| start=my_random_loci$start, | |
| end=my_random_loci$end, | |
| strand=my_random_loci$strand) | |
| head(my_random_loci_seq) | |
| # A DNAStringSet instance of length 6 | |
| # width seq | |
| #[1] 5 CTCAG | |
| #[2] 5 NNNNN | |
| #[3] 5 CCTCA | |
| #[4] 5 AGAAA | |
| #[5] 5 TAATT | |
| #[6] 5 CCCTC | |
| #I don't want entries with N's | |
| #how many are there? | |
| length(grep(pattern="N",x=my_random_loci_seq)) | |
| #[1] 5150 | |
| #remove them | |
| my_random_loci_seq <- my_random_loci_seq[-grep(pattern="N",x=my_random_loci_seq)] | |
| #do the same for my_refseq_tss | |
| my_refseq_tss_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, | |
| names=my_refseq_tss$chromosome_name, | |
| start=my_refseq_tss$transcript_start, | |
| end=my_refseq_tss$transcript_end, | |
| strand=my_refseq_tss$strand) | |
| head(my_refseq_tss_seq) | |
| # A DNAStringSet instance of length 6 | |
| # width seq | |
| #[1] 5 TTGTT | |
| #[2] 5 GGGAT | |
| #[3] 5 GGCAC | |
| #[4] 5 CGCTG | |
| #[5] 5 CAGCA | |
| #[6] 5 GTGGC | |
| #calculate the dinucleotide frequency | |
| my_random_loci_seq_di <- dinucleotideFrequency(my_random_loci_seq) | |
| colSums(my_random_loci_seq_di) | |
| # AA AC AG AT CA CC CG CT GA GC GG GT TA TC | |
| #14142 7386 9968 11071 10667 7813 1401 10098 8640 6150 7471 7147 9097 8622 | |
| # TG TT | |
| #10424 13611 | |
| my_refseq_tss_seq_di <- dinucleotideFrequency(my_refseq_tss_seq) | |
| colSums(my_refseq_tss_seq_di) | |
| # AA AC AG AT CA CC CG CT GA GC GG GT TA TC | |
| # 5868 5941 10382 5275 10694 11895 9787 8117 8649 13169 13423 6469 3493 7238 | |
| # TG TT | |
| # 8390 5546 | |
| #how about the number of combinations? | |
| my_random_loci_seq_di_comb <- table(apply(my_random_loci_seq_di, 1, function(x) paste(x, collapse=''))) | |
| head(my_random_loci_seq_di_comb) | |
| # | |
| #0000000000000004 0000000000000013 0000000000000103 0000000000001003 0000000000010003 | |
| # 234 74 81 110 70 | |
| #0000000000010012 | |
| # 231 | |
| head(my_random_loci_seq_di_comb[order(my_random_loci_seq_di_comb, decreasing=T)]) | |
| # | |
| #2001000000001000 0001000000001002 2010000010000000 0000000100000102 2100100000000000 | |
| # 395 393 371 341 287 | |
| #1001000000001001 | |
| # 275 | |
| length(unique(my_random_loci_seq_di_comb)) | |
| #[1] 158 | |
| #combinations for RefSeq | |
| my_refseq_tss_seq_di_comb <- table(apply(my_refseq_tss_seq_di, 1, function(x) paste(x, collapse=''))) | |
| head(my_refseq_tss_seq_di_comb[order(my_refseq_tss_seq_di_comb, decreasing=T)]) | |
| # | |
| #0000001001200000 0000021001000000 0010000010200000 0000011001100000 0010110001000000 | |
| # 627 400 397 363 338 | |
| #0001110000000010 | |
| # 334 | |
| length(unique(my_refseq_tss_seq_di_comb)) | |
| [1] 156 | |
| #not run | |
| #par(mfrow=c(2,1)) | |
| #plot(density(my_refseq_tss_seq_di_comb)) | |
| #plot(density(my_random_loci_seq_di_comb)) | |
| #par(mfrow=c(1,1)) |
Lastly, let's use random forests to train a predictor using the TSSs and random dinucleotide frequencies:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #install if necessary | |
| install.packages("randomForest") | |
| #load library | |
| library(randomForest) | |
| #I have two sets of dinucleotide counts stored in | |
| #my_random_loci_seq_di and my_refseq_tss_seq_di | |
| head(my_refseq_tss_seq_di,2) | |
| # AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT | |
| #[1,] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 | |
| #[2,] 0 0 0 1 0 0 0 0 1 0 2 0 0 0 0 0 | |
| head(my_random_loci_seq_di,2) | |
| # AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT | |
| #[1,] 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 | |
| #[2,] 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 | |
| #sample a set for training from the refseq set | |
| set.seed(23456) | |
| my_refseq_sample_index <- sample( | |
| x=1:nrow(my_refseq_tss_seq_di), | |
| size=round(nrow(my_refseq_tss_seq_di)/2), | |
| replace=F) | |
| head(my_refseq_sample_index) | |
| #[1] 15687 25731 23143 30129 26312 15763 | |
| #store the sample | |
| my_refseq_sample <- as.data.frame(my_refseq_tss_seq_di[my_refseq_sample_index,]) | |
| #create tss class | |
| my_refseq_sample$class <- rep('tss', round(nrow(my_refseq_tss_seq_di)/2)) | |
| head(my_refseq_sample,2) | |
| # AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class | |
| #1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss | |
| #2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss | |
| #sample a set for training from the random set | |
| set.seed(23456) | |
| my_random_sample_index <- sample( | |
| x=1:nrow(my_random_loci_seq_di), | |
| size=round(nrow(my_random_loci_seq_di)/2), | |
| replace=F) | |
| #store the sample | |
| my_random_sample <- as.data.frame(my_random_loci_seq_di[my_random_sample_index,]) | |
| #create random class | |
| my_random_sample$class <- rep('random',round(nrow(my_random_loci_seq_di)/2)) | |
| #combine the two samples | |
| train <- rbind(my_refseq_sample,my_random_sample) | |
| #set the class as a factor | |
| train$class <- as.factor(train$class) | |
| #check out train | |
| dim(train) | |
| #[1] 34756 17 | |
| head(train,2) | |
| # AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class | |
| #1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss | |
| #2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss | |
| tail(train,2) | |
| # AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class | |
| #34755 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 random | |
| #34756 0 0 0 0 0 0 0 0 0 1 0 1 0 0 2 0 random | |
| #training | |
| r <- randomForest(class ~ ., data=train, importance=TRUE, do.trace=100) | |
| #ntree OOB 1 2 | |
| # 100: 30.64% 25.14% 36.52% | |
| # 200: 30.51% 24.96% 36.46% | |
| # 300: 30.40% 24.85% 36.33% | |
| # 400: 30.47% 24.98% 36.34% | |
| # 500: 30.47% 24.89% 36.44% | |
| r | |
| #Call: | |
| # randomForest(formula = class ~ ., data = train, importance = TRUE, do.trace = 100) | |
| # Type of random forest: classification | |
| # Number of trees: 500 | |
| #No. of variables tried at each split: 4 | |
| # | |
| # OOB estimate of error rate: 30.47% | |
| #Confusion matrix: | |
| # random tss class.error | |
| #random 13493 4471 0.2488867 | |
| #tss 6119 10673 0.3643997 | |
| #install if necessary | |
| install.packages('pROC') | |
| #load library | |
| library(pROC) | |
| #check roc | |
| roc(r$y, r$votes) | |
| #Call: | |
| #roc.default(response = r$y, predictor = r$votes) | |
| # | |
| #Data: r$votes in 35928 controls (r$y random) < 33584 cases (r$y tss). | |
| #Area under the curve: 0.5 | |
| #check ci | |
| ci.auc(r$y, r$votes) | |
| #check importance | |
| rn <- round(importance(r), 2) | |
| rn[order(rn[,3], decreasing=TRUE),] | |
| # random tss MeanDecreaseAccuracy MeanDecreaseGini | |
| #CG 96.82 32.57 76.12 1143.27 | |
| #TG 15.52 43.58 69.24 186.87 | |
| #GG -0.22 51.59 61.87 211.40 | |
| #GC 39.09 24.54 59.69 391.97 | |
| #CT 19.01 28.02 58.98 127.87 | |
| #CA -27.72 51.29 58.50 152.47 | |
| #CC 6.32 45.02 57.33 191.00 | |
| #AC 8.66 33.77 55.31 102.66 | |
| #GT 13.95 28.26 55.19 111.89 | |
| #AT -16.42 55.89 54.25 206.15 | |
| #TC 23.27 17.44 53.59 105.38 | |
| #TA -16.51 58.66 53.50 261.05 | |
| #AA 23.41 34.41 52.59 311.58 | |
| #AG -14.48 38.80 51.53 119.44 | |
| #GA -14.70 36.49 46.55 84.82 | |
| #TT 12.32 38.70 43.29 274.56 | |
| #plot ROC | |
| #install if necessary | |
| install.packages('verification') | |
| #load library | |
| library(verification) | |
| aucc <- roc.area(as.integer(train$class=='tss'), r$votes[,2])$A | |
| roc.plot(as.integer(train$class=='tss'), r$votes[,2], main="") | |
| legend("bottomright", bty="n", sprintf("Area Under the Curve (AUC) = %1.3f", aucc)) | |
| title(main="OOB ROC Curve Random Forest for predicting TSS") | |
| #creating the test set | |
| my_random_test <- as.data.frame(my_random_loci_seq_di[-my_random_sample_index,]) | |
| my_random_test$class <- rep('random',nrow(my_random_test)) | |
| my_refseq_test <- as.data.frame(my_refseq_tss_seq_di[-my_refseq_sample_index,]) | |
| my_refseq_test$class <- rep('tss',nrow(my_refseq_test)) | |
| test <- rbind(my_refseq_test,my_random_test) | |
| test$class <- as.factor(test$class) | |
| dim(test) | |
| #[1] 34755 17 | |
| data.predict <- predict(r, test) | |
| t = table(observed=test[,'class'], predict=data.predict) | |
| prop.table(t,1) | |
| # predict | |
| #observed random tss | |
| # random 0.7639592 0.2360408 | |
| # tss 0.3632087 0.6367913 |
Conclusions
When I first calculated the dinucleotide frequencies, the distributions were clearly different between the random regions and the TSSes:
colSums(my_random_loci_seq_di) # AA AC AG AT CA CC CG CT GA GC GG GT TA TC #14142 7386 9968 11071 10667 7813 1401 10098 8640 6150 7471 7147 9097 8622 # TG TT #10424 13611 my_refseq_tss_seq_di <- dinucleotideFrequency(my_refseq_tss_seq) colSums(my_refseq_tss_seq_di) # AA AC AG AT CA CC CG CT GA GC GG GT TA TC # 5868 5941 10382 5275 10694 11895 9787 8117 8649 13169 13423 6469 3493 7238 # TG TT # 8390 5546
The CG difference pops out immediately, with a much lower frequency in the random regions. Indeed when calculating the importance of the variables, the CG dinucleotide was at the top.
Also the predictor was much better at calling random sites random, than TSS sites TSSes:
prop.table(t,1)
predict
observed random tss
random 0.7639592 0.2360408
tss 0.3632087 0.6367913
The RefSeq collection of mRNA sequences is quite small when compared to UCSC genes or GENCODE. So it may be that I happened to sample regions that were the start of mRNAs missing in RefSeq.
I actually made the effort to write the code so that I could easily adjust some parameters and redo the entire analysis. So, I thought, what if I increased the region I used to train the classifier? I used 5 bp windows and had an error rate of ~31%. I increased this region to a 9 bp window and redid the entire analysis; this was the result:
prop.table(t,1)
predict
observed random tss
random 0.8470255 0.1529745
tss 0.3877674 0.6122326
I increased the accuracy of calling a random region, random. However, the false positive rate of TSSes was almost the same and I wonder if the main reason is that I sampled mRNAs not included in RefSeq.
Positional information
One piece of information regarding the sequence composition that is lost, is the positional dinucleotide frequency, i.e. the dinucleotide frequency at each position of the string. The Biostrings package did have a nucleotideFrequencyAt() function, however it would only return the single nucleotide frequency at specific sites. So below is my implementation of calculating positional dinucleotide frequencies:
#quick demo of the nucleotideFrequencyAt() function
#get the nucleotide frequency at -1
nucleotideFrequencyAt(my_refseq_tss_seq,10)
A C G T
4543 14755 6628 7657
#at the TSS
nucleotideFrequencyAt(my_refseq_tss_seq,11)
A C G T
12094 6621 11744 3124
#at +1
nucleotideFrequencyAt(my_refseq_tss_seq,12)
A C G T
6650 8376 10502 8055
#for this example I increased the span to 10
#so I have strings of length 21 that represent the TSS of refSeqs
#follow the code above to get my_refseq_tss_seq
head(my_refseq_tss_seq)
A DNAStringSet instance of length 6
width seq
[1] 21 AGAGGCTGTGATCCCTTCAAG
[2] 21 AGAGGCTGTGATCCCTTCAAG
[3] 21 CTCCTTCAGGCACCCAAGTCT
[4] 21 GCGCGCAGTAACTTCCATGCG
[5] 21 AGAGGTTCAGCAGACCCACCT
[6] 21 AGAGGTTCAGCAGACCCACCT
#remove any entries with N
my_refseq_tss_seq <- my_refseq_tss_seq[-grep(pattern="N",x=my_refseq_tss_seq)]
#initialise data frame
my_refseq_tss_dinuc_pos <- data.frame()
#calculate dinucleotide at position 2-3
tmp <- table(substr(my_refseq_tss_seq, 2, 3))
tmp
AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
1480 1449 2420 766 1919 3648 2697 2219 1889 3378 3846 1478 689 1968 2013 1724
#calculate max_length for loop
max_length <- length(my_refseq_tss_seq[[1]]) - 1
for (i in 1:max_length){
my_refseq_tss_dinuc_pos[1:16,i] <- as.vector(round(table(substr(my_refseq_tss_seq, i, i+1))/sum(tmp),3))
}
#create column names
relative_tss <- vector()
left <- -10
for (i in 0:19){
first <- left + i
second <- left + i + 1
name <- paste(c(first, second),collapse='/')
relative_tss[i+1] <- name
}
relative_tss
[1] "-10/-9" "-9/-8" "-8/-7" "-7/-6" "-6/-5" "-5/-4" "-4/-3" "-3/-2" "-2/-1" "-1/0" "0/1" "1/2" "2/3"
[14] "3/4" "4/5" "5/6" "6/7" "7/8" "8/9" "9/10"
#rename columns
colnames(my_refseq_tss_dinuc_pos) <- relative_tss
#rename row names to the dinucleotides
row.names(my_refseq_tss_dinuc_pos) <- names(tmp)
my_refseq_tss_dinuc_pos
-10/-9 -9/-8 -8/-7 -7/-6 -6/-5 -5/-4 -4/-3 -3/-2 -2/-1 -1/0 0/1 1/2 2/3 3/4 4/5 5/6 6/7 7/8 8/9 9/10
AA 0.044 0.044 0.043 0.041 0.043 0.046 0.048 0.037 0.031 0.030 0.078 0.057 0.055 0.054 0.051 0.045 0.044 0.043 0.048 0.048
AC 0.042 0.043 0.041 0.042 0.036 0.045 0.042 0.041 0.052 0.027 0.071 0.041 0.045 0.046 0.047 0.044 0.039 0.044 0.038 0.043
AG 0.074 0.072 0.068 0.075 0.065 0.063 0.078 0.068 0.048 0.061 0.128 0.064 0.081 0.075 0.077 0.070 0.073 0.071 0.078 0.082
AT 0.025 0.023 0.026 0.027 0.026 0.026 0.028 0.029 0.031 0.018 0.082 0.035 0.036 0.035 0.032 0.034 0.028 0.026 0.025 0.030
CA 0.055 0.057 0.056 0.055 0.052 0.064 0.052 0.049 0.043 0.207 0.033 0.059 0.059 0.058 0.059 0.058 0.063 0.061 0.066 0.058
CC 0.109 0.109 0.107 0.107 0.107 0.101 0.108 0.111 0.160 0.083 0.054 0.063 0.070 0.081 0.098 0.095 0.099 0.104 0.095 0.089
CG 0.076 0.080 0.078 0.079 0.071 0.067 0.083 0.070 0.037 0.113 0.049 0.060 0.067 0.064 0.070 0.088 0.074 0.079 0.080 0.077
CT 0.071 0.066 0.070 0.067 0.077 0.066 0.078 0.073 0.083 0.037 0.061 0.068 0.065 0.063 0.066 0.069 0.067 0.069 0.070 0.066
GA 0.060 0.056 0.063 0.054 0.062 0.064 0.055 0.055 0.043 0.057 0.076 0.074 0.065 0.065 0.061 0.059 0.054 0.062 0.068 0.063
GC 0.100 0.101 0.104 0.104 0.100 0.103 0.092 0.105 0.133 0.047 0.098 0.094 0.089 0.106 0.104 0.099 0.110 0.107 0.100 0.111
GG 0.110 0.115 0.115 0.121 0.121 0.104 0.106 0.113 0.079 0.076 0.111 0.093 0.097 0.092 0.089 0.099 0.112 0.108 0.107 0.110
GT 0.044 0.044 0.045 0.042 0.053 0.043 0.042 0.050 0.053 0.018 0.065 0.051 0.042 0.047 0.040 0.041 0.041 0.046 0.041 0.044
TA 0.023 0.021 0.023 0.021 0.023 0.023 0.020 0.021 0.018 0.067 0.011 0.027 0.031 0.028 0.023 0.023 0.023 0.022 0.022 0.019
TC 0.061 0.059 0.057 0.054 0.055 0.072 0.060 0.066 0.094 0.041 0.026 0.063 0.063 0.060 0.062 0.064 0.064 0.056 0.059 0.058
TG 0.055 0.060 0.059 0.061 0.057 0.062 0.056 0.057 0.033 0.100 0.025 0.075 0.066 0.063 0.062 0.061 0.063 0.058 0.062 0.061
TT 0.050 0.051 0.046 0.051 0.052 0.052 0.051 0.055 0.061 0.020 0.031 0.076 0.070 0.062 0.062 0.051 0.045 0.045 0.042 0.041
#sanity check
colSums(my_refseq_tss_dinuc_pos)
-10/-9 -9/-8 -8/-7 -7/-6 -6/-5 -5/-4 -4/-3 -3/-2 -2/-1 -1/0 0/1 1/2 2/3 3/4 4/5 5/6 6/7
0.999 1.001 1.001 1.001 1.000 1.001 0.999 1.000 0.999 1.002 0.999 1.000 1.001 0.999 1.003 1.000 0.999
7/8 8/9 9/10
1.001 1.001 1.000
The position dinucleotide biases from RefSeqs are comparable to those calculated dinucleotide frequencies around TSSs from CAGE data.
Visualising
We can use the nucleotideFrequencyAt() function to calculate the nucleotide frequencies and to create a sequence logo:
#initialise data frame
my_refseq_tss_nuc_pos <- data.frame()
tmp <- nucleotideFrequencyAt(my_refseq_tss_seq,1)
tmp
A C G T
6193 10469 10563 6358
for (i in 1:21){
my_refseq_tss_nuc_pos[1:4,i] <- nucleotideFrequencyAt(my_refseq_tss_seq,i)
}
row.names(my_refseq_tss_nuc_pos) <- names(tmp)
colnames(my_refseq_tss_nuc_pos) <- c(1:21)
my_refseq_tss_nuc_pos
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
A 6193 6115 5977 6207 5731 6044 6612 5872 5440 4543 12094 6650 7295 7037 6901 6465 6204 6176 6340 6835
C 10469 10483 10443 10347 10286 9999 10769 10162 10867 14755 6621 8376 8749 8954 9851 10427 10173 10483 10438 9763
G 10563 10591 10976 10735 11280 10542 9941 10852 10345 6628 11744 10502 9821 10415 9859 10000 10679 10833 10599 10994
T 6358 6394 6187 6294 6286 6998 6261 6697 6931 7657 3124 8055 7718 7177 6972 6691 6527 6091 6206 5991
21
A 6291
C 10099
G 11106
T 6087
df <- t(my_refseq_tss_nuc_pos)
df
A C G T
1 6193 10469 10563 6358
2 6115 10483 10591 6394
3 5977 10443 10976 6187
4 6207 10347 10735 6294
5 5731 10286 11280 6286
6 6044 9999 10542 6998
7 6612 10769 9941 6261
8 5872 10162 10852 6697
9 5440 10867 10345 6931
10 4543 14755 6628 7657
11 12094 6621 11744 3124
12 6650 8376 10502 8055
13 7295 8749 9821 7718
14 7037 8954 10415 7177
15 6901 9851 9859 6972
16 6465 10427 10000 6691
17 6204 10173 10679 6527
18 6176 10483 10833 6091
19 6340 10438 10599 6206
20 6835 9763 10994 5991
21 6291 10099 11106 6087
#define function that divides the frequency by the row sum i.e. proportions
proportion <- function(x){
rs <- sum(x);
return(x / rs);
}
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("seqLogo")
library(seqLogo)
pwm <- apply(df, 1, proportion)
pwm <- makePWM(pwm)
seqLogo(pwm=pwm, ic.scale=F)
Each column is NOT proportional to its information content, but just the distribution of nucleotides.

This work is licensed under a Creative Commons
Attribution 4.0 International License.
