After mapping your reads from an RNA-Seq experiment, usually the next task is identify the transcripts that the reads came from (i.e. annotating RNA-Seq data) and there are many ways of doing so. Here I just describe a rather crude method whereby I download sequence coordinates of hg19 RefSeqs as a BED12 file from the UCSC Genome Browser Table Browser and extrapolate the 5' UTR, exons, introns, 3' UTR and intergenic regions according to this file. I then output this as a bed file for use with intersectBed. A better (and much less error prone) method would be just to download each individual region of a RefSeq transcript from the Table Browser.
The process would have been quite straight forward, however since transcripts are located on the positive and negative strands and are spliced (even 5' and 3' UTRs), the script I wrote to create the annotation bed file is quite long and may contain unwanted features i.e. bugs. To use the script first download the refGene BED12 file from the UCSC Genome Browser Table Browser. Then sort the file by chromosome and then by the starting coordinate i.e. cat refgene.bed12 | sort -k1,1 -k2,2n > refgene_sorted.bed12. NOTE: this script will only work for the RefSeqs mapped to hg19, since I have hardcoded the chromosome ends within the script.
Without further ado:
#!/usr/bin/perl
use strict;
use warnings;
my $usage = "Usage: $0 <infile.bed>\n";
my $infile = shift or die $usage;
my %chr_end = ();
#DATA used for calculating final intergenic region
while(<DATA>){
chomp;
my ($chr,$end) = split(/\s+/);
$chr_end{$chr} = $end;
}
close(DATA);
#used for calculating intergenic regions
my $end_of_transcript = '-1';
my $previous_chr = 'chr1';
open(IN,'<',$infile) || die "Could not open $infile: $!\n";
while(<IN>){
chomp;
=head1 sample bed file
chr1 11873 14409 NR_046018 0 + 14409 14409 0 3 354,109,1189, 0,739,1347,
chr1 14361 29370 NR_024540 0 - 29370 29370 0 11 468,69,152,159,198,136,137,147,99,154,50, 0,608,1434,2245,2496,2871,3244,3553,3906,10376,14959,
chr1 34610 36081 NR_026818 0 - 36081 36081 0 3 564,205,361, 0,666,1110,
=cut
my($chr,$start,$end,$id,$score,$strand,$thick_start,$thick_end,$rgb,$block_count,$block_size,$block_start) = split(/\t/);
#just in case
$chr= lc $chr;
#work only with the assembled chromosomes
next unless exists $chr_end{$chr};
my @block_size = split(/,/,$block_size);
my @block_start = split(/,/,$block_start);
#declare UTR variables
my $five_prime_utr_start = '0';
my $five_prime_utr_end = '0';
my $three_prime_utr_start = '0';
my $three_prime_utr_end = '0';
#utr information is present when the thick_start is not equal to the start and end
if ($end != $thick_start && $thick_start != $start ){
#chr1 861120 879961 NM_152486 0 + 861321 879533
my $utr_start = $start;
my $utr_end = $thick_start + 1;
if ($strand eq '+'){
$five_prime_utr_start = $utr_start;
$five_prime_utr_end = $utr_end;
} else {
$three_prime_utr_start =$utr_end;
$three_prime_utr_end =$utr_start;
}
}
if ($end != $thick_end){
my $utr_start = $thick_end;
my $utr_end = $end + 1;
if ($strand eq '+'){
$three_prime_utr_start = $utr_start;
$three_prime_utr_end = $utr_end;
} else {
$five_prime_utr_start =$utr_end;
$five_prime_utr_end =$utr_start;
}
}
#declare arrays for introns and exons
my @exon_start = ();
my @exon_end = ();
my @intron_start = ();
my @intron_end = ();
#go through each block from right to left
for (my $block=$block_count-1; $block>=0; --$block){
#exons
#0 based coordinates
my $exon_start = $start + $block_start[$block] + 1;
my $exon_end = $start + $block_start[$block] + $block_size[$block];
#print "Exon: $block\t$exon_start\t$exon_end\n";
if ($strand eq '+'){
if ($three_prime_utr_start != 0){
if ($exon_start > $three_prime_utr_start){
#print "3' UTR is spliced:\t$exon_start\t$exon_end\n";
print join("\t",$chr,$exon_start,$exon_end,"3_UTR_$id",'0',$strand),"\n";
} else {
#print "3' UTR:\t$three_prime_utr_start\t$exon_end\n";
print join("\t",$chr,$three_prime_utr_start,$exon_end,"3_UTR_$id",'0',$strand),"\n";
$three_prime_utr_start = '0';
}
}
} else {
if ($five_prime_utr_end != 0){
if ($exon_start > $five_prime_utr_end){
#print "5' UTR is spliced:\t$exon_start\t$exon_end\n";
print join("\t",$chr,$exon_start,$exon_end,"5_UTR_$id",'0',$strand),"\n";
} else {
#print "5' UTR:\t$five_prime_utr_end\t$exon_end\n";
print join("\t",$chr,$five_prime_utr_end,$exon_end,"5_UTR_$id",'0',$strand),"\n";
$five_prime_utr_end = '0';
}
}
}
}
my $previous_exon_end = '';
#go through each block from left to right
for (1 .. $block_count){
#array indexing starts with 0
my $block = $_ - 1;
#exons
#0 based coordinates
my $exon_start = $start + $block_start[$block] + 1;
my $exon_end = $start + $block_start[$block] + $block_size[$block];
#print "$id\t$exon_start\t$exon_end\n";
print join("\t",$chr,$exon_start,$exon_end,"EXON_$id",'0',$strand),"\n";
$exon_start[$block] = $exon_start;
$exon_end[$block] = $exon_end;
#print "Exon $block:\t$exon_start\t$exon_end\n";
if ($strand eq '+'){
if ($five_prime_utr_end != 0){
if ($exon_end < $five_prime_utr_end){
#print "5' UTR is spliced:\t$exon_start\t$exon_end\n";
print join("\t",$chr,$exon_start,$exon_end,"5_UTR_$id",'0',$strand),"\n";
} else {
#print "5' UTR:\t$exon_start\t$five_prime_utr_end\n";
print join("\t",$chr,$exon_start,$five_prime_utr_end,"5_UTR_$id",'0',$strand),"\n";
$five_prime_utr_end = '0';
}
}
} else {
if ($three_prime_utr_start != 0){
if ($exon_end < $three_prime_utr_start){
#print "3' UTR is spliced:\t$exon_start\t$exon_end\n";
print join("\t",$chr,$exon_start,$exon_end,"3_UTR_$id",'0',$strand),"\n";
} else {
#print "3' UTR:\t$exon_start\t$three_prime_utr_start\n";
print join("\t",$chr,$exon_start,$three_prime_utr_start,"3_UTR_$id",'0',$strand),"\n";
$three_prime_utr_start = '0';
}
}
}
#introns
if ($block == 0){
$previous_exon_end = $exon_end;
} else {
my $intron_start = $previous_exon_end + 1;
my $intron_end = $exon_start - 1;
#print "Intron $block:\t$intron_start\t$intron_end\n";
print join("\t",$chr,$intron_start,$intron_end,"INTRON_$id",'0',$strand),"\n";
$intron_start[$block] = $intron_start;
$intron_end[$block] = $intron_end;
$previous_exon_end = $exon_end;
}
}
#first and last exons
#uncomment out if you want to store first and last exons
if ($block_count > 1){
if ($strand eq '+'){
my $first_exon_start = $exon_start[0];
my $first_exon_end = $exon_end[0];
my $last_exon_start = $exon_start[-1];
my $last_exon_end = $exon_end[-1];
#print "First exon:\t$first_exon_start\t$first_exon_end\n";
#print "Last exon:\t$last_exon_start\t$last_exon_end\n";
} elsif ($strand eq '-'){
my $first_exon_start = $exon_start[-1];
my $first_exon_end = $exon_end[-1];
my $last_exon_start = $exon_start[0];
my $last_exon_end = $exon_end[0];
#print "First exon:\t$first_exon_start\t$first_exon_end\n";
#print "Last exon:\t$last_exon_start\t$last_exon_end\n";
} else {
die "Incorrect strand information: $strand\n";
}
}
#intergenic
#the file is sorted, so this is the last transcript on the chromosome
if ($chr ne $previous_chr){
my $intergenic_start = $end_of_transcript + 1;
my $intergenic_end = $chr_end{$previous_chr} - 1;
$end_of_transcript = '-1';
#print "$intergenic_start\t$intergenic_end\n";
print join("\t",$chr,$intergenic_start,$intergenic_end,'INTERGENIC','0','0'),"\n";
$previous_chr = $chr;
}
my $intergenic_start = $end_of_transcript + 1;
my $intergenic_end = $start - 1;
#print "$intergenic_start\t$intergenic_end\n";
#this is to avoid overlapping transcript
if ($intergenic_start >= $intergenic_end){
#overlapping transcript
} else {
print join("\t",$chr,$intergenic_start,$intergenic_end,'INTERGENIC','0','0'),"\n";
}
$end_of_transcript = $end;
#print "---\n\n";
}
close(IN);
exit(0);
#hg19 chromosome ends used for calculating last intergenic region
__DATA__
chr1 249250621
chr2 243199373
chr3 198022430
chr4 191154276
chr5 180915260
chr6 171115067
chr7 159138663
chrX 155270560
chr8 146364022
chr9 141213431
chr10 135534747
chr11 135006516
chr12 133851895
chr13 115169878
chr14 107349540
chr15 102531392
chr16 90354753
chr17 81195210
chr18 78077248
chr20 63025520
chrY 59373566
chr19 59128983
chr22 51304566
chr21 48129895
You should end up with a bed file as such:
chr1 1264277 1266724 INTERGENIC 0 0
chr1 1284445 1284492 5_UTR_NM_004421 0 -
chr1 1270658 1271895 EXON_NM_004421 0 -
chr1 1270658 1271522 3_UTR_NM_004421 0 -
chr1 1273357 1273563 EXON_NM_004421 0 -
chr1 1271896 1273356 INTRON_NM_004421 0 -
chr1 1273649 1273816 EXON_NM_004421 0 -
chr1 1273564 1273648 INTRON_NM_004421 0 -

This work is licensed under a Creative Commons
Attribution 4.0 International License.