After mapping your reads from an RNA-Seq experiment, usually the next task is identify the transcripts that the reads came from (i.e. annotating RNA-Seq data) and there are many ways of doing so. Here I just describe a rather crude method whereby I download sequence coordinates of hg19 RefSeqs as a BED12 file from the UCSC Genome Browser Table Browser and extrapolate the 5′ UTR, exons, introns, 3′ UTR and intergenic regions according to this file. I then output this as a bed file for use with intersectBed. A better (and much less error prone) method would be just to download each individual region of a RefSeq transcript from the Table Browser.
The process would have been quite straight forward, however since transcripts are located on the positive and negative strands and are spliced (even 5′ and 3′ UTRs), the script I wrote to create the annotation bed file is quite long and may contain unwanted features i.e. bugs. To use the script first download the refGene BED12 file from the UCSC Genome Browser Table Browser. Then sort the file by chromosome and then by the starting coordinate i.e. cat refgene.bed12 | sort -k1,1 -k2,2n > refgene_sorted.bed12. NOTE: this script will only work for the RefSeqs mapped to hg19, since I have hardcoded the chromosome ends within the script.
Without further ado:
#!/usr/bin/perl use strict; use warnings; my $usage = "Usage: $0 <infile.bed>\n"; my $infile = shift or die $usage; my %chr_end = (); #DATA used for calculating final intergenic region while(<DATA>){ chomp; my ($chr,$end) = split(/\s+/); $chr_end{$chr} = $end; } close(DATA); #used for calculating intergenic regions my $end_of_transcript = '-1'; my $previous_chr = 'chr1'; open(IN,'<',$infile) || die "Could not open $infile: $!\n"; while(<IN>){ chomp; =head1 sample bed file chr1 11873 14409 NR_046018 0 + 14409 14409 0 3 354,109,1189, 0,739,1347, chr1 14361 29370 NR_024540 0 - 29370 29370 0 11 468,69,152,159,198,136,137,147,99,154,50, 0,608,1434,2245,2496,2871,3244,3553,3906,10376,14959, chr1 34610 36081 NR_026818 0 - 36081 36081 0 3 564,205,361, 0,666,1110, =cut my($chr,$start,$end,$id,$score,$strand,$thick_start,$thick_end,$rgb,$block_count,$block_size,$block_start) = split(/\t/); #just in case $chr= lc $chr; #work only with the assembled chromosomes next unless exists $chr_end{$chr}; my @block_size = split(/,/,$block_size); my @block_start = split(/,/,$block_start); #declare UTR variables my $five_prime_utr_start = '0'; my $five_prime_utr_end = '0'; my $three_prime_utr_start = '0'; my $three_prime_utr_end = '0'; #utr information is present when the thick_start is not equal to the start and end if ($end != $thick_start && $thick_start != $start ){ #chr1 861120 879961 NM_152486 0 + 861321 879533 my $utr_start = $start; my $utr_end = $thick_start + 1; if ($strand eq '+'){ $five_prime_utr_start = $utr_start; $five_prime_utr_end = $utr_end; } else { $three_prime_utr_start =$utr_end; $three_prime_utr_end =$utr_start; } } if ($end != $thick_end){ my $utr_start = $thick_end; my $utr_end = $end + 1; if ($strand eq '+'){ $three_prime_utr_start = $utr_start; $three_prime_utr_end = $utr_end; } else { $five_prime_utr_start =$utr_end; $five_prime_utr_end =$utr_start; } } #declare arrays for introns and exons my @exon_start = (); my @exon_end = (); my @intron_start = (); my @intron_end = (); #go through each block from right to left for (my $block=$block_count-1; $block>=0; --$block){ #exons #0 based coordinates my $exon_start = $start + $block_start[$block] + 1; my $exon_end = $start + $block_start[$block] + $block_size[$block]; #print "Exon: $block\t$exon_start\t$exon_end\n"; if ($strand eq '+'){ if ($three_prime_utr_start != 0){ if ($exon_start > $three_prime_utr_start){ #print "3' UTR is spliced:\t$exon_start\t$exon_end\n"; print join("\t",$chr,$exon_start,$exon_end,"3_UTR_$id",'0',$strand),"\n"; } else { #print "3' UTR:\t$three_prime_utr_start\t$exon_end\n"; print join("\t",$chr,$three_prime_utr_start,$exon_end,"3_UTR_$id",'0',$strand),"\n"; $three_prime_utr_start = '0'; } } } else { if ($five_prime_utr_end != 0){ if ($exon_start > $five_prime_utr_end){ #print "5' UTR is spliced:\t$exon_start\t$exon_end\n"; print join("\t",$chr,$exon_start,$exon_end,"5_UTR_$id",'0',$strand),"\n"; } else { #print "5' UTR:\t$five_prime_utr_end\t$exon_end\n"; print join("\t",$chr,$five_prime_utr_end,$exon_end,"5_UTR_$id",'0',$strand),"\n"; $five_prime_utr_end = '0'; } } } } my $previous_exon_end = ''; #go through each block from left to right for (1 .. $block_count){ #array indexing starts with 0 my $block = $_ - 1; #exons #0 based coordinates my $exon_start = $start + $block_start[$block] + 1; my $exon_end = $start + $block_start[$block] + $block_size[$block]; #print "$id\t$exon_start\t$exon_end\n"; print join("\t",$chr,$exon_start,$exon_end,"EXON_$id",'0',$strand),"\n"; $exon_start[$block] = $exon_start; $exon_end[$block] = $exon_end; #print "Exon $block:\t$exon_start\t$exon_end\n"; if ($strand eq '+'){ if ($five_prime_utr_end != 0){ if ($exon_end < $five_prime_utr_end){ #print "5' UTR is spliced:\t$exon_start\t$exon_end\n"; print join("\t",$chr,$exon_start,$exon_end,"5_UTR_$id",'0',$strand),"\n"; } else { #print "5' UTR:\t$exon_start\t$five_prime_utr_end\n"; print join("\t",$chr,$exon_start,$five_prime_utr_end,"5_UTR_$id",'0',$strand),"\n"; $five_prime_utr_end = '0'; } } } else { if ($three_prime_utr_start != 0){ if ($exon_end < $three_prime_utr_start){ #print "3' UTR is spliced:\t$exon_start\t$exon_end\n"; print join("\t",$chr,$exon_start,$exon_end,"3_UTR_$id",'0',$strand),"\n"; } else { #print "3' UTR:\t$exon_start\t$three_prime_utr_start\n"; print join("\t",$chr,$exon_start,$three_prime_utr_start,"3_UTR_$id",'0',$strand),"\n"; $three_prime_utr_start = '0'; } } } #introns if ($block == 0){ $previous_exon_end = $exon_end; } else { my $intron_start = $previous_exon_end + 1; my $intron_end = $exon_start - 1; #print "Intron $block:\t$intron_start\t$intron_end\n"; print join("\t",$chr,$intron_start,$intron_end,"INTRON_$id",'0',$strand),"\n"; $intron_start[$block] = $intron_start; $intron_end[$block] = $intron_end; $previous_exon_end = $exon_end; } } #first and last exons #uncomment out if you want to store first and last exons if ($block_count > 1){ if ($strand eq '+'){ my $first_exon_start = $exon_start[0]; my $first_exon_end = $exon_end[0]; my $last_exon_start = $exon_start[-1]; my $last_exon_end = $exon_end[-1]; #print "First exon:\t$first_exon_start\t$first_exon_end\n"; #print "Last exon:\t$last_exon_start\t$last_exon_end\n"; } elsif ($strand eq '-'){ my $first_exon_start = $exon_start[-1]; my $first_exon_end = $exon_end[-1]; my $last_exon_start = $exon_start[0]; my $last_exon_end = $exon_end[0]; #print "First exon:\t$first_exon_start\t$first_exon_end\n"; #print "Last exon:\t$last_exon_start\t$last_exon_end\n"; } else { die "Incorrect strand information: $strand\n"; } } #intergenic #the file is sorted, so this is the last transcript on the chromosome if ($chr ne $previous_chr){ my $intergenic_start = $end_of_transcript + 1; my $intergenic_end = $chr_end{$previous_chr} - 1; $end_of_transcript = '-1'; #print "$intergenic_start\t$intergenic_end\n"; print join("\t",$chr,$intergenic_start,$intergenic_end,'INTERGENIC','0','0'),"\n"; $previous_chr = $chr; } my $intergenic_start = $end_of_transcript + 1; my $intergenic_end = $start - 1; #print "$intergenic_start\t$intergenic_end\n"; #this is to avoid overlapping transcript if ($intergenic_start >= $intergenic_end){ #overlapping transcript } else { print join("\t",$chr,$intergenic_start,$intergenic_end,'INTERGENIC','0','0'),"\n"; } $end_of_transcript = $end; #print "---\n\n"; } close(IN); exit(0); #hg19 chromosome ends used for calculating last intergenic region __DATA__ chr1 249250621 chr2 243199373 chr3 198022430 chr4 191154276 chr5 180915260 chr6 171115067 chr7 159138663 chrX 155270560 chr8 146364022 chr9 141213431 chr10 135534747 chr11 135006516 chr12 133851895 chr13 115169878 chr14 107349540 chr15 102531392 chr16 90354753 chr17 81195210 chr18 78077248 chr20 63025520 chrY 59373566 chr19 59128983 chr22 51304566 chr21 48129895
You should end up with a bed file as such:
chr1 1264277 1266724 INTERGENIC 0 0
chr1 1284445 1284492 5_UTR_NM_004421 0 –
chr1 1270658 1271895 EXON_NM_004421 0 –
chr1 1270658 1271522 3_UTR_NM_004421 0 –
chr1 1273357 1273563 EXON_NM_004421 0 –
chr1 1271896 1273356 INTRON_NM_004421 0 –
chr1 1273649 1273816 EXON_NM_004421 0 –
chr1 1273564 1273648 INTRON_NM_004421 0 –

This work is licensed under a Creative Commons
Attribution 4.0 International License.