Split the genome into 6 bp windows and calculate the 6 mer frequencies.
#!/usr/bin/perl use strict; use warnings; my $usage = "Usage $0 <infile.fa>\n"; my $infile = shift or die $usage; my $seq = ''; open(IN,'<',$infile) || die "Could not open $infile: $!\n"; while(<IN>){ chomp; next if (/^>/); $seq .= $_; } close(IN); my %six_mer = (); my $length = length($seq); foreach my $frame (0 .. 5){ my $frame_length = $length - $frame; #print "Frame length: $frame_length\n"; my $mod = $frame_length % 6; #print "Mod: $mod\n"; $frame_length -= $mod; #print "Frame length: $frame_length\n"; for (my $i = $frame; $frame<$frame_length; $frame=$frame+6){ my $mer = substr($seq,$frame,6); #print "$mer\n"; if (exists $six_mer{$mer}){ $six_mer{$mer}++; } else { $six_mer{$mer} = '1'; } } } print "$infile\n"; foreach my $mer (sort {$six_mer{$b} <=> $six_mer{$a} } keys %six_mer){ print "$mer: $six_mer{$mer}\n"; } print "\n"; exit(0);
Scanning chr6 of hg19:
NNNNNN: 3719950
aaaaaa: 373380
tttttt: 372667
TTTTTT: 184768
AAAAAA: 182652
aaaaat: 143055
attttt: 142671
ATTTTT: 133646
TTTAAA: 133284
AAAAAT: 133130
TATTTT: 130672
AAAATA: 129572
TTTTAA: 129570
TTAAAA: 129177
aaaata: 123528
tatttt: 123134
atatat: 119872
...

This work is licensed under a Creative Commons
Attribution 4.0 International License.