Split the genome into 6 bp windows and calculate the 6 mer frequencies.
#!/usr/bin/perl
use strict;
use warnings;
my $usage = "Usage $0 <infile.fa>\n";
my $infile = shift or die $usage;
my $seq = '';
open(IN,'<',$infile) || die "Could not open $infile: $!\n";
while(<IN>){
chomp;
next if (/^>/);
$seq .= $_;
}
close(IN);
my %six_mer = ();
my $length = length($seq);
foreach my $frame (0 .. 5){
my $frame_length = $length - $frame;
#print "Frame length: $frame_length\n";
my $mod = $frame_length % 6;
#print "Mod: $mod\n";
$frame_length -= $mod;
#print "Frame length: $frame_length\n";
for (my $i = $frame; $frame<$frame_length; $frame=$frame+6){
my $mer = substr($seq,$frame,6);
#print "$mer\n";
if (exists $six_mer{$mer}){
$six_mer{$mer}++;
} else {
$six_mer{$mer} = '1';
}
}
}
print "$infile\n";
foreach my $mer (sort {$six_mer{$b} <=> $six_mer{$a} } keys %six_mer){
print "$mer: $six_mer{$mer}\n";
}
print "\n";
exit(0);
Scanning chr6 of hg19:
NNNNNN: 3719950
aaaaaa: 373380
tttttt: 372667
TTTTTT: 184768
AAAAAA: 182652
aaaaat: 143055
attttt: 142671
ATTTTT: 133646
TTTAAA: 133284
AAAAAT: 133130
TATTTT: 130672
AAAATA: 129572
TTTTAA: 129570
TTAAAA: 129177
aaaata: 123528
tatttt: 123134
atatat: 119872
...

This work is licensed under a Creative Commons
Attribution 4.0 International License.