#!/usr/bin/perl #use strict; # # Program to scan a FASTA-formatted 454 sequence file for usable microsatellites # by Mohamed Noor, May 7, 2008 , modified August 7, 2008 # # Modified by Rich Kliman, April 13, 2009, for any size repeats. Also to # read desired microsat length from keyboard. # print "\n\nProgram to scan a FASTA-formatted 454 sequence file for microsatellites.\n\n\n"; print "Enter filename from which to scan for repeats: "; my $inp_file = <>; print "Enter filename to output: "; my $out_file = <>; print "Enter repeat length (2, 3, 4...): "; my $repeatLgth = <>; chomp $repeatLgth; print "Enter minimum number of repeat units: "; my $repeats = <>; chomp $repeats; print "Enter minimum number bases flanking sequence: "; my $flank = <>; chomp $flank; @base=('A','G','C','T'); # # routine to write @STR array of all possible n-mer microsatellites # @STR=(""); for(my $i=0; $i<$repeatLgth; $i++) { for(my $j=0; $j<4**($i+1); $j+=4) { for(my $k=0; $k<4; $k++) { $STRseq=$STR[$j] . $base[$k]; splice(@STR,$j+$k+1,0,$STRseq); } splice(@STR,$j,1,); } } # end routine to write array of all possible microsats # # routine to remove any shorter microsats from @STR # @divisors=(1); for(my $i=2; $i $out_file"); my $total_seq=0; my $microHits=0; my $keptHits=0; my $line_read = ; if ($line_read !~ /^>/) { print "File does not appear to be FASTA formatted.\n"; print "It does not start with a >\n"; } my $end_tester = 1; while ($end_tester==1) { # # Data entry phase # my $header = $line_read; chomp $header; my $hit = 0; my $sequence = ""; while ($hit==0) { $line_read = ; if (($line_read =~ /^>/) || (eof(SEQDATA))) { $hit = 1; } else { $sequence .= $line_read; chomp $sequence; $sequence =~ s/\s//g; # eliminate extra whitespace } } # end while ($hit==0) $total_seq++; # # Data entered: scan for microsats phase # $foundSTR=0; $keptSTR=0; for(my $i=0; $i<$#STR+1; $i++) { if($sequence =~ /\S{$flank,}($STR[$i]){$repeats,}\S{$flank,}/) { $foundSTR=1; $check=1; $nRep=$repeats; $query=""; for(my $j=0;$j<$nRep;$j++) {$query .= $STR[$i];} while($check) { if(index($sequence,$query,0)==-1) { $check=0; $nRep--; $query=substr($query,0,length($query)-$repeatLgth); } else { $nRep++; $query .= $STR[$i]; } } $up=index($sequence,$query,0); $down=length($sequence)-($up+length($query)); if($up>=$flank && $down>=$flank) { $keptSTR=1; $upSeq=substr($sequence,0,$up); $dnSeq=substr($sequence,length($sequence)-$down,$down); print SEQOUT "$header\n"; print SEQOUT "$sequence\n"; print SEQOUT "STR: $STR[$i]\n"; print SEQOUT "number of repeats: $nRep\n"; print SEQOUT "upstream sequence: $upSeq ($up bases)\n"; print SEQOUT "downstream sequence: $dnSeq ($down bases)\n"; print SEQOUT "**********\n"; } } if($total_seq/10==int($total_seq/10)) {print "finished sequence $total_seq\n";} } if (eof(SEQDATA)) {$end_tester=0;} if($foundSTR==1) {$microHits++;} if($keptSTR==1) {$keptHits++;} } # end while ($end_tester==1) print "\nTotal number of scanned sequences: $total_seq\n"; print "Total number of sequences with microsatellites: $microHits\n"; print "Total number of sequences with microsats and flanking sequence: $keptHits\n\n\n"; close SEQDATA; close SEQOUT; exit;