#!/usr/bin/perl
use Bio::Seq;
use Bio::SeqIO;
use Getopt::Long;
use FindBin qw($Bin);
use strict;
use warnings;
if(@ARGV < 1) { die "fasta-file codon_table codon_file\n"; }
&main;
####################################################################################################
#main program
sub main {
my($seqio);
my($line) = "";
my($seq) = "";
my($revcomp) = "";
my($head) = "null";
my($len) = "";
my($frame_seq) = "";
my($codonfile) = "";
my($codon) = "";
my($codontab) = 11;
my(%frame) = ();
my(%result) = ();
my(@temptable) = ();
my(%codontable) = ();
my(%results) = ();
my $L= 50;
GetOptions('L=i' => \$L);
####################################################################################################
#read the given codon table from file
if(defined($ARGV[1])) {
$codontab = $ARGV[1];
}
if(defined($ARGV[2])) {
$codonfile = $ARGV[2];
} else {
$codonfile = "$Bin/codon_tables_ncbi.txt";
}
if(-e $codonfile) {
open(TABLE, $codonfile) or die "$codonfile not opened\n";
while($line =
) {
if($line =~ m/\#$codontab\./) {
for(my($i) = 0; $i < 7; ++$i) {
$line = ;
$line =~ s/^.*=\s*//g;
$line =~ s/\s+$//g;
if(length($line) > 0) {
#print "$line\n";
push(@temptable, $line);
}
}
}
}
close TABLE;
if(scalar(@temptable) > 0) {
for(my($i) = 0; $i < length($temptable[0]); ++$i) {
$codon = substr($temptable[2], $i, 1)."".substr($temptable[3], $i, 1)."".substr($temptable[4], $i, 1);
$codontable{$codon}{'aa'} = substr($temptable[0], $i, 1);
$codontable{$codon}{'start'} = substr($temptable[1], $i, 1);
#print "$codon\t".substr($temptable[1], $i, 1)."\t".substr($temptable[0], $i, 1)."\n";
}
}
} else {
die "$codonfile not found\n";
}
####################################################################################################
#read sequences
if(-e $ARGV[0]) {
$seqio = Bio::SeqIO->new( '-format' => 'fasta' , -file => $ARGV[0]);
} else {
die "$ARGV[0] not found\n";
}
while ( my $seqobj = $seqio->next_seq ) {
$len = $seqobj->length();
$head = $seqobj->id();
$seq = uc($seqobj->seq());
$revcomp = reverse($seq);
$revcomp =~ tr/ACGT/TGCA/;
%results = ();
for(my($i) = 0; $i < 3; ++$i) {
$frame_seq = substr($seq, $i);
&translate($frame_seq, \%codontable, \%result, $head, ($i+1), \%results, $L);
$frame_seq = substr($revcomp, $i);
&translate($frame_seq, \%codontable, \%result, $head, (($i+1)*-1), \%results, $L);
}
for my $key (sort { $a <=> $b; } (keys(%results))) {
print "$results{$key}{'header'}";
print "$results{$key}{'seq'}";
}
}
####################################################################################################
};
####################################################################################################
#finds always the longest AA-sequence
sub translate {
my($seq, $conv_table, $result, $name, $frame, $res, $L) = @_;
my($piece) = "";
my($aa_seq) = "";
my($last) = "";
my($orf) = "";
my($pos) = 0;
my($end_char) = "";
my($start_end_string) = "";
my($len) = length($seq);
##################################################################################################
#convert to aa (for Emmanuel, start codons must be stored into another hash as otherwise
#all start are assumed to encode M, that is not true
$aa_seq = "X";
$start_end_string = "M";
for(my($i) = 0; $i < length($seq); $i = $i + 3) {
$piece = substr($seq, $i, 3);
if(length($piece) == 3 ) {
if(exists(${$conv_table}{$piece})) {
$aa_seq .= ${$conv_table}{$piece}{'aa'};
if((${$conv_table}{$piece}{'start'} eq 'M')) {
$start_end_string .= "M";
} elsif(${$conv_table}{$piece}{'aa'} eq '*') {
$start_end_string .= "*";
} else {
$start_end_string .= "-";
}
} else {
$aa_seq .= "X";
$start_end_string .= "-";
}
}
}
$aa_seq .= "*";
$start_end_string .= "*";
###################################################################################################
#get all orf longer than $L aa (gives the longest possible orf-always!!!)
#print "$aa_seq\n\n";
#print "$start_end_string\n\n";
my($temp_aa) = "";
my($start_map) = "";
my($start_coord) = "";
my($alt_start) = "";
my($sub_seq) = 0;
my($namecounter) = 0;
while($start_end_string =~ m/M.*?\*/g) {
if(length($&) >= $L) {
$start_map = $&;
if($frame <= 0) {
$start_coord = ($len+abs($frame)) - ((pos($start_end_string)*3)+abs($frame)) + 4 #extra codon;
} else {
$start_coord = (pos($start_end_string)*3)-(length($&)*3)+abs($frame) - 3;
}
$temp_aa = substr($aa_seq, pos($start_end_string)-length($&), length($&));
$temp_aa =~ s/^./M/;
$temp_aa =~ s/.{60}/$&\n/g;
$temp_aa =~ s/^\s+|\s+$//g;
if(!(exists(${$res}{"$start_coord.0"}))) {
${$res}{"$start_coord.0"}{'header'} = ">$name\_orf_frame:$frame\_pos:$start_coord\_len:".length($start_map)." X ".length($aa_seq)."\n";
${$res}{"$start_coord.0"}{'seq'} = "$temp_aa\n";
#print "NON: $name\_orf_frame:$frame\_pos:$start_coord\_len:".length($start_map)."\n";
} else {
$namecounter = 0;
while((exists(${$res}{$start_coord.".".$namecounter}))) {
++$namecounter;
}
${$res}{$start_coord.".".$namecounter}{'header'} = ">$name\_orf_frame:$frame\_pos:$start_coord\_len:".length($start_map)."\n";
${$res}{$start_coord.".".$namecounter}{'seq'} = "$temp_aa\n";
#print STDERR "DUBLICATED: $namecounter $name\_orf_frame:$frame\_pos:$start_coord\_len:".length($start_map)."\n";
}
}
}
};
#####################################################################################################
#