#!/bin/csh
# mrtrans.csh                  September  1,  2010
# This is a front end for mrtrans. It makes sure that the names of
# the sequences in PROTFILE and DNAFILE are the same, and re-orders
# the sequences in DNAFILE, if necessary, to be in the same order
# as in PROTFILE.
# This script assumes that sequence names in PROTFILE are IDENTICAL to
# the corresponding names in DNAFILE.

# Usage: mrtrans.csh PROTFILE DNAFILE
#   PROTFILE - protein sequences
#   DNAFILE  - the corresponding DNA sequences
# Both arguments are required. Without them readseq would be started
# with no input file, so stop early with a usage message instead.
if ($#argv < 2) then
   echo "Usage: mrtrans.csh PROTFILE DNAFILE"
   exit 1
endif

# Double quotes keep a path that contains blanks as a single word.
set PROTFILE = "$1"
set DNAFILE = "$2"

# JOBID - process id of this shell; used as the prefix of every
# temporary file the script creates.
set JOBID = $$

# RM_CMD - command used for removing files and directories.
# Take the first of /usr/bin/rm and /bin/rm that exists; if neither
# is present, fall back to whatever "rm" resolves to on the search path.
if (-e /usr/bin/rm) then
   set RM_CMD = /usr/bin/rm
else if (-e /bin/rm) then
   set RM_CMD = /bin/rm
else
   set RM_CMD = rm
endif

# ---------------- Reformat files for use by mrtrans binaries ----------------------------------
# Both input files are converted to Pearson format with readseq and
# written to $JOBID.prot and $JOBID.dna.
#
# Modified for compatibility with both GDE and bioLegato. bioLegato
# can't read pseudo-Genbank files created by readseq, so the final output
# of this script goes to a GDE flat file, which both can read.
#
# The sed step does two things to every line:
#   - replaces the ":CDS" or "_CDS" that is added by FEATURES with "_"
#   - lops off the first comma and everything after it, i.e. the extra
#     junk readseq adds to the name line
# For the DNA file, cut additionally keeps only the text before the
# first blank on each line.
#
# mrtrans has been modified to allow sequence names longer than
# 9 char. Install mrtrans from mrtrans.tar.
#
# The protein sequence has to be uppercase, which is required by
# mrtrans; that is what the -C option on the protein readseq call is for.
#
# The input paths are double-quoted so that a name containing blanks or
# wildcard characters is handed to readseq as one unmodified argument.
readseq -a -f=Pearson -C -pipe "$PROTFILE" | sed -e "s/[:_]CDS/_/" -e "s/,.*//" > $JOBID.prot
readseq -a -f=Pearson -pipe "$DNAFILE" | sed -e "s/[:_]CDS/_/" -e "s/,.*//" | cut -f1 -d" " > $JOBID.dna


# ----------------- Make sure names are consistent between DNA and Protein files -------------------
# Build one list of sequence names per file: take every line containing
# '>', keep columns 2-80 (dropping the first character), then keep only
# the text before the first blank.
grep '>' $JOBID.prot | cut -c2-80 | cut -d" " -f1 > $JOBID.pro.nam
grep '>' $JOBID.dna | cut -c2-80 | cut -d" " -f1 > $JOBID.dna.nam

echo Making sure all names in protein file have a counterpart in dna file...

# comm needs sorted input, so sort both name lists first.
foreach kind (pro dna)
   sort $JOBID.$kind.nam > $JOBID.$kind.nam.sorted
end

# comm -23 keeps only the lines unique to the first file: names that
# occur in the protein list but not in the DNA list.
comm -23 $JOBID.pro.nam.sorted $JOBID.dna.nam.sorted > $JOBID.missing

if (-z $JOBID.missing) then
  # All protein sequences have a DNA counterpart.
  # Re-order the DNA file into the same order as the protein file.
  # For each protein sequence name, look up the line number of the
  # identical name in $JOBID.dna.nam; readseq then pulls the sequence
  # with that index out of $JOBID.dna and appends it to
  # $JOBID.dna.reordered.
  echo Re-ordering the DNA file to correspond to order of sequences
  echo in protein file.
  foreach name (`cat $JOBID.pro.nam`)
     # grep -n prefixes each hit with "linenumber:".
     # -F treats the name as a literal string rather than a regular
     # expression and -x demands that it match the whole line, so a name
     # can never match a different, longer name that merely contains it.
     # head keeps the first hit only, so a name that is duplicated in the
     # DNA file still yields exactly one index.
     set LINENUM = `grep -n -x -F -e "$name" $JOBID.dna.nam | head -n 1 | cut -f1 -d:`
     readseq -pipe -C -i$LINENUM -fPearson $JOBID.dna >> $JOBID.dna.reordered
  end
else
  echo 'The following sequences are present in the protein file'
  echo 'but missing from the DNA file:'
  cat $JOBID.missing
  # $JOBID.dna.reordered was never created, so there is nothing for
  # mrtrans to work on. Remove the temporary files and stop with an
  # error status instead of carrying on.
  $RM_CMD -f $JOBID.*
  exit 1
endif

# ------------------------ Run mrtrans binary and cleanup output so bioLegato can read it
# Pipeline stages, in order:
#   1. mrtrans works on the reformatted protein file and the re-ordered
#      DNA file.
#   2. sed rewrites the first "> from >" on a line to a single ">".
#   3. tr deletes the ASCII 128 bytes added by mrtrans. The byte is
#      written as the octal escape \200 so that the script itself stays
#      plain ASCII and the character cannot be lost when the file is
#      edited or copied.
#   4. readseq re-reads all sequences (-a) and writes them to stdout in
#      its format number 8.
#   5. sed replaces the first ">>" on a line with "#" (presumably the
#      sequence marker expected in a GDE flat file - confirm against the
#      GDE format description).
#   6. sed removes " from" and everything after it, i.e. the
#      " from >name..." text on the name line.
# The result is written to $JOBID.flat.
mrtrans $JOBID.prot $JOBID.dna.reordered \
  | sed "s/> from >/>/" \
  | tr -d '\200' \
  | readseq -pipe -a -f8 \
  | sed "s/>>/#/" \
  | sed 's/ from.*//' > $JOBID.flat


# Open the result in bioLegato. bldna and the cleanup that follows it
# run together in one backgrounded subshell, so this script does not
# wait for bldna to finish.
( bldna $JOBID.flat ; $RM_CMD $JOBID.* ) &


# No separate cleanup step is needed here: the subshell above removes
# every $JOBID.* temporary file once bldna has exited.
