#!/usr/local/bin/python
# April 28, 2005, Dr. Brian Fristensky, University of Manitoba

# Description: Given a file of GI numbers, return sequences in
# GenBank or Fasta format. We use SHoundGetGenBankff or SHoundGetFasta,
# rather than the SHoundGetGenBankffList or SHoundGetFastaffList. These
# List methods are unreliable for very large sequences. If even
# one sequence can't be returned, nothing is returned. Instead
# we retrieve one sequence at a time, so that the impact of
# a failure is minimized.

# Synopsis: SHGetSeq.py gifile -f outputformat outfile

# options: infile    GDE flat file, with name on line 1, followed by
#                    comma separated list of tokens GI numbers
#          -f        Either  'fasta' for fasta or 'genbank' for GenBank
#        outfile     GenBank or FASTA file

import sys
import string
import os

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters :
      "Wrapper class for command line parameters"
      def __init__(self) :
          self.IFN = ""
	  self.FORMAT = "fasta"
	  self.METHOD = "SHoundGetFasta"
	  self.OFN = ""

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def READARGS(P) :
    "Read command line arguments into a Parameter object"
    NUMARGS = len(sys.argv)
    if NUMARGS >= 1 :
       P.IFN = sys.argv[1]

    I = 2
    while (I < NUMARGS)  :
	  if sys.argv[I] == "-f" :
             I = I + 1
             if I < NUMARGS :
	        TEMP = sys.argv[I]
		if (TEMP in ['fasta','genbank']) :
        	   P.FORMAT = TEMP
		I = I + 1
	  else :
             P.OFN = sys.argv[I]
             I = NUMARGS	             	
    return
	  
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class FILE :
      "Wrapper class for files"
     
      def __init__(self,FILENAME,MODE) :
          self.FN = FILENAME
	  self.F = open(FILENAME,MODE)
	  self.LINE = "" # most recent line read

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class IDLST :
      "Wrapper class for ID lists"
     
      def __init__(self) :
          self.NAME = ""
	  self.LST = []
	  	  
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read in old and new strings, striping 
# leading and trailing whitespace, including
# newline characters.
def GETGDELIST(INFILE,NAMEFLAG,GILST) :

    # Read name line
    while (INFILE.LINE != "" and GILST.NAME == "") :
          INFILE.LINE = INFILE.LINE.strip()
	  if len(INFILE.LINE) > 0 :
             if INFILE.LINE[0] == NAMEFLAG :
                GILST.NAME = INFILE.LINE[1:]
          INFILE.LINE = INFILE.F.readline()	  
    
    # Read GI list 
    GILST.LST = []   
    if GILST.NAME != "" :
       # GDE wraps the flat file with newlines every 60
       # characters. 
       # Next, we have to delete the newlines to turn the entire
       # file into a single long string called BIGLINE
       BIGLINE = ""  
       DONE =  0     
       while (INFILE.LINE != "" and DONE ==0) :
             TMPLINE = INFILE.LINE.strip()
	     if len(TMPLINE) > 0 :
	        if TMPLINE[0] == NAMEFLAG :
		   DONE = 1 
	        else:
		   BIGLINE = BIGLINE + TMPLINE
	           INFILE.LINE = INFILE.F.readline()

       # parse the string as a comma separated list
       GILST.LST = BIGLINE.split(',')

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Retrieve the sequences and write them to outfile
def GETSEQS(GILST,METHOD,OFN) :


    # Create a temporary filename
    TFN = 'SHGetSeq.' + str(os.getpid())
    
    LEN = len(GILST.LST)
    if  LEN > 0 :
       for i in range(0,LEN) :
           MPI = str(GILST.LST[i])
           COMMAND = 'leash -mn ' + METHOD + ' -mpi ' + MPI + ' -of ' + TFN           
           os.system(COMMAND)

           OKAY = False
           if os.path.exists(TFN) :
	      if os.path.getsize(TFN) > 0 :
	         OKAY = True
        	 # Append the contents of the temporary file to the output file
		 COMMAND = 'cat ' + TFN + '>> ' + OFN
		 os.system(COMMAND)
		 os.remove(TFN)	          
    return 
    
  
#======================== MAIN PROCEDURE ==========================
P = Parameters ()

READARGS(P)

#---------- Set global constants
NAMEFLAG = '"'  # 1st character on the name line, indicating
                # the beginning of the next data list
		
MAXSEQ = 1000000 # maximum sequence length that SeqHound can retrieve

# Default is SHoundGetFastaList
if (P.FORMAT == 'genbank') :
   P.METHOD = 'SHoundGetGenBankff'
   
INFILE = FILE(P.IFN,'r')

# ------------------------- MAIN LOOP -----------------------
# GDE flatfile may contain 0 or more lists, so we iterate
# for each list.
# Note that GETGDELIST takes care of reading in the next
# input line.

INFILE.LINE = INFILE.F.readline() # LINE contains the most recently-read line
while (INFILE.LINE != "") :

   # Read in GDE flat file
   GILST = IDLST()
   GETGDELIST(INFILE,NAMEFLAG,GILST)

   # Create a new list containing only those entries
   # whose length is less than MAXSEQ 
   #Retrieve the sequence and write it to outfile
   GETSEQS(GILST,P.METHOD,P.OFN)

INFILE.F.close()




