#!/usr/local/bin/python
# February 22, 2006, Dr. Brian Fristensky, University of Manitoba

# Description: Given a file of ID numbers, return IDs or sequences in
# GenBank or Fasta format.  The
# List methods are unreliable for very large lists. If even
# one sequence can't be returned, nothing is returned. Instead
# we retrieve one sequence at a time, so that the impact of
# a failure is minimized.

# Synopsis: SHGet.py infile -mn method -e extension outfile

# options: infile    GDE flat file, with name on line 1, followed by
#                    comma separated list of tokens or GI numbers
#          -e        extension to use for names eg. gi, taxid
#          -mn       SeqHound Method (eg. SHoundGetFasta, SHoundSequenceLength)
#          -h        print a usage message
#          outfile   GenBank or FASTA file, list file

import sys
import string
import os

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters :
    """Wrapper class for command line parameters.

    Attributes:
        HELP            'y' once a usage message has been requested
                        (READARGS sets it), otherwise 'n'.
        IFN             input file name.
        METHOD          SeqHound method name given with -mn.
        EXT             name extension given with -e.
        OFN             output file name.
        SEQMETHODS      methods for which the main procedure calls GETSEQS
                        with the ' -mpil ' option.
        SEQLISTMETHODS  methods for which the main procedure calls GETSEQS
                        with the ' -mpi ' option.
    """

    def __init__(self) :
        # Everything starts out empty; READARGS fills in the values
        # found on the command line.
        self.HELP = 'n'
        self.IFN = ""
        self.METHOD = ""
        self.EXT = ""
        self.OFN = ""
        self.SEQMETHODS = ['SHoundGetFasta','SHoundGetGenBankff','SHoundGetXMLSeqEntry']
        self.SEQLISTMETHODS = ['SHoundGetFastaList','SHoundGetGenBankffList']

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def READARGS(P) :
    """Read command line arguments into a Parameters object.

    Expected command line:
        infile [-mn method] [-e extension] [-h] [outfile]

    P is modified in place:
        P.IFN     first argument (unless it is -h)
        P.METHOD  value following -mn
        P.EXT     value following -e
        P.HELP    set to 'y' if -h is seen, or if no arguments were given
        P.OFN     first argument that is not a recognised option; parsing
                  stops there, so outfile must come last.

    Options may appear in any order. An option that is the last argument
    and therefore has no value is ignored rather than raising IndexError.
    """
    ARGS = sys.argv[1:]

    # No arguments at all, or -h in first position: only help is wanted.
    if len(ARGS) == 0 or ARGS[0] == "-h" :
        P.HELP = 'y'
        return

    P.IFN = ARGS[0]
    NUMARGS = len(ARGS)
    I = 1
    while I < NUMARGS :
        ARG = ARGS[I]
        if ARG == "-mn" or ARG == "-e" :
            # These options take a value; consume it only if it is there.
            if I + 1 < NUMARGS :
                if ARG == "-mn" :
                    P.METHOD = ARGS[I + 1]
                else :
                    P.EXT = ARGS[I + 1]
            I = I + 2
        elif ARG == "-h" :
            P.HELP = 'y'
            I = I + 1
        else :
            # First non-option argument is the output file; stop parsing.
            P.OFN = ARG
            I = NUMARGS
    return
	  
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class FILE :
    """Wrapper class for files.

    Opens FILENAME in the given MODE on construction. The file is not
    closed by this class; the caller closes it through self.F.

    Attributes:
        FN    the file name that was opened.
        F     the open file object.
        LINE  most recent line read (maintained by the caller).
    """

    def __init__(self,FILENAME,MODE) :
        self.FN = FILENAME
        self.F = open(FILENAME,MODE)
        self.LINE = "" # most recent line read

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class IDLST :
    """Wrapper class for ID lists.

    Attributes:
        NAME  name of the list (filled in by GETGDELIST from the name line).
        STR   the list as a single string (filled in by GETGDELIST).
        LST   list form of the IDs; starts out empty.
    """

    def __init__(self) :
        self.NAME = ""
        self.STR = ""
        self.LST = []

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -	
# Print usage message
def PRINTHELP() :
    """Print the command line usage message to standard output."""
    # Continuation lines start with a tab; a second tab marks a
    # sub-item. The final empty string prints a trailing blank line.
    USAGE = (
        'Usage:',
        'python SHGet.py infile -mn SeqHoundMethod -e extension outfile',
        '\tinfile: GDE flatfile of identifiers eg. gi, taxid',
        '\tSeqHoundMethod: any method, as defined in the SeqHound API',
        '\t\t(see http://seqhound.blueprint.org)',
        '\textension: string to use as a name extension (eg. gi,len)',
        '\t\tThe extension helps identify the type of output',
        '\toutfile: output file of ids or sequences',
        '',
    )
    for TEXT in USAGE :
        print(TEXT)
    	  	  
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
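# Read the next named list from a GDE flat file. The name line
# (the one starting with NAMEFLAG) gives the list name; the lines
# that follow are joined into one string after stripping leading
# and trailing whitespace, including newline characters.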
def GETGDELIST(INFILE,NAMEFLAG,GILST) :
    """Read the next named list from a GDE flat file into GILST.

    INFILE    object with .F (open file) and .LINE (the most recently
              read line, already loaded by the caller; "" means EOF).
    NAMEFLAG  character that starts a name line.
    GILST     object whose .NAME and .STR are filled in.

    On return GILST.NAME holds the text after NAMEFLAG on the name line
    and GILST.STR holds all following lines, stripped and concatenated,
    up to the next name line or end of file. INFILE.LINE is left on the
    next name line (or "" at end of file) so the function can be called
    again for the following list. If no name line is found, both NAME
    and STR stay empty.

    Blank lines inside or after a list are skipped; the file is advanced
    on every pass through the loop so such lines cannot stall the reader.
    """

    # Read name line: skip forward until a line starting with NAMEFLAG
    while (INFILE.LINE != "" and GILST.NAME == "") :
        STRIPPED = INFILE.LINE.strip()
        if len(STRIPPED) > 0 and STRIPPED[0] == NAMEFLAG :
            GILST.NAME = STRIPPED[1:]
        INFILE.LINE = INFILE.F.readline()

    # Read GI list
    # GDE wraps the flat file with newlines every 60 characters, so the
    # pieces are collected and joined into one long string.
    PIECES = []
    if GILST.NAME != "" :
        while INFILE.LINE != "" :
            TMPLINE = INFILE.LINE.strip()
            if len(TMPLINE) > 0 and TMPLINE[0] == NAMEFLAG :
                # Start of the next list: leave it in INFILE.LINE
                break
            PIECES.append(TMPLINE)
            INFILE.LINE = INFILE.F.readline()

    GILST.STR = "".join(PIECES)

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Retrieve the sequences and write them to outfile
def GETSEQS(GILST,METHOD,OPTION,OFN) :
    """Retrieve sequences with leash and append them to the output file.

    GILST   ID list object; GILST.STR holds the IDs passed to leash.
    METHOD  SeqHound method name, passed to leash after -mn.
    OPTION  leash option placed in front of the IDs (the main procedure
            passes ' -mpil ' or ' -mpi ').
    OFN     output file name; results are appended to it.

    METHOD is printed to standard output. If GILST.STR is empty nothing
    else happens. Otherwise leash writes to a temporary file named
    'SHGet.<pid>' in the current directory; if that file ends up
    non-empty its contents are appended to OFN. The temporary file is
    always removed. Failures to run leash or to write OFN are reported
    on standard error and do not raise.
    """
    import shutil
    import subprocess

    print(METHOD)
    if len(GILST.STR) == 0 :
        return

    # Create a temporary filename
    TFN = 'SHGet.' + str(os.getpid())

    # Build an argument vector instead of a shell command line, so that
    # text read from the input file is never interpreted by a shell.
    # Splitting on whitespace mirrors how the shell tokenised the string.
    ARGV = ('leash -mn ' + METHOD + OPTION + GILST.STR).split() + ['-of', TFN]

    try :
        try :
            subprocess.call(ARGV)
        except EnvironmentError :
            sys.stderr.write('SHGet.py: unable to run leash\n')
            return

        if os.path.exists(TFN) and os.path.getsize(TFN) > 0 :
            # Append the contents of the temporary file to the output file
            try :
                SRC = open(TFN,'rb')
                try :
                    DST = open(OFN,'ab')
                    try :
                        shutil.copyfileobj(SRC,DST)
                    finally :
                        DST.close()
                finally :
                    SRC.close()
            except EnvironmentError :
                sys.stderr.write('SHGet.py: unable to append to ' + OFN + '\n')
    finally :
        # Remove the temporary file even when it is empty or an error occurred
        if os.path.exists(TFN) :
            os.remove(TFN)
    return
    
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Retrieve the IDs and write them to outfile
def GETIDS(GILST,METHOD,OPTION,EXT,OFN) :
    """Retrieve IDs with leash and append them to the output file as a
    named GDE list.

    GILST   ID list object; GILST.NAME names the list, GILST.STR holds
            the IDs passed to leash.
    METHOD  SeqHound method name, passed to leash after -mn.
    OPTION  leash option placed in front of the IDs (the main procedure
            passes ' -mpil ' or ' -mpi ').
    EXT     extension for the output list name: it replaces whatever
            follows the last '.' in GILST.NAME, or is appended if the
            name has no '.'.
    OFN     output file name; results are appended to it.

    METHOD and the new list name are printed to standard output. If
    GILST.STR is empty nothing else happens. Otherwise leash writes to
    a temporary file 'SHGet.<pid>temp' in the current directory, which
    is always removed afterwards. If leash produced any output, a name
    line (NAMEFLAG + new name), the output with newlines turned into
    commas, and a closing newline are appended to OFN.

    Reads the module-level constant NAMEFLAG when writing the name line.
    Failures to run leash, read its output or write OFN are reported on
    standard error and do not raise.
    """
    import subprocess

    print(METHOD)
    DOT = GILST.NAME.rfind('.')
    if DOT > -1 :
        NEWNAME = GILST.NAME[:DOT] + '.' + EXT
    else :
        NEWNAME = GILST.NAME + '.' + EXT
    print(NEWNAME)

    if len(GILST.STR) == 0 :
        return

    # Create a temporary filename
    TTFN = 'SHGet.' + str(os.getpid()) + 'temp'

    # Build an argument vector instead of a shell command line, so that
    # text read from the input file is never interpreted by a shell.
    # Splitting on whitespace mirrors how the shell tokenised the string.
    ARGV = ('leash -mn ' + METHOD + OPTION + GILST.STR + ' -sep comma ').split() + ['-of', TTFN]

    IDS = ""
    try :
        try :
            subprocess.call(ARGV)
        except EnvironmentError :
            sys.stderr.write('SHGet.py: unable to run leash\n')
            return

        # leash may have failed without creating the file; that is not
        # a reason to abort the whole run.
        if os.path.exists(TTFN) :
            try :
                SRC = open(TTFN,'r')
                try :
                    # The -sep option in Leash2.2 is broken. Until it's
                    # fixed, turn the newline separators into commas here.
                    IDS = SRC.read().replace('\n',',')
                finally :
                    SRC.close()
            except EnvironmentError :
                sys.stderr.write('SHGet.py: unable to read ' + TTFN + '\n')
    finally :
        if os.path.exists(TTFN) :
            os.remove(TTFN)

    if len(IDS) > 0 :
        # Append a name line in the form "name, then the IDs, then a
        # newline that terminates the list.
        try :
            DST = open(OFN,'a')
            try :
                DST.write(NAMEFLAG + NEWNAME + '\n' + IDS + '\n')
            finally :
                DST.close()
        except EnvironmentError :
            sys.stderr.write('SHGet.py: unable to append to ' + OFN + '\n')
    return
    
       
#======================== MAIN PROCEDURE ==========================
# Parse the command line, then either print help, report a missing
# input file, or process every list found in the input file.
P = Parameters ()

READARGS(P)

if P.HELP == 'y' :
    PRINTHELP()
elif not (os.path.exists(P.IFN)) :
    print('SHGet.py: ' + P.IFN + ' not found')
else:

    #---------- Set global constants
    NAMEFLAG = '"'  # 1st character on the name line, indicating
                    # the beginning of the next data list.
                    # GETIDS also reads this module-level name.

    MAXSEQ = 1000000 # maximum sequence length that SeqHound can retrieve
                     # NOTE: not applied anywhere in the code below.

    INFILE = FILE(P.IFN,'r')

    # try/finally so the input file is closed even if a retrieval fails
    try :
        # ------------------------- MAIN LOOP -----------------------
        # GDE flatfile may contain 0 or more lists, so we iterate
        # for each list.
        # Note that GETGDELIST takes care of reading in the next
        # input line.

        INFILE.LINE = INFILE.F.readline() # LINE contains the most recently-read line
        while (INFILE.LINE != "") :

            # Read in GDE flat file
            GILST = IDLST()
            GETGDELIST(INFILE,NAMEFLAG,GILST)
            print(GILST.NAME)

            # Choose the retrieval routine and leash option from the
            # method name: known sequence methods go to GETSEQS, anything
            # else is treated as returning IDs. Methods that take a whole
            # list use ' -mpi ', the others ' -mpil '.
            if P.METHOD in P.SEQMETHODS :
                GETSEQS(GILST,P.METHOD,' -mpil ',P.OFN)
            elif P.METHOD in P.SEQLISTMETHODS :
                GETSEQS(GILST,P.METHOD,' -mpi ',P.OFN)
            elif P.METHOD.endswith('List') :
                GETIDS(GILST,P.METHOD,' -mpi ',P.EXT,P.OFN)
            else :
                GETIDS(GILST,P.METHOD,' -mpil ',P.EXT,P.OFN)
    finally :
        INFILE.F.close()




