#!/usr/local/bin/python
"""
 Oct. 6, 2010, Dr. Brian Fristensky, University of Manitoba
 dbsout.py - Extract hit lines or ID#'s from BLAST, GENPEPT or FASTA output
            Send output to files or windows.

 Synopsis:
  dbsout.py infile [-e ethreshold] -d destination [outfile]
       infile - output from BLAST, GENPEPT or FASTA
       -e     - only select hits whose E value is
                 <= ethreshold
       -d     - destination: one of the following:
                 textedit - open output files in the text editor
                   specified by the $GDE_TEXTEDIT environment variable
                 files - write to files, using the basename
                   specified by outfile
                 bldata - send hit lines to a bldata window
       outfile - basename for output file(s)

@modified: May 26 2010
@author: Dale Hamel
@contact: umhameld@cc.umanitoba.ca
"""
import sys
import os
import re
import shutil
import sys

# Make the BIRCH Python library importable. The directory is taken from the
# BIRCHPYLIB environment variable; when the variable is unset nothing is
# added, instead of appending None to sys.path.
blib = os.environ.get("BIRCHPYLIB")
if blib is not None:
    sys.path.append(blib)

from birchlib import Birchmod
from birchlib import Argument

# Program name prefix and usage text handed to Birchmod.
PROGRAM = "dbsout.py : "
USAGE = "\n\tUSAGE: dbsout.py infile [-e ethreshold] -d destination [outfile]"

# Shared Birchmod instance, used below for usage/error reporting,
# documentation mode and program exit.
BM = Birchmod(PROGRAM, USAGE)



class Parameters:
    """
    Wrapper class for command line parameters.

    By default, ETHRESHOLD is set to 10000, so that all hits will be
    returned if -e is not set at the command line.
    """

    def __init__(self):
        """
        Set every parameter to its default value, then call read_args()
        to fill in the values given on the command line.
        """
        self.IFN = ""                   # input file name
        self.ETHRESHOLD = float(10000)  # keep hits with E value <= this
        self.DESTINATION = ""           # value of the -d option
        self.OFN = ""                   # basename for output file(s)
        self.PID = ""                   # filled in by main() with the process id
        self.TWOIDS = ""                # set by PARSEBLAST / PARSEGENPEPT
        self.read_args()

    def read_args(self):
        """
        Read command line arguments into this Parameters object.

        The arguments are declared through birchlib's Argument class.
        If fetching any of them raises an exception, the usage message
        is printed via BM.printusage().
        """
        infile = Argument("", str, BM)
        ethresh = Argument("-e", float, BM)
        dest = Argument("-d", str, BM)
        outfile = Argument("", str, BM)

        # infile is the first argument; outfile, when present, is the last.
        infile.set_position(1)
        outfile.set_position(len(sys.argv) - 1)
        ethresh.set_optional()
        outfile.set_optional()

        try:
            self.IFN = infile.fetch()
            self.OFN = outfile.fetch()
            self.DESTINATION = dest.fetch()

            # Only override the default threshold when fetch() returned
            # a value for -e.
            threshold = ethresh.fetch()
            if threshold is not None:
                self.ETHRESHOLD = threshold
        except Exception:
            # Exception (rather than a bare except) lets SystemExit and
            # KeyboardInterrupt propagate instead of being turned into a
            # usage message.
            BM.printusage()
	

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def READHITS(P, HITS):
    """
    Read hit lines from a FASTA, BLAST or GENPEPT output file.

    The file named by P.IFN is scanned for the first line that looks like
    a hit line. That line decides the file type, which is stored in
    P.FILETYPE ('BLAST', 'FASTA' or 'GENPEPT'; '' when no hit line is
    found). Reading then continues until the first line that is not a hit
    line of that type, or until end of file. Every hit whose E value is
    <= P.ETHRESHOLD is appended to HITS after stripping surrounding
    spaces (the line terminator is kept).

    If the file cannot be opened, BM.file_error() is called and nothing
    is read.

    @param P: parameter object; IFN and ETHRESHOLD are read, FILETYPE is set
    @param HITS: list that receives the selected hit lines
    @type HITS: list
    """
    # Raw strings keep the regex escapes (\w, \S, \|) out of the hands of
    # Python's own string-escape processing.
    GI_PATTERN = re.compile(r'^[\w]{2,}\|\S*\|')
    FASTA_PATTERN = re.compile(r'^[^>][\w]* - *')
    GENPEPT_PATTERN = re.compile(r'^[^>][\w]*_[1-6] [\w]* ')

    def isGI(LINE):
        """
        Return a true value if LINE is a BLAST-style hit, eg.
        gi|169080|gb|AAA33662.1| disease resistance respo  ( 175) 1163 295.7 5.7e-79
        """
        return GI_PATTERN.match(LINE)

    def isFASTA(LINE):
        """
        Return a true value if LINE is a FASTA-style hit, eg.
        AF141131 - Helianthus annuus cultivar Line HA  ( 439) [3]  564 146.8 9.3e-35
        """
        return FASTA_PATTERN.match(LINE)

    def isGENPEPT(LINE):
        """
        Return a true value if LINE is a GENPEPT-style hit, eg.
        J02593_1 SRAAFP 213874  Sea raven (Hemitripter ( 195) [f] 1369 310.6 3.5e-82
        """
        return GENPEPT_PATTERN.match(LINE)

    def EVALUE(LINE):
        """
        Return the E value from a hit line (the last field on the line).
        """
        # Split on any run of whitespace so that trailing blanks or the
        # line terminator are never mistaken for the E value field.
        TOKENS = LINE.split()
        if not TOKENS:
            return 0
        ESTR = TOKENS[-1]
        # For floating point numbers <= 1e-100, BLAST truncates the
        # E value to something like 'e-100', which float() rejects.
        # First, we try prepending a '1'. If that still fails, we assume
        # the E value is 0. The worst that can happen is that we show an
        # extra hit, and a bad E value should be seen upon inspection of
        # the output.
        if ESTR[0] == 'e':
            ESTR = '1' + ESTR
        try:
            return float(ESTR)
        except ValueError:
            return 0

    # Tested in this order; the first recognizer that matches a line
    # decides the file type.
    RECOGNIZERS = (
        ('BLAST', isGI),
        ('FASTA', isFASTA),
        ('GENPEPT', isGENPEPT),
    )

    P.FILETYPE = ""
    try:
        FILE = open(P.IFN, 'r')
    except (IOError, OSError):
        BM.file_error(P.IFN)
        return

    with FILE:
        # Find the first hit line, in any of the supported formats.
        ISHIT = None
        LINE = FILE.readline()
        while LINE != '' and ISHIT is None:
            for FILETYPE, RECOGNIZER in RECOGNIZERS:
                if RECOGNIZER(LINE):
                    P.FILETYPE = FILETYPE
                    ISHIT = RECOGNIZER
                    break
            else:
                LINE = FILE.readline()

        # Keep reading lines until a non-hit line or end of file.
        while ISHIT is not None and LINE != '' and ISHIT(LINE):
            LINE = LINE.strip(" ")
            if EVALUE(LINE) <= P.ETHRESHOLD:
                HITS.append(LINE)
            LINE = FILE.readline()
    return
	
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Identifiers:
    """
    For a given database, a list of IDs from hit lines.
    """

    def __init__(self):
        """
        Start with an empty database tag, an empty ID list and no
        output file name.
        """
        self.db = ""     # database tag; becomes the output file extension
        self.list = []   # identifiers collected from the hit lines
        self.ofn = ""    # name of the file written by writeFile()

    def writeFile(self, ONAME):
        """
        Write the IDs, one per line, to the file ONAME.<db> and remember
        that file name in self.ofn.

        @param ONAME: Basename of the output file
        @type ONAME: str
        """
        self.ofn = ONAME + '.' + self.db
        # The with-block closes the file even if a write fails.
        with open(self.ofn, 'w') as OUTFILE:
            for J in self.list:
                OUTFILE.write(J + "\n")
        return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEBLAST(P, HITS, ID1, ID2):
    """
    Parse database identifiers from BLAST hit lines.

    Hit lines have the form db1|id1|... or db1|id1|db2|id2|...
    The first hit line decides whether there are one or two sets of
    identifiers and which databases they come from.

    @param P: parameter object; P.TWOIDS is set to a true value when the
              first hit line carries two db|id pairs, a false value otherwise
    @param HITS: hit lines to parse
    @type HITS: list
    @param ID1: receives the first database name (db) and its IDs (list)
    @param ID2: receives the second database name and IDs, when present
    """
    # Nothing to parse, eg. when every hit was above the E threshold.
    if not HITS:
        P.TWOIDS = 0
        return

    # Parse the first hit line to find out how many sets of
    # identifiers there are, and which databases they are from.
    FIRST = HITS[0]
    P.TWOIDS = re.match(r'^[\w]{2,}\|\S*\|[\w]{2,}\|\S*\|', FIRST)
    TOKENS = FIRST.split("|")
    ID1.db = TOKENS[0]
    if P.TWOIDS:
        ID2.db = TOKENS[2]

    # Now, process the entire set of hits. Extract identifiers
    # from each hit line and add them to the ID list(s).
    for J in HITS:
        TOKENS = J.split("|")
        ID1.list.append(TOKENS[1])
        if P.TWOIDS and len(TOKENS) >= 4:
            ID2.list.append(TOKENS[3])

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEFASTA(HITS, ID1):
    """
    Parse database identifiers from FASTA hit lines.

    The identifier is the text before the first space of each hit line.
    The database name is always set to 'nam'.

    @param HITS: hit lines to parse
    @type HITS: list
    @param ID1: receives the database name (db) and the IDs (list)
    """
    ID1.db = 'nam'
    ID1.list.extend(J.split(" ", 1)[0] for J in HITS)
    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEGENPEPT(P, HITS, ID1, ID2):
    """
    Parse database identifiers from GENPEPT hit lines.

    The first space-separated field of each hit line goes to ID1
    (database 'gp'). When the first hit line shows a second identifier,
    the second field of each line goes to ID2 (database 'gb').

    @param P: parameter object; P.TWOIDS is set to a true value when the
              first hit line carries two identifiers, a false value otherwise
    @param HITS: hit lines to parse
    @type HITS: list
    @param ID1: receives the first database name (db) and its IDs (list)
    @param ID2: receives the second database name and IDs, when present
    """
    # Nothing to parse, eg. when every hit was above the E threshold.
    if not HITS:
        P.TWOIDS = 0
        return

    # Parse the first hit line to find out how many sets of
    # identifiers there are.
    P.TWOIDS = re.match(r'^[^>][\w]*_[1-6] [\w]* ', HITS[0])
    ID1.db = 'gp'
    if P.TWOIDS:
        ID2.db = 'gb'

    # Now, process the entire set of hits. Extract identifiers
    # from each hit line and add them to the ID list(s).
    for J in HITS:
        TOKENS = J.split(" ")
        ID1.list.append(TOKENS[0])
        if P.TWOIDS and len(TOKENS) >= 4:
            ID2.list.append(TOKENS[1])

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RUNTEXTEDIT(OFN):
    """
    Open OFN in a text editor as a background shell job, and remove
    the file once the editor command has finished.

    @param OFN: name of the file to open
    @type OFN: str
    """
    # The editor command is whatever choose_edit_wrapper.sh prints.
    # It's surprising how many issues there are with launching multiple
    # files in a text editor; the wrapper takes care of these issues, which
    # is why $GDE_TEXTEDIT is no longer invoked directly here.
    PIECES = ['(`choose_edit_wrapper.sh` ', OFN, '; $RM_CMD ', OFN, ')&']
    os.system(''.join(PIECES))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RUNDGDE(P, ID1, ID2):
    """
    Build a flat file from the ID list file(s), then run dgde on it as a
    background shell job, removing the flat file when dgde finishes.

    @param P: parameter object; PID names the work files, TWOIDS says
              whether ID2 holds a second list
    @param ID1: first identifier list; its ofn file is passed to list2flat.py
    @param ID2: second identifier list, used only when P.TWOIDS is true
    """
    LIST2FLAT = 'python $BIRCH/script/list2flat.py '
    FLATNAME = ''.join(['dbsout.', P.PID, '.flat'])

    os.system(''.join([LIST2FLAT, ID1.ofn, ' ', FLATNAME]))

    if P.TWOIDS:
        # The second list goes through a scratch file which is then
        # appended to the flat file and deleted.
        SCRATCH = ''.join([P.PID, '.temp'])
        os.system(''.join([LIST2FLAT, ID2.ofn, ' ', SCRATCH]))
        os.system(''.join(['cat ', SCRATCH, '>> ', FLATNAME]))
        os.remove(SCRATCH)

    os.system(''.join(['(dgde ', FLATNAME, '; $RM_CMD ', FLATNAME, ')&']))

           
#======================== MAIN PROCEDURE ==========================
def main():
    """
    Called when not in documentation mode.

    Reads the command line, extracts the hit lines from the input file,
    parses the database identifiers out of them and sends the results to
    the destination chosen with -d.
    """
    P = Parameters()

    # Read hit lines from FASTA, BLAST or GENPEPT output file
    HITS = []
    READHITS(P, HITS)

    # Parse out the database identifiers from hit lines.
    # BLAST and GENPEPT hit lines may have two lists of identifiers.
    # FASTA hit lines have one.
    ID1 = Identifiers()
    ID2 = Identifiers()

    P.PID = str(os.getpid())
    # The BLAST and GENPEPT parsers inspect the first hit line, so only
    # call a parser when at least one hit passed the E threshold.
    if HITS:
        if P.FILETYPE == 'BLAST':
            PARSEBLAST(P, HITS, ID1, ID2)
        elif P.FILETYPE == 'FASTA':
            PARSEFASTA(HITS, ID1)
        elif P.FILETYPE == 'GENPEPT':
            PARSEGENPEPT(P, HITS, ID1, ID2)

    # Write the output to a file, or send it to a window, as specified
    # in -d destination

    if P.DESTINATION == 'textedit':
        # Work on copies named after the process id; RUNTEXTEDIT removes
        # each file once its editor command finishes.
        TEMPOFN = P.PID + '.' + 'outfile'
        shutil.copy(P.IFN, TEMPOFN)
        RUNTEXTEDIT(TEMPOFN)
        ID1.writeFile(P.PID)
        RUNTEXTEDIT(ID1.ofn)
        if P.TWOIDS:
            ID2.writeFile(P.PID)
            RUNTEXTEDIT(ID2.ofn)
        RUNDGDE(P, ID1, ID2)

    elif P.DESTINATION == 'files':
        shutil.copy(P.IFN, P.OFN)
        ID1.writeFile(P.OFN)
        if P.TWOIDS:
            ID2.writeFile(P.OFN)

    elif P.DESTINATION == 'bldata':
        # Parenthesized form prints the same text under Python 2 and 3.
        print('output to bldata not yet supported')

    BM.exit_success()

# Run the program unless BM.documentor() returns a true value or "-test"
# was given on the command line.
if not (BM.documentor() or "-test" in sys.argv):
    main()
