#!/usr/local/bin/python
# July 18, 2008, Dr. Brian Fristensky, University of Manitoba

" Description: Extract a subset of sequences from a GDE flatfile."


" Synopsis: BLExtractSubset.py namefile infile outfile"

# Files: namefile    list of sequence names to be written
#        infile      GDE flat file containing all sequences
#        outfile     GDE flat file containing subset of sequences listed in namefile

import sys
import string
import os.path
import shutil

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read in namefile
def GETNAMES(FN) :
    """Return the list of sequence names stored in the file FN.

    Each line is stripped of leading and trailing whitespace; lines
    that are empty after stripping are ignored. Names are returned in
    file order, and duplicates are kept.

    FN      path of the namefile, one name per line
    """
    # 'with' guarantees the file is closed even if reading fails.
    with open(FN,'r') as FILE :
        STRIPPED = (LINE.strip() for LINE in FILE)
        return [NAME for NAME in STRIPPED if NAME]


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read each sequence in infile and write every sequence to its own
# file (<name>.flat) in the temporary directory. Selection of the
# sequences listed in namefile is done afterwards, by WRITESEQS.
def GETSEQS(INFILE,TEMPDIR) :
    """Split a GDE flat file into one file per sequence.

    Changes the current working directory to TEMPDIR and leaves it
    there. Every non-blank line of INFILE is stripped of surrounding
    whitespace; a line whose first character is in the module-level
    SEQFLAGS starts a new sequence, whose name is the first
    whitespace-delimited token with the flag character removed. The
    sequence is written to '<name>.flat' in TEMPDIR. A later sequence
    with the same name overwrites the earlier file.

    INFILE    open, readable file object; it is closed before returning
    TEMPDIR   existing directory in which the .flat files are created
    """
    os.chdir(TEMPDIR)

    # Lines that precede the first sequence header are discarded by
    # sending them to the null device, so that there is always an open
    # output file to write to and to close.
    OUTFILE = open(os.devnull,'w')
    try :
        for LINE in INFILE :
            LINE = LINE.strip()
            if not LINE :
                continue
            if LINE[0] in SEQFLAGS :
                # Header line: switch output to this sequence's own file.
                # NOTE(review): the name comes straight from the input
                # file and is used as a file name without any checking;
                # a name containing a path separator would not resolve
                # to a file directly inside TEMPDIR.
                SEQNAME = LINE.split()[0][1:]
                OUTFILE.close()
                OUTFILE = open(SEQNAME + '.flat','w')
            # Header and data lines alike are copied to the current file.
            OUTFILE.write(LINE + '\n')
    finally :
        # Close (and so flush) the last sequence file explicitly, so its
        # contents are on disk before anything tries to read it back.
        OUTFILE.close()
        INFILE.close()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Write only the sequences listed in namefile to the output file,
# in the order they appear in namefile
def WRITESEQS(NAMES,OFN) :
    """Concatenate the per-sequence files named in NAMES into OFN.

    For each name N, in the order given, the file 'N.flat' is looked up
    relative to the current working directory and, if it exists, its
    contents are appended to OFN. Names with no matching file are
    skipped silently. OFN is created (or truncated) even when NAMES is
    empty. On the way out the working directory is reset to the
    module-level CWD, whether or not the copy succeeded.

    NAMES   list of sequence names
    OFN     path of the output file
    """
    try :
        with open(OFN,'w') as OUTFILE :
            for N in NAMES :
                FN = N + '.flat'
                if not os.path.exists(FN) :
                    continue
                with open(FN,'r') as INFILE :
                    shutil.copyfileobj(INFILE,OUTFILE)
    finally :
        # Always return to the original directory, even after an error.
        os.chdir(CWD)
    
#======================== MAIN PROCEDURE ==========================

#---------- Set global variables
# Report missing arguments with a usage message rather than an
# IndexError traceback.
if len(sys.argv) < 4 :
    sys.stderr.write('Usage: BLExtractSubset.py namefile infile outfile\n')
    sys.exit(1)

NFN = sys.argv[1]
IFN = sys.argv[2]
OFN = sys.argv[3]
SEQFLAGS = (['>','#','%','"','@']) # indicate sequence name line
CWD = os.getcwd()

# Create/truncate the output file up front so that an (empty) outfile
# exists even when one of the input files is missing. The handle is
# closed immediately instead of being left open for the whole run.
# NOTE: this empties outfile before infile is read, so infile and
# outfile must not be the same file.
open(OFN,'w').close()

if os.path.exists(NFN) :

    # Read in namefile
    NAMES = GETNAMES(NFN)

    if os.path.exists(IFN) :
        # Create a temporary working directory. An absolute path is used
        # so that it can be removed regardless of the current directory.
        TEMPDIR = os.path.join(CWD,'BLExtractSubset.' + str(os.getpid()))
        os.mkdir(TEMPDIR,0o700)

        try :
            # Split infile into one file per sequence in TEMPDIR
            with open(IFN,'r') as INFILE :
                GETSEQS(INFILE,TEMPDIR)

            # Write the selected sequences to outfile. os.path.join
            # leaves an absolute OFN untouched, whereas plain string
            # concatenation with CWD would produce a wrong path.
            WRITESEQS(NAMES,os.path.join(CWD,OFN))
        finally :
            # Clean up, even if splitting or writing failed. Leave the
            # temporary directory before deleting it.
            os.chdir(CWD)
            shutil.rmtree(TEMPDIR)

    else :
        print('BLExtract Subset: File not found: ' + IFN)

else :
    print('BLExtract Subset: File not found: ' + NFN)




