#!/usr/bin/python

# uniqid.py - Read a source file and replace each definition line with a unique
#             identifier. Store the unique ID and original definition line 
#             in a .csv file as a key-value pair.

# Version  7  Feb  2010

# Synopsis:
#   uniqid.py [options]  -encode sourcein sourceout csvout
#   uniqid.py [options]  -decode textin textout csvin
#
#        -encode (default)   options begin with a dash; filenames do not
#                            The first three filenames on the command line 
#                            are read as sourcein, the original source file;
#                            sourceout, a copy of the source file in which each
#                            definition line is replaced with a unique ID;
#                            and csvout, a comma-separated value file containing
#                            the unique identifier and the corresponding
#                            definition line 
#      
#        -decode             options begin with a dash; filenames do not
#                            The first three filenames on the command line 
#                            are read as textin, any text file containing
#                            unique IDs generated from a previous run using 
#                            -encode; textout the output file in which the
#                            unique ID is replaced by the original name, or
#                            the name plus parts of the definition line; csvin, 
#                            the csv file generated by a previous run using
#                            -encode.
#
#        -f list_of_fields   similar to -f in the Unix cut
#                            command. A comma-separated list of fields to be
#                            written to textout when decoding files.
#
#        -s separator        separator is the character to use as the separator
#                            when parsing a definition line into fields.
#                            default = " ", a blank space
#
#        -nf string          string is one or more characters that begin the
#                            unique identifier with which the definition
#                            line is replaced. default = "!_"
#
# Idea for more general version of program:
#   An option lets you input a regular expression that is used for
#   finding the original ID, rather than just hardwiring fasta format
#   into the program. The program will still default to search for fasta
#   sequence names, but by employing regular expressions, uniqid.py
#   can perform substitutions in ANY type of file. Probably not hard
#   to implement, either. 

import operator
import random
import bisect
import math
import sys
import string
import re

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """Wrapper class for command line parameters.

    Attributes:
        FIELDS     field numbers (1-based) selected with -f; default [1].
                   Values read from the command line are kept as strings.
        SEP        single character separating the fields of a definition line
        NAMEFLAG   string with which every unique ID begins
        ENCODE     True for -encode (the default), False for -decode
        SOURCEIN, SOURCEOUT, CSVOUT   filenames used when encoding
        TEXTIN, TEXTOUT, CSVIN        filenames used when decoding
        CSVSEP     separator between ID and definition line in the csv file
    """

    def __init__(self):
        self.FIELDS = [1]       # list of fields to parse with -f option
        self.SEP = " "          # separator for parsing fields from the def. line
        self.NAMEFLAG = "!_"    # all IDs begin with this string
        self.ENCODE = True
        self.SOURCEIN = ""
        self.SOURCEOUT = ""
        self.CSVIN = ""
        self.TEXTIN = ""
        self.TEXTOUT = ""
        self.CSVOUT = ""
        self.CSVSEP = "\t"

    def ReadArgs(self, ARGV=None):
        """Read command line arguments into this Parameters object.

        ARGV is the argument list to parse, program name first; it defaults
        to sys.argv. Recognised options are -f, -s, -nf, -encode and -decode.
        Every other argument is taken as a filename. The first three
        filenames are stored as SOURCEIN, SOURCEOUT, CSVOUT when encoding, or
        as TEXTIN, TEXTOUT, CSVIN when decoding; the mode used is the one in
        effect after the whole command line has been read, so -decode may
        appear before or after the filenames. Extra filenames are reported
        and ignored. An option that is missing its value is reported and
        parsing stops there. The resulting settings are printed, one per line.
        """

        def Unquote(STR):
            """Return STR without one matching pair of enclosing quotes."""
            if len(STR) >= 2 and STR[0] in "\"'" and STR[-1] == STR[0]:
                return STR[1:-1]
            return STR

        if ARGV is None:
            ARGV = sys.argv

        NUMARGS = len(ARGV)
        FILENAMES = []
        I = 1
        while I < NUMARGS:
            ARG = ARGV[I]
            if ARG in ("-f", "-s", "-nf"):
                # These options consume the following argument as their value.
                if I + 1 >= NUMARGS:
                    if ARG == "-f":
                        print("uniqid.py: No fields specified for -f")
                    else:
                        print("uniqid.py: No value specified for " + ARG)
                    break
                VALUE = ARGV[I + 1]
                if ARG == "-f":
                    self.FIELDS = VALUE.split(',')
                elif ARG == "-s":
                    SEPSTR = Unquote(VALUE)
                    if SEPSTR:
                        # only the first character is used as the separator
                        self.SEP = SEPSTR[0]
                    else:
                        print("uniqid.py: No value specified for -s")
                else:
                    self.NAMEFLAG = Unquote(VALUE)
                I = I + 2
            elif ARG == "-encode":
                self.ENCODE = True
                I = I + 1
            elif ARG == "-decode":
                self.ENCODE = False
                I = I + 1
            else:
                if len(FILENAMES) < 3:
                    FILENAMES.append(ARG)
                else:
                    print("uniqid.py: ignoring extra argument " + ARG)
                I = I + 1

        # Pad so that files not given on the command line stay "".
        FILENAMES = FILENAMES + [""] * (3 - len(FILENAMES))
        if self.ENCODE:
            self.SOURCEIN, self.SOURCEOUT, self.CSVOUT = FILENAMES
        else:
            self.TEXTIN, self.TEXTOUT, self.CSVIN = FILENAMES

        # Report the settings that will be used for this run.
        for VALUE in (self.FIELDS, self.SEP, self.NAMEFLAG, self.ENCODE,
                      self.SOURCEIN, self.SOURCEOUT, self.CSVOUT,
                      self.TEXTIN, self.TEXTOUT, self.CSVIN):
            print(VALUE)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# For each sequence in sourcein, write the sequence to sourceout
# replacing the definition line with a unique identifier. 
# Write the unique identifier and the definition line to csvout
# 

def EncodeNames(NAMEFLAG, SOURCEIN, SOURCEOUT, CSVOUT, CSVSEP):
    """Replace every definition line of SOURCEIN with a unique identifier.

    Each non-blank line of SOURCEIN is stripped of surrounding whitespace.
    A line beginning with '>' is a definition line: it is written to
    SOURCEOUT as '>' followed by a new unique ID, and the pair
    ID + CSVSEP + definition (without the leading '>') is written to CSVOUT.
    Every other non-blank line is copied to SOURCEOUT; blank lines are
    dropped.

    NAMEFLAG   string with which every unique ID begins
    SOURCEIN   name of the input sequence file
    SOURCEOUT  name of the output sequence file
    CSVOUT     name of the output file of ID / definition line pairs
    CSVSEP     separator written between ID and definition line

    Raises RuntimeError if more definition lines are found than there are
    distinct IDs available.
    """
    MAXID = 9999999

    # IDs already handed out. A set gives constant-time membership tests,
    # so checking for a clash stays cheap however many sequences are read.
    USED = set()

    def GetUniqID(USED):
        """Return a random ID string not yet in USED, and record it there."""
        if len(USED) >= MAXID:
            raise RuntimeError("uniqid.py: no unused identifiers left")
        while True:
            ID = str(random.randint(1, MAXID))
            if ID not in USED:
                USED.add(ID)
                return ID

    with open(SOURCEIN, 'r') as FIN:
        with open(SOURCEOUT, 'w') as FOUT:
            with open(CSVOUT, 'w') as COUT:
                for LINE in FIN:
                    LINE = LINE.strip()
                    if not LINE:
                        continue
                    if LINE[0] == '>':  # new sequence
                        UNIQNAME = NAMEFLAG + GetUniqID(USED)
                        FOUT.write('>' + UNIQNAME + '\n')
                        # > is not considered as part of the definition line
                        # so we drop the first char. of LINE
                        COUT.write(UNIQNAME + CSVSEP + LINE[1:] + '\n')
                    else:  # copy the line to output file
                        FOUT.write(LINE + '\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# For each occurrence of a unique ID in textin, write the line to textout
# with the ID replaced by the selected fields of its original definition
# line. The unique IDs and their definition lines are read from csvin.
# 

def DecodeNames(SEP, NF, FIELDS, TEXTIN, TEXTOUT, CSVIN):
    """Replace unique IDs in TEXTIN with fields of their definition lines.

    SEP      separator used to split a definition line into fields
    NF       string with which every unique ID begins; only lines
             containing it are searched for IDs
    FIELDS   list of 1-based field numbers (ints or numeric strings) to
             keep from each definition line, joined with SEP in the order
             given; field numbers beyond the end of a definition line are
             skipped
    TEXTIN   name of the text file containing unique IDs
    TEXTOUT  name of the output file
    CSVIN    name of the tab-separated file of ID / definition line pairs

    Every line of TEXTIN is stripped of surrounding whitespace before it is
    written to TEXTOUT. For each line in which an ID was looked for, the
    text before and after replacement is printed.

    Raises ValueError if an entry of FIELDS is not a whole number.
    """

    def ReadDefLines(CSVIN):
        """Read unique IDs and definition lines from CSVIN into a dictionary."""
        DICT = {}
        with open(CSVIN, 'r') as CIN:
            for LINE in CIN:
                # Split at the first tab only, so that a definition line
                # which itself contains tabs is kept whole.
                KEY, TAB, DEF = LINE.rstrip('\r\n').partition('\t')
                KEY = KEY.strip()
                if KEY and TAB:  # skip blank or malformed lines
                    DICT[KEY] = DEF.strip()
        return DICT

    def GetFields(DEF, FIELDS, SEP):
        """Return the requested fields of DEF, joined with SEP."""
        TOKENS = DEF.split(SEP)
        PARTS = []
        for F in FIELDS:
            INDEX = int(F) - 1
            if 0 <= INDEX < len(TOKENS):
                PARTS.append(TOKENS[INDEX])
        return SEP.join(PARTS)

    # Read the csv file first, so that a missing or unreadable csv file
    # does not leave a truncated output file behind.
    IDDICT = ReadDefLines(CSVIN)

    # Work out the replacement text once per ID rather than once per line.
    REPLACEMENT = {}
    for K in IDDICT:
        REPLACEMENT[K] = GetFields(IDDICT[K], FIELDS, SEP)

    # One pattern matching any known ID. Longer IDs are listed first so that
    # an ID which is a prefix of another (e.g. !_12 and !_123) cannot match
    # inside the longer one.
    PATTERN = None
    if REPLACEMENT:
        KEYS = sorted(REPLACEMENT, key=len, reverse=True)
        PATTERN = re.compile('|'.join([re.escape(K) for K in KEYS]))

    def Substitute(MATCH):
        return REPLACEMENT[MATCH.group(0)]

    with open(TEXTIN, 'r') as TIN:
        with open(TEXTOUT, 'w') as TOUT:
            for LINE in TIN:
                OUTPUTLINE = LINE.strip()
                if OUTPUTLINE.find(NF) != -1:
                    print('Original: ' + OUTPUTLINE)
                    if PATTERN is not None:
                        OUTPUTLINE = PATTERN.sub(Substitute, OUTPUTLINE)
                    print('Modified: ' + OUTPUTLINE)
                TOUT.write(OUTPUTLINE + '\n')


#======================== MAIN PROCEDURE ==========================
# Read the command line, check that all three filenames for the chosen
# mode were given, then encode or decode.
P = Parameters()
P.ReadArgs()

if P.ENCODE:
    FILES = (P.SOURCEIN, P.SOURCEOUT, P.CSVOUT)
    USAGE = "uniqid.py [options]  -encode sourcein sourceout csvout"
else:
    FILES = (P.TEXTIN, P.TEXTOUT, P.CSVIN)
    USAGE = "uniqid.py [options]  -decode textin textout csvin"

if "" in FILES:
    # A filename left empty would only fail later inside open(); report
    # the expected command line instead.
    sys.stderr.write("Usage: " + USAGE + "\n")
    sys.exit(1)

if P.ENCODE:
    EncodeNames(P.NAMEFLAG, P.SOURCEIN, P.SOURCEOUT, P.CSVOUT, P.CSVSEP)
else:
    DecodeNames(P.SEP, P.NAMEFLAG, P.FIELDS, P.TEXTIN, P.TEXTOUT, P.CSVIN)


