

"""
Title: Fasta file sequence concatenation; Author:Justin Zhang; Date: Sep 28, 2010

Usage: python concat_seq.py --infile=<in_file> --outfile=<concat_file>(optional) --seqorderfile=<seqorder_file>(optional) --numNs=<number of Ns>(optional 1-1000)

Argument parameters:
infile - input fasta file (required); 
outfile - output file of concatenated sequence (optional)
seqorderfile: the list of sequences to be cocatenated, if not provided concatenating all the sequences(optional)
numNs - number of Ns (1-1000) between 2 joined sequences, default is 10 (optional)

Statistic output file: default created - a file named <infile_stat>, listed all the sequence name and length, counted number of sequences, average length

Usage cases: 
1. Only statistic output - python concat_seq.py --infile=<in_file>;
2. Concatenating all sequences - python concat_seq.py --infile=<in_file> --outfile=<concat_file> --numNs=<number of Ns>
3. Concatenating sequences listed in seq order file - python concat_seq.py --infile=<in_file> --outfile=<concat_file> --seqorderfile=<seqorder_file> --numNs=<number of Ns>

"""

import sys
sys.path.append('/home/psgendb/local/biopython-1.55') # append biopython lib path
import string
import os.path
import shutil
import re
from optparse import OptionParser
from Bio import SeqIO

def PrintUsage():
    print "Usage 1: python concat_seq.py --infile=<in_file> (for statistics info only)" + '\n'
    print "Usage 2: python concat_seq.py --infile=<in_file> --outfile=<concat_file> --numNs=<number of Ns>(optional 1-1000 default 10)(concatenate all the sequences in the order)" + '\n'
    print "Usage 3: python concat_seq.py --infile=<in_file> --outfile=<concat_file> --seqorderfile=<seqorder_file> --numNs=<number of Ns>(optional 1-1000 default 10)(concatenate list of sequences in seq order file)" + '\n'

#*********************************Main Function**************************************


parser = OptionParser()
parser.add_option("-o", "--outfile", dest="outfile",
                  action="store", type="string")
parser.add_option("-i", "--infile", dest="infile",
                  action="store", type="string")
parser.add_option("-s", "--seqorderfile", dest="orderfile",
                  action="store", type="string")
parser.add_option("-n", "--numNs", dest="numNs",
                  action="store", type="string")
parser.add_option("-t", "--fastaheader", dest="title",
                  action="store", type="string")

(options, args) = parser.parse_args()
if options.infile is None:
    PrintUsage()
    sys.exit(2) 
if options.outfile is None:
    blnConcate=False
else:
    blnConcate=True

if options.orderfile is None:
    blnOrder=False
else:
    blnOrder=True
    if not os.path.exists(options.orderfile):
        print "Sequence order file - " + options.orderfile + " doesnot exist." + '\n'
        PrintUsage()
        sys.exit(2)
numNs=10

if options.numNs is not None:
    blnRightNs=True
    intNs=0
    try:
       intNs=int(options.numNs)
       if not (intNs>0 and intNs<1001):
           print "numNs requires an integer in the range (1 to 1000)\n"
           blnRightNs=False
       else:
           numNs=intNs
    except:
       print "wrong numNs input\n"
       blnRightNs=False
    if not blnRightNs:
       sys.exit(2)

if not os.path.exists(options.infile):
    print "Input file", str(options.infile), "does not exist."
    sys.exit(2)
else:
    STATFILE=open(options.infile+"_stat",'w')
    STATFILE.write("Statistics of sequences in file - " + str(options.infile) + '\n')

title="Concatenated sequence in file - " + str(options.infile)
if options.title is not None:
    if str(options.title)=="":
        print "fasta header input is empty\n"
        print "continue with default header -" + title + " (enter yes): "
        proc=str(sys.stdin.readline())
        if proc != "yes":
            sys.exit(2)
    else:
        title= str(options.title)

try: # if outfile is an appropriate file name
    if blnConcate:
        OUTFILE=open(options.outfile,'w')
        OUTFILE.write(">" + title + '\n')
    blnFirst=True
    num_contig=0
    total_len=0
    SeqHash={'first_id00001':'123456'} #establish hash table
    intMin=1000000000000000
    intMax=0
    intShort=0
    intLong=0
    for seq_record in SeqIO.parse(options.infile, "fasta"):
        num_contig=num_contig+1
        total_len=total_len+len(seq_record)
        STATFILE.write(seq_record.id + '\t' + str(len(seq_record)) + '\n')
        if len(seq_record)<100:
            intShort=intShort+1
        if len(seq_record)>=300:
            intLong=intLong+1
        if len(seq_record)<intMin:
	    intMin=len(seq_record)
        if len(seq_record)>intMax:
            intMax=len(seq_record)
        if blnConcate:
            if not blnOrder:
 #               if not blnFirst:
 #                   OUTFILE.write('N'*numNs)
                OUTFILE.write(str(seq_record.seq)+'\n')
		OUTFILE.write('N'*numNs)
                blnFirst=False
            else:
                SeqHash[seq_record.id]=seq_record.seq
    if blnConcate:
        if blnOrder:
            blnFirst=True
            SEQFILE=open(options.orderfile,'r')
            for line in SEQFILE:
                w=re.search('\w+',line)
                if SeqHash[w.group(0)]==None:
                    qprint "Sequence - " + w.group(0) + " does not exist.\n"          
                else:
                    if not blnFirst:
                        OUTFILE.write('N'*numNs)
                    blnFirst=False
                    print "writting sequence - " + w.group(0) + '\n'
                    OUTFILE.write(str(SeqHash[w.group(0)]))
        OUTFILE.close()
    STATFILE.write("Number sequences: " + str(num_contig)+'\n')
    STATFILE.write("Average sequence length: " + str(int(total_len/num_contig))+'\n')
    STATFILE.write("Shortest sequence length: " + str(intMin)+'\n')
    STATFILE.write("Longest sequence length: " + str(intMax)+'\n')
    STATFILE.write("Number sequeces shorter than 100: " + str(intShort)+'\n')
    STATFILE.write("Number sequeces not shorter than 300: " + str(intLong)+'\n')
    STATFILE.close
except IOError:
    print "The file ", str(options.outfile), "cannot be open to write , exiting gracefully"
    sys.exit(2)


 
        


