/*
 * GenBankFile.java
 *
 * Created on January 30, 2008, 11:58 AM
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */

import org.biolegato.core.data.sequence.Sequence;
import org.biolegato.core.data.sequence.Sequence.Type;
import org.biolegato.core.data.sequence.Sequence.Topology;
import org.biolegato.core.data.sequence.Sequence.Strandedness;
import org.biolegato.core.main.BLMain;
import org.biolegato.core.plugintypes.DataFormat;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * Parser/translator for GenBank flat files (using the file specification current to 2008).
 * <p>
 * {@link #translateTo} renders a single sequence as a GenBank entry and
 * {@link #translateFrom} reads zero or more entries from a stream.  Keyword
 * matching and date formatting are done with {@link Locale#ENGLISH} so the
 * results do not depend on the default locale of the JVM.
 *
 * @author Graham Alvare
 * @author Brian Fristensky
 */
public class GenBankFile2008 extends DataFormat {

    /** Keyword looked for (case-insensitively) by {@link #isFormat}. */
    private static final String LOCUS_KEYWORD = "locus";

    /** Source name passed to BLMain.warning for messages raised by this class. */
    private static final String WARNING_SOURCE = "GenBankFile2008.class";

    /**
     * Creates a new instance of GenBankFile2008
     */
    public GenBankFile2008 () {
    }

    /**
     * Translates a sequence into the GenBank file format.
     * <p>
     * When the sequence is flagged as modified, or carries no original GenBank
     * header ("originalgb"), a new header (LOCUS, ACCESSION, DESCRIPTION,
     * VERSION) is generated from the sequence's fields; otherwise the stored
     * header text is written back unchanged.  The ORIGIN section is always
     * regenerated from the "sequence" field.
     *
     * @param seq the sequence to translate
     * @return the resulting string (empty if <code>seq</code> is null)
     */
    public String translateTo (Sequence seq) {
        StringBuilder result = new StringBuilder();

        if (seq == null) {
            return result.toString();
        }

        // missing fields are treated as empty text instead of raising a NullPointerException
        String sequence = fieldText(seq, "sequence");
        String originalHeader = fieldText(seq, "originalgb");

        if (Boolean.TRUE.equals(seq.get("modified")) || "".equals(originalHeader)) {
            appendHeader(result, seq, sequence.length());
        } else {
            result.append(originalHeader);
        }
        appendOrigin(result, sequence);
        return result.toString();
    }

    /**
     * Translates data in the GenBank file format to sequence objects.
     * <p>
     * A line that starts with a space or tab continues the field opened by the
     * previous keyword line; any other non-empty line either terminates the
     * entry ("//") or opens a new field named after its first word.  All lines
     * except the ORIGIN section and the terminator are also collected verbatim
     * and stored under "originalgb" so an unmodified entry can be written back
     * as it was read.
     * <p>
     * NOTE(review): continuation lines are joined to the field value without a
     * separator, as in the original implementation; confirm whether consumers
     * of multi-line fields expect a space between the pieces.
     *
     * @param data the buffered reader to parse
     * @return the translated sequences
     * @throws IOException any exceptions that occur while reading the stream are passed
     */
    public Sequence[] translateFrom (java.io.BufferedReader data) throws
            IOException {
        String name = "";     // keyword of the field being collected ("" when none)
        String value = "";    // text following the keyword on the current keyword line
        String line;
        StringBuilder valueBuffer = new StringBuilder();
        // kept as a StringBuffer because it is stored as-is under "originalgb"
        StringBuffer original = new StringBuffer();
        LinkedList<Sequence> result = new LinkedList<Sequence>();
        Hashtable<String, Object> sequence = new Hashtable<String, Object>();

        while ((line = data.readLine()) != null) {
            int index = separatorIndex(line);
            boolean terminator = false;

            if (index == 0 && ! "".equals(name)) {
                // continuation of the current field
                valueBuffer.append(line.trim());
            } else if ( ! "".equals(line) && index != 0) {

                // a new keyword line closes whatever field was being collected
                if ( ! "".equals(name)) {
                    value = valueBuffer.toString();
                    if ("sequence".equals(name)) {
                        // drop the position numbers and the spacing of the ORIGIN section
                        value = value.replaceAll("[\\d\\s]", "");
                    }
                    sequence.put(name, value);
                    name = "";
                }

                if ("//".equals(line.trim())) {
                    terminator = true;
                    storeEntry(sequence, original, result);
                    original = new StringBuffer();
                } else {
                    valueBuffer = new StringBuilder();
                    if (index > 0) {
                        name = line.substring(0, index).trim().toLowerCase(Locale.ENGLISH);
                        value = line.substring(index + 1).trim();
                        valueBuffer.append(value);
                    } else {
                        // keyword only: make sure no text from an earlier line is reused
                        name = line.trim().toLowerCase(Locale.ENGLISH);
                        value = "";
                    }

                    if ("locus".equals(name)) {
                        if ( ! sequence.isEmpty()) {
                            // a new entry started before the previous one was terminated
                            storeEntry(sequence, original, result);
                            original = new StringBuffer();
                            BLMain.warning("Genbank entry missing // terminator", WARNING_SOURCE);
                        }
                        parseLocusLine(value, sequence);
                        name = "";
                    } else if ("version".equals(name) && value.contains("GI:")) {
                        sequence.put("GI",
                                     value.substring(value.indexOf("GI:") + 3).trim());
                    } else if ("origin".equals(name)) {
                        // everything up to the next keyword line is sequence data
                        name = "sequence";
                        valueBuffer = new StringBuilder();
                    }
                }
            }

            // The terminator belongs to the entry that was just stored, so it must
            // not leak into the header text of the next entry; the ORIGIN section
            // is excluded because translateTo regenerates it.
            if ( ! terminator && ! "sequence".equals(name)) {
                original.append(line).append("\n");
            }
        }

        if ( ! sequence.isEmpty()) {
            storeEntry(sequence, original, result);
            BLMain.warning("Genbank entry missing // terminator", WARNING_SOURCE);
        }
        return result.toArray(new Sequence[0]);
    }

    /**
     * Determines whether or not a specified file is of type GenBank file (based on extension).
     * Currently the only extensions supported are ".gen", ".gp", and ".gb".
     * Directories are always accepted.
     *
     * @param file the file to test
     * @return true if the file is of type GenBank file (otherwise false)
     * @see javax.swing.filechooser.FileFilter#accept
     */
    public boolean accept (File file) {
        if (file.isDirectory()) {
            return true;
        }
        String path = file.getAbsolutePath().toLowerCase(Locale.ENGLISH);
        return path.endsWith(".gen") || path.endsWith(".gp") || path.endsWith(".gb");
    }

    /**
     * Obtains the internal name of the file format
     * (this is great for hashtables storage and searching).
     *
     * @return the name of the file format
     */
    @Override
    public String getName () {
        return "genbank";
    }

    /**
     * Returns a description of the file format that can be displayed to the user.
     *
     * @return the string description of the file format
     * @see javax.swing.filechooser.FileFilter#getDescription
     */
    public String getDescription () {
        return "GenBank file (*.gb,*.gp,*.gen)";
    }

    /**
     * Used to auto-detect Bio Legato formats.
     * <p>
     * Skips leading whitespace and reports whether the first word starts with
     * "LOCUS" (in any letter case).  The reader is then reset to the position
     * of the first non-whitespace character, so the reader must support
     * mark/reset.  Any exception raised while probing is printed and the
     * result determined so far is returned.
     *
     * @param test the reader to parse data from
     * @return whether or not the format is correct
     */
    @Override
    public boolean isFormat (Reader test) {
        int check = ' ';
        boolean result = false;
        try {
            // re-mark before every read so the mark ends up just before the
            // first non-whitespace character, however much whitespace precedes it
            while (check == ' ' || check == '\t' || check == '\n' || check == '\r') {
                test.mark(10);
                check = test.read();
            }

            int matched = 0;
            while (matched < LOCUS_KEYWORD.length()
                    && matchesIgnoreCase(check, LOCUS_KEYWORD.charAt(matched))) {
                matched++;
                if (matched < LOCUS_KEYWORD.length()) {
                    check = test.read();
                }
            }
            result = (matched == LOCUS_KEYWORD.length());
            test.reset();
        } catch (Exception e) {
            // best effort: a reader that cannot be probed is reported, not fatal
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Appends a freshly generated GenBank header (LOCUS, ACCESSION,
     * DESCRIPTION and VERSION lines) for a sequence.
     *
     * @param out the buffer to append to
     * @param seq the sequence providing the header fields
     * @param length the length of the sequence data
     */
    private static void appendHeader (StringBuilder out, Sequence seq, int length) {
        Object type = seq.get("type");
        Object strandedness = seq.get("strandedness");
        String accession = fieldText(seq, "accession");
        String description = fieldText(seq, "description");
        String gi = fieldText(seq, "GI");
        String lengthField = "           " + length;

        // the name is padded/truncated to 16 characters, the length right justified in 11
        out.append("LOCUS       ");
        out.append((fieldText(seq, "name") + "                ").substring(0, 16));
        out.append(" ");
        out.append(lengthField.substring(Math.max(0, lengthField.length() - 11)));

        if ( ! Type.PROTEIN.equals(type)) {
            boolean rna = Type.RNA.equals(type);
            out.append(" bp ");
            if (strandedness instanceof Strandedness) {
                out.append(getGBStrandedness((Strandedness) strandedness)).append("-");
                out.append(rna ? "RNA     " : "DNA     ");
            } else {
                out.append(rna ? "RNA        " : "DNA        ");
            }
        } else {
            out.append(" aa            ");
        }

        out.append(Topology.CIRCULAR.equals(seq.get("topology")) ? "circular " : "linear   ");

        if (seq.containsKey("classification")) {
            out.append(seq.get("classification")).append(" ");
        } else {
            out.append("CON "); // division code + space
        }

        if (seq.containsKey("creation-date")) {
            out.append(seq.get("creation-date"));
        } else {
            // English month abbreviations regardless of the default locale
            out.append(new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH)
                    .format(new Date()).toUpperCase(Locale.ENGLISH));
        }
        out.append("\n");

        if ( ! "".equals(accession)) {
            out.append("ACCESSION   ").append(accession).append("\n");
        }
        if ( ! "".equals(description)) {
            out.append("DESCRIPTION ").append(description).append("\n");
        }
        if ( ! "".equals(gi)) {
            out.append("VERSION     ");
            out.append( ! "".equals(accession) ? accession + ".1" : "A00001.1");
            out.append(" GI:").append(gi).append("\n");
        }
    }

    /**
     * Appends the ORIGIN section: 60 residues per line in groups of 10, each
     * line prefixed with the right justified position of its first residue,
     * followed by the "//" entry terminator.
     *
     * @param out the buffer to append to
     * @param sequence the sequence data to write
     */
    private static void appendOrigin (StringBuilder out, String sequence) {
        int total = sequence.length();

        out.append("ORIGIN");
        for (int lineStart = 0; lineStart < total; lineStart += 60) {
            int lineEnd = Math.min(total, lineStart + 60);

            out.append("\n").append(GBRightJustify("" + (lineStart + 1)));
            for (int group = lineStart; group < lineEnd; group += 10) {
                out.append(" ").append(sequence.substring(group, Math.min(total, group + 10)));
            }
        }
        out.append("\n//\n");
    }

    /**
     * Reads a field of a sequence as text.
     *
     * @param seq the sequence to read from
     * @param key the name of the field
     * @return the field's string form, or "" when the field is null
     */
    private static String fieldText (Sequence seq, String key) {
        Object fieldValue = seq.get(key);
        return (fieldValue != null ? fieldValue.toString() : "");
    }

    /**
     * Finds the position of the first space or tab character in a line.
     *
     * @param line the line to search
     * @return the index of the first space/tab, or -1 if the line has neither
     */
    private static int separatorIndex (String line) {
        int spaceIndex = line.indexOf(' ');
        int tabIndex = line.indexOf('\t');

        if (spaceIndex < 0 || tabIndex < 0) {
            // at most one of them was found: take it (or -1 when neither was)
            return Math.max(spaceIndex, tabIndex);
        }
        return Math.min(spaceIndex, tabIndex);
    }

    /**
     * Completes the entry collected so far: attaches its original header text,
     * adds it to the list of parsed entries and empties the field table for
     * the next entry.
     *
     * @param fields the fields of the entry (cleared on return)
     * @param original the verbatim header text of the entry
     * @param entries the list receiving the new sequence
     */
    private static void storeEntry (Hashtable<String, Object> fields,
                                    StringBuffer original,
                                    LinkedList<Sequence> entries) {
        fields.put("originalgb", original);
        entries.add(new Sequence(fields));
        fields.clear();
    }

    /**
     * Extracts name, type, strandedness, topology, classification and creation
     * date from the text following the LOCUS keyword.
     * <p>
     * NOTE: any tokens following the type SEEM to be optional, do not rely on
     * them being in the files read in.  A token is therefore only consumed as
     * the topology when it actually reads "circular" or "linear"; otherwise the
     * topology defaults to linear and the token is taken as the classification.
     *
     * @param value the LOCUS line without its keyword
     * @param fields the field table to store the parsed values in
     */
    private static void parseLocusLine (String value, Hashtable<String, Object> fields) {
        StringTokenizer tokens = new StringTokenizer(value);
        String token;

        if (tokens.hasMoreTokens()) {
            fields.put("name", tokens.nextToken());
        }
        if (tokens.hasMoreTokens()) {
            tokens.nextToken();    // this is the length (ignorable)
        }
        if (tokens.hasMoreTokens()) {
            token = tokens.nextToken();    // this should be bp or aa
            if ("aa".equals(token)) {
                // translateTo writes the topology directly after "aa",
                // so there is no molecule token to skip here
                fields.put("type", Type.PROTEIN);
            } else if ("bp".equals(token) && tokens.hasMoreTokens()) {
                token = tokens.nextToken();
                int dash = token.indexOf("-");
                if (dash >= 0) {
                    fields.put("strandedness", toStrandedness(token.substring(0, dash)));
                    fields.put("type", toType(token.substring(dash + 1)));
                } else {
                    fields.put("type", toType(token));
                }
            }
        }

        token = (tokens.hasMoreTokens() ? tokens.nextToken() : null);
        if ("circular".equalsIgnoreCase(token)) {
            fields.put("topology", Topology.CIRCULAR);
            token = (tokens.hasMoreTokens() ? tokens.nextToken() : null);
        } else {
            fields.put("topology", Topology.LINEAR);
            if ("linear".equalsIgnoreCase(token)) {
                token = (tokens.hasMoreTokens() ? tokens.nextToken() : null);
            }
        }

        if (token != null) {
            fields.put("classification", token);
            if (tokens.hasMoreTokens()) {
                fields.put("creation-date", tokens.nextToken());
            }
        }
    }

    /**
     * Tests a character read from a stream against a lower case letter,
     * ignoring letter case.
     *
     * @param read the value returned by Reader.read() (-1 at end of stream)
     * @param lower the lower case letter to compare with
     * @return true if the character read is the letter in either case
     */
    private static boolean matchesIgnoreCase (int read, char lower) {
        return read == lower || read == Character.toUpperCase(lower);
    }

    /**
     * Converts a sequence's strandedness enum to GenBank's format
     *
     * @param value the strandedness enum to convert
     * @return the GenBank equivalent ("ds", "ms", or "ss" for anything else)
     */
    private static String getGBStrandedness (Strandedness value) {
        switch (value) {
            case DOUBLE:
                return "ds";
            case MIXED:
                return "ms";
            default:
                return "ss";
        }
    }

    /**
     * Used to convert GB's strandedness to BioLegato's strandedness structure
     *
     * @param string the string to convert.
     * @return the strandedness corresponding to the string parameter
     *         (mixed when the string is neither "ss" nor "ds")
     */
    private static Strandedness toStrandedness (String string) {
        if ("ss".equalsIgnoreCase(string)) {
            return Strandedness.SINGLE;
        }
        if ("ds".equalsIgnoreCase(string)) {
            return Strandedness.DOUBLE;
        }
        return Strandedness.MIXED;
    }

    /**
     * Used to convert GB's sequence type to BioLegato's type structure
     *
     * @param string the string to convert.
     * @return RNA if the string contains "rna" (in any letter case), otherwise DNA
     */
    private static Type toType (String string) {
        return (string.toLowerCase(Locale.ENGLISH).contains("rna") ? Type.RNA : Type.DNA);
    }

    /**
     * Used to right justify the numbers for GenBank Sequences
     * (pads with spaces on the left to a width of 9 characters).
     *
     * @param string the string to right justify.
     * @return the right justified string (unchanged if already 9 or more characters).
     */
    private static String GBRightJustify (String string) {
        StringBuilder justified = new StringBuilder();
        for (int pad = string.length(); pad < 9; pad++) {
            justified.append(' ');
        }
        return justified.append(string).toString();
    }

}
