package org.biolegato.gdesupport.files;
/*
 * GenBankFile.java
 *
 * Created on January 30, 2008, 11:58 AM
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.biolegato.gdesupport.data.Dataset;
import org.biolegato.gdesupport.data.Seq;
import org.biolegato.gdesupport.data.Seq.Direction;
import org.biolegato.gdesupport.data.Seq.Strandedness;
import org.biolegato.gdesupport.data.Seq.Topology;
import org.biolegato.gdesupport.data.Seq.Type;
import org.biolegato.main.BLMain;

/**
 * This class acts as a parser/translator for GenBank files (using the file specification current to 2008).
 *
 * @author Graham Alvare
 * @author Brian Fristensky
 */
public class GenBankFile2008 extends DataFormat {

    /** Run of blanks used as the padding source when justifying header and ORIGIN fields. */
    private static final String SPACE_FILL = "                                 ";

    /** Initial sequence-buffer capacity used when the LOCUS line supplied no usable length. */
    public static final int AVG_SEQ_LENGTH = 2000;

    /** Width of the left-justified locus-name field; longer names are truncated to this width. */
    private static final int NAME_WIDTH = 16;

    /** Width of the right-justified sequence-length field on the LOCUS line. */
    private static final int LENGTH_WIDTH = 11;

    /** Width of the right-justified position field that starts every ORIGIN line. */
    private static final int POSITION_WIDTH = 9;

    /** Number of residues written on one ORIGIN line. */
    private static final int RESIDUES_PER_LINE = 60;

    /** Number of residues in each blank-separated group of an ORIGIN line. */
    private static final int RESIDUES_PER_GROUP = 10;

    /** Matches the digits and whitespace that are stripped from ORIGIN lines when reading. */
    private static final Pattern NON_RESIDUE = Pattern.compile("[\\d\\s]");

    /**
     * Creates a new instance of GenBankFile2008
     */
    public GenBankFile2008() {
    }

    /**
     * Writes a sequence (or a sub-range of it) in the GenBank file format.
     * <p>
     * Nothing is written when {@code seq} is null or when {@code offset} or
     * {@code length} is negative.  The header text stored with the sequence
     * ({@code seq.getOriginal()}) is reused only when it exists, {@code offset}
     * is 0 and {@code length} equals the sequence length; otherwise a fresh
     * LOCUS line (and DESCRIPTION line, if any) is generated.  The requested
     * range is clipped to the end of the sequence, and the LOCUS line reports
     * the number of residues actually written.
     *
     * @param result the destination to append the GenBank text to
     * @param seq the sequence to translate
     * @param offset zero-based index of the first residue to write
     * @param length the number of residues requested
     * @throws IOException if appending to {@code result} fails
     */
    public void translateTo(Appendable result, Seq seq, int offset, int length) throws IOException {
        if (seq == null || offset < 0 || length < 0) {
            return;
        }

        final StringBuffer sequence = seq.getSequence();
        // long arithmetic: offset + length may not fit in an int
        final int end = (int) Math.min((long) offset + length, (long) sequence.length());
        final int written = Math.max(0, end - offset);

        if (seq.getOriginal() == null || offset != 0 || length != sequence.length()) {
            writeHeader(result, seq, written);
        } else {
            result.append(seq.getOriginal());
        }
        result.append("ORIGIN");

        for (int lineStart = offset; lineStart < end; lineStart += RESIDUES_PER_LINE) {
            result.append("\n");
            // each line starts with the 1-based position of its first residue
            appendRightAligned(result, lineStart + 1, POSITION_WIDTH);

            final int lineEnd = Math.min(end, lineStart + RESIDUES_PER_LINE);
            for (int groupStart = lineStart; groupStart < lineEnd;
                    groupStart += RESIDUES_PER_GROUP) {
                result.append(" ").append(sequence, groupStart,
                        Math.min(lineEnd, groupStart + RESIDUES_PER_GROUP));
            }
        }
        result.append("\n//\n");
    }

    /**
     * Writes a generated LOCUS line, followed by a DESCRIPTION line when the
     * sequence has a non-empty description.
     *
     * @param result the destination to append to
     * @param seq the sequence whose metadata is written
     * @param residueCount the sequence length to report on the LOCUS line
     * @throws IOException if appending to {@code result} fails
     */
    private static void writeHeader(Appendable result, Seq seq, int residueCount)
            throws IOException {
        result.append("LOCUS       ");

        // a missing name is written as an empty (fully padded) name field
        final CharSequence name = (seq.getName() == null ? "" : seq.getName());
        if (name.length() < NAME_WIDTH) {
            result.append(name).append(SPACE_FILL, 0, NAME_WIDTH - name.length());
        } else {
            result.append(name, 0, NAME_WIDTH);
        }

        result.append(" ");
        appendRightAligned(result, residueCount, LENGTH_WIDTH);

        final boolean rna = Type.RNA.equals(seq.getType());
        if (!Type.PROTEIN.equals(seq.getType())) {
            result.append(" bp ");
            if (seq.getStrandedness() != null) {
                result.append(getGBStrandedness(seq.getStrandedness())).append("-");
                result.append(rna ? "RNA     " : "DNA     ");
            } else {
                result.append(rna ? "RNA        " : "DNA        ");
            }
        } else {
            result.append(" aa            ");
        }
        result.append(!Topology.CIRCULAR.equals(seq.getTopology()) ? "linear   " : "circular ");

        // TODO: reimplement classification reading abilities for BioLegato
        result.append("CON "); // division code + space

        // fixed English locale so the month abbreviation does not follow the system language
        result.append(new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH)
                .format(new Date()).toUpperCase(Locale.ENGLISH));
        result.append("\n");

        // NOTE(review): the keyword written here is "DESCRIPTION"; translateFrom does
        // not parse it back -- confirm this is the keyword downstream readers expect.
        final CharSequence description = seq.getDescription();
        if (description != null && description.length() > 0) {
            result.append("DESCRIPTION ").append(description).append("\n");
        }
    }

    /**
     * Appends a number right-justified in a field of the given width; numbers
     * wider than the field are written in full without padding.
     *
     * @param result the destination to append to
     * @param value the number to write
     * @param width the field width (must not exceed the length of SPACE_FILL)
     * @throws IOException if appending to {@code result} fails
     */
    private static void appendRightAligned(Appendable result, int value, int width)
            throws IOException {
        final String digits = Integer.toString(value);
        if (digits.length() < width) {
            result.append(SPACE_FILL, 0, width - digits.length());
        }
        result.append(digits);
    }

    /**
     * Reads GenBank formatted data and adds one sequence to the dataset for
     * every ORIGIN section encountered.
     * <p>
     * Every line before ORIGIN is kept verbatim as the sequence's original
     * header.  The LOCUS line supplies the name, length (used only to size the
     * sequence buffer), type, strandedness and topology.  A record ends at the
     * "//" terminator line or at the end of the stream; header lines that are
     * never followed by an ORIGIN line do not produce a sequence.
     *
     * @param datamodel the dataset to add the parsed sequences to (appended after its current content)
     * @param data the buffered reader to parse
     * @throws IOException any exeptions that occur while reading the stream are passed
     */
    public void translateFrom(Dataset datamodel, java.io.BufferedReader data) throws
            IOException {
        int dashindex;
        int length = 0;
        int y = datamodel.getSize();
        String line;
        String temp;
        String name = "";
        Type type = Type.DNA;
        Direction direction = Direction.FROM5TO3;
        Strandedness strandedness = Strandedness.SINGLE;
        Topology topology = Topology.LINEAR;
        StringBuilder original = new StringBuilder();

        while ((line = data.readLine()) != null) {
            if (line.regionMatches(true, 0, "ORIGIN", 0, 6)) {
                StringBuffer sequencebuffer =
                        new StringBuffer(length > 0 ? length : AVG_SEQ_LENGTH);

                // trim() so that a terminator followed by stray blanks still ends the record
                while ((line = data.readLine()) != null && !"//".equals(line.trim())) {
                    sequencebuffer.append(NON_RESIDUE.matcher(line).replaceAll(""));
                }
                datamodel.addSequence(y, new Seq(type, name, sequencebuffer,
                        direction, topology, strandedness, original));
                y++;

                // start the next record from the defaults
                length = 0;
                type = Type.DNA;
                name = "";
                direction = Direction.FROM5TO3;
                strandedness = Strandedness.SINGLE;
                topology = Topology.LINEAR;
                original = new StringBuilder();
            } else {
                original.append(line).append("\n");
                if (line.length() > 5 && line.regionMatches(true, 0, "LOCUS", 0, 5)) {
                    StringTokenizer tokenz = new StringTokenizer(line.substring(6).trim());
                    String topologyToken = null;

                    if (tokenz.hasMoreTokens()) {
                        name = tokenz.nextToken();
                    }
                    if (tokenz.hasMoreTokens()) {
                        temp = tokenz.nextToken();    // this is the length (ignorable)
                        if (BLMain.testNumber(temp)) {
                            try {
                                length = Integer.parseInt(temp);
                            } catch (NumberFormatException nfe) {
                                BLMain.error("Invalid sequence length", "GenBank file parser");
                            }
                        }
                    }
                    if (tokenz.hasMoreTokens()) {
                        temp = tokenz.nextToken();    // this should be bp or aa
                        if ("aa".equalsIgnoreCase(temp)) {
                            type = Type.PROTEIN;
                        } else if ("bp".equalsIgnoreCase(temp) && tokenz.hasMoreTokens()) {
                            temp = tokenz.nextToken();
                            if ("circular".equalsIgnoreCase(temp)
                                    || "linear".equalsIgnoreCase(temp)) {
                                // molecule type omitted: this token is already the topology
                                topologyToken = temp;
                            } else {
                                dashindex = temp.indexOf('-');
                                if (dashindex >= 0) {
                                    strandedness = toStrandedness(temp.substring(0, dashindex));
                                    type = toType(temp.substring(dashindex + 1));
                                } else {
                                    type = toType(temp);
                                }
                            }
                        }
                    }

                    // NOTE: any tokens following the type SEEM to be optional,
                    // do not rely on them being in the files read in
                    if (topologyToken == null && tokenz.hasMoreTokens()) {
                        topologyToken = tokenz.nextToken();
                    }
                    if (topologyToken != null) {
                        // TODO: reimplement classifications
                        // equalsIgnoreCase avoids default-locale lower-casing problems
                        topology = ("circular".equalsIgnoreCase(topologyToken)
                                ? Seq.Topology.CIRCULAR : Seq.Topology.LINEAR);
                    }
                }
            }
        }
    }

    /**
     * Determines whether a specified file is of type GenBank file (based on extension).
     * Currently the only extensions supported are ".gen", ".gp", and ".gb".
     * Directories are always accepted.
     *
     * @param file the file to test
     * @return true if the file is of type GenBank file (otherwise false)
     * @see javax.swing.filechooser.FileFilter#accept
     */
    public boolean accept(File file) {
        if (file.isDirectory()) {
            return true;
        }
        final String path = file.getAbsolutePath().toLowerCase(Locale.ROOT);
        return path.endsWith(".gen") || path.endsWith(".gp") || path.endsWith(".gb");
    }

    /**
     * Returns a description of the file format that can be displayed to the user.
     *
     * @return the string description of the file format
     * @see javax.swing.filechooser.FileFilter#getDescription
     */
    public String getDescription() {
        return "GenBank file (*.gb,*.gp,*.gen)";
    }

    /**
     * Converts a sequence's strandedness enum to GenBank's format
     *
     * @param value the strandedness enum to convert (must not be null)
     * @return "ds" for DOUBLE, "ms" for MIXED and "ss" for every other value
     */
    private static String getGBStrandedness(Strandedness value) {
        switch (value) {
            case DOUBLE:
                return "ds";
            case MIXED:
                return "ms";
            default:
                return "ss";
        }
    }

    /**
     * Used to convert GB's strandedness to BioLegato's strandedness structure
     *
     * @param test the string to convert (compared case-insensitively).
     * @return SINGLE for "ss", DOUBLE for "ds" and MIXED for anything else
     */
    private static Strandedness toStrandedness(String test) {
        if ("ss".equalsIgnoreCase(test)) {
            return Strandedness.SINGLE;
        }
        if ("ds".equalsIgnoreCase(test)) {
            return Strandedness.DOUBLE;
        }
        return Strandedness.MIXED;
    }

    /**
     * Used to convert GB's sequence type to BioLegato's type structure
     *
     * @param string the string to convert.
     * @return RNA when the string contains "rna" in any letter case, otherwise DNA
     */
    private static Type toType(String string) {
        return (string.toLowerCase(Locale.ROOT).contains("rna") ? Type.RNA : Type.DNA);
    }

    /**
     * Used to auto-detect Bio Legato formats
     * <p>
     * Skips leading whitespace and then checks, case-insensitively, whether
     * the stream continues with "LOCUS".  The reader is reset to the first
     * non-whitespace character afterwards, so the reader must support
     * mark/reset; if it does not (or any other I/O error occurs) the error is
     * printed to standard error and the value determined so far is returned.
     *
     * @param test the reader to parse data from
     * @return whether the format is correct
     */
    @Override
    public boolean isFormat(Reader test) {
        final String upper = "LOCUS";
        final String lower = "locus";
        boolean result = false;

        if (test == null) {
            return false;
        }
        try {
            int check = ' ';
            while (check == ' ' || check == '\t' || check == '\n' || check == '\r') {
                // re-mark before every read so the mark ends up on the first
                // non-whitespace character
                test.mark(10);
                check = test.read();
            }

            // compare one character at a time and stop reading at the first mismatch
            int matched = 0;
            while (matched < upper.length()
                    && (check == upper.charAt(matched) || check == lower.charAt(matched))) {
                matched++;
                if (matched < upper.length()) {
                    check = test.read();
                }
            }
            result = (matched == upper.length());

            test.reset();
        } catch (IOException e) {
            e.printStackTrace(System.err);
        }
        return result;
    }
}
