/**++
 *   
 * 2006-09-15 Christian Quast (CQ) - adaptation to the new EMBL (r87+) ID line format
 *
 *
 *   LICENSE
 *   -------
 *   
 *   Copyright (c) 2004 Renato Mancuso
 *   All rights reserved.
 *   
 *   Redistribution and use in source and binary forms, with or without modification, are 
 *   permitted provided that the following conditions are met:
 *   
 *   - Redistributions of source code must retain the above copyright notice, this list 
 *     of conditions and the following disclaimer.
 *   
 *   - Redistributions in binary form must reproduce the above copyright notice, this list
 *     of conditions and the following disclaimer in the documentation and/or other materials 
 *     provided with the distribution.
 *   
 *   - Neither the name of Renato Mancuso nor the names of its contributors may be used to 
 *     endorse or promote products derived from this software without specific prior written 
 *     permission.
 *   
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS 
 *   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
 *   AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 
 *   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
 *   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 
 *   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 
 *   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *   
--**/


#include "IDLineParser.hpp"
#include "ParserErrors.hpp"
#include "ParserWarnings.hpp"
#include "StringTokenizer.hpp"
#include "ListReader.hpp"
#include "StringUtils.hpp"
#include "Regex.hpp"

#include <cstdlib>
#include <cassert>
#include <iostream>

#include <algorithm>

namespace OpenEMBL
{
namespace Phoenix
{

/**
 * Constructs the ID line parser.
 *
 * Both arguments are passed unchanged to the ItemParserImpl<IDLine> base
 * class; the constructor body itself does nothing.
 *
 * @param pParserCtx    parser context forwarded to the base class
 * @param pLineHandler  IDLine item handler forwarded to the base class
 */
IDLineParser::IDLineParser(IParserCtx* pParserCtx, IItemHandler<IDLine>* pLineHandler)
    : ItemParserImpl<IDLine>(pParserCtx, pLineHandler)
{
}

/**
 * Parses the ID line at the parser's current position.
 *
 * If there is no current line, ERROR_EOF_FOUND is logged against the current
 * line number and nothing else is done. Otherwise the line text is copied,
 * the input is advanced with nextLine(), and the copy is handed to
 * processLine() together with the number of the line it was read from.
 */
METHODIMP IDLineParser::parse()
{
    const long lineNo = currentLineNumber();

    if (NULL != currentLine()) {
        // Take a copy of the text before moving on to the next line.
        const std::string lineText = currentLine();
        nextLine();

        processLine(lineText, lineNo);
    } else {
        logError(lineNo,
                 ERROR_EOF_FOUND,
                 NULL );
    }
}

void IDLineParser::processLine(const std::string & theLine, long theLineNumber)
{
    // EMBL ID line format (before release 97) used to be:
    // 
    //      ID <EntryName> <DataClass>; <Moltype>; <Division>; <SequenceLength> BP. 
    // 
    //      ex: ID   AA09359    standard; unassigned DNA; INV; 306 BP.
    //
    // ---------------------------------------------------------------
    // ID line format as of EMBL release 87 (June 2006) changes to:
    // 
    //      ID <PrimaryAccession>; SV <Version>; <Topology>; <MolType>; <DataClass>; <Division> <Length> BP.
    // 
    //      ex: ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
    //
    ListReader      theTokenizer(this, ';', '.');
    StringList      theTokens;

    theTokenizer.readList(theLineNumber, trimRight(theLine), theTokens);

    //
    // The order of functions to call is important because the 'old' function getMoltype 
    // is also setting the topology member (Circular). Therefore, it must be called 
    // before calling getTopology otherwise it overwrites Circular (always setting 
    // it to false).
    // 

    // *** added by CQ -- adaptation to the new EMBL ID line format -- 20060915
    if (theTokens.size() == 4) {
        IDLine theField;

        if (!getEntrynameDataclass(theTokens[0], theLineNumber, theField))
            return;
        if (!getMoltype(theTokens[1], theLineNumber, theField))
            return;
        if (!getDivision(theTokens[2], theLineNumber, theField))
            return;
        if (!getSequenceLength(theTokens[3], theLineNumber, theField))
            return;

        notifyParsed(theLineNumber, theField);
    } else if (theTokens.size() == 7) {
        IDLine theField;

        if (!getEntryName(theTokens[0], theLineNumber, theField))
            return;
        if (!getSequenceVersion(theTokens[1], theLineNumber, theField))
            return;
        if (!getMoltype(theTokens[3], theLineNumber, theField))
            return;
        if (!getTopology(theTokens[2], theLineNumber, theField))
            return;
        if (!getDataClass(theTokens[4], theLineNumber, theField))
            return;
        if (!getDivision(theTokens[5], theLineNumber, theField))
            return;
        if (!getSequenceLength(theTokens[6], theLineNumber, theField))
            return;

        notifyParsed(theLineNumber, theField);
    } else {
        logError(theLineNumber,
                 ERROR_INVALID_ID_LINE,
                 theLine.c_str());
        return;
    }
}

/**
 * Extracts entry name and data class from the first item of an old-layout
 * ID line.
 *
 * The item is split on blanks and tabs and must consist of exactly three
 * words: the literal "ID", the entry name and the data class.
 *
 * @param theToken       first item of the ID line
 * @param theLineNumber  line number used for error reporting
 * @param theField       receives EntryName and DataClass on success
 * @return true on success; false after logging ERROR_INVALID_ID_LINE
 */
bool IDLineParser::getEntrynameDataclass(String const & theToken, long theLineNumber, IDLine & theField)
{
    StringTokenizer theSplitter(" \t");
    StringList      theWords;

    theSplitter.tokenize(theToken, theWords);

    // Work out what (if anything) is wrong, then report it in one place.
    const char * theProblem = NULL;

    if (theWords.size() != 3) {
        theProblem = "missing entryname or dataclass";
    } else if (theWords[0] != "ID") {
        // Wrong line code: report the offending item itself.
        theProblem = theToken.c_str();
    }

    if (NULL != theProblem) {
        logError(theLineNumber,
                 ERROR_INVALID_ID_LINE,
                 theProblem);
        return false;
    }

    theField.EntryName = theWords[1];
    theField.DataClass = theWords[2];

    return true;
}

/**
 * Extracts the molecule type (and, for the old layout, the "circular"
 * prefix) from an ID line item.
 *
 * Note that this function always assigns theField.Circular, so in the new
 * layout it must be called before getTopology().
 *
 * @param theToken       item holding the molecule type
 * @param theLineNumber  line number used for error reporting
 * @param theField       receives Circular and MolType on success
 * @return true on success; false after logging ERROR_INVALID_ID_LINE when
 *         the molecule type is missing
 */
bool IDLineParser::getMoltype(String const & theToken, long theLineNumber, IDLine & theField)
{
    //
    //  Regex is: ^(circular\s+){0,1}(.*)
    //
    //  Group 1 has to be a capturing group: its 'matched' flag is what tells
    //  whether the optional "circular" prefix was present.
    //
    static Regex theRegex("^(circular\\s+){0,1}(.*)", PCRE_CASELESS );

    Match theMatches;
    std::string   theSubField = trim(theToken);

    if (!regexSearch(theSubField, theMatches, theRegex)) {
        logError(theLineNumber,
                 ERROR_INVALID_ID_LINE,
                 "missing moltype");
        return false;
    }

    // "(.*)" also matches the empty string, so a successful search does not
    // prove that a molecule type is present; check for it explicitly.
    const String theMolType = trim(theMatches.str(2));

    if (theMolType.empty()) {
        logError(theLineNumber,
                 ERROR_INVALID_ID_LINE,
                 "missing moltype");
        return false;
    }

    theField.Circular = theMatches[1].matched;
    theField.MolType  = theMolType;

    return true;
}

/**
 * Stores the division item of an ID line.
 *
 * The item is stored with surrounding whitespace trimmed; no validation is
 * performed, so this function never fails.
 *
 * @param theToken  item holding the division
 * @param theField  receives Division
 * @return always true
 */
bool IDLineParser::getDivision(String const & theToken, long /*theLineNumber*/, IDLine & theField)
{
    const String theDivision = trim(theToken);

    theField.Division = theDivision;
    return true;
}

/**
 * Extracts the sequence length from the last item of an ID line.
 *
 * After trimming, the item must start with a run of digits followed by
 * whitespace and "BP" (matched case-insensitively).
 *
 * @param theToken       item holding "<length> BP"
 * @param theLineNumber  line number used for error reporting
 * @param theField       receives SequenceLength on success
 * @return true on success; false after logging ERROR_INVALID_ID_LINE
 */
bool IDLineParser::getSequenceLength(String const & theToken, long theLineNumber, IDLine & theField)
{
    //
    // Pattern: ^(\d+)\s+BP
    //
    static Regex theLengthPattern("^(\\d+)\\s+BP",
                                  PCRE_CASELESS );

    Match         theResult;
    std::string   theTrimmed = trim(theToken);

    if (regexSearch(theTrimmed, theResult, theLengthPattern)) {
        // Group 1 holds the digits of the length.
        theField.SequenceLength = atoi(theResult.str(1).c_str());
        return true;
    }

    logError(theLineNumber,
             ERROR_INVALID_ID_LINE,
             "missing sequence length");
    return false;
}


/**
 * Extracts the entry name from the first item of a new-layout ID line.
 *
 * The item is split on blanks and tabs and must consist of exactly two
 * words: the literal "ID" followed by the entry name.
 *
 * @param theToken       first item of the ID line
 * @param theLineNumber  line number used for error reporting
 * @param theField       receives EntryName on success
 * @return true on success; false after logging ERROR_INVALID_ID_LINE
 */
bool IDLineParser::getEntryName( String const & theToken, long theLineNumber, IDLine & theField )
{
    // *** added by CQ -- adaptation to the new EMBL ID line format -- 20060915
    StringTokenizer theSplitter(" \t");
    StringList      theWords;

    theSplitter.tokenize(theToken, theWords);

    // Work out what (if anything) is wrong, then report it in one place.
    const char * theProblem = NULL;

    if (theWords.size() != 2) {
        theProblem = "missing entryname";
    } else if (theWords[0] != "ID") {
        // Wrong line code: report the offending item itself.
        theProblem = theToken.c_str();
    }

    if (NULL != theProblem) {
        logError(theLineNumber,
                 ERROR_INVALID_ID_LINE,
                 theProblem);
        return false;
    }

    theField.EntryName = theWords[1];

    return true;
}

/**
 * Stores the data class item of a new-layout ID line.
 *
 * The item is stored with surrounding whitespace trimmed, in line with how
 * the other items (division, topology, moltype, ...) are handled, so that
 * blanks following the ';' separator do not end up in the data class.
 * No validation is performed, so this function never fails.
 *
 * @param theToken  item holding the data class
 * @param theField  receives DataClass
 * @return always true
 */
bool IDLineParser::getDataClass(String const & theToken, long /*theLineNumber*/, IDLine & theField )
{
    theField.DataClass = trim(theToken);
    return true;
}

/**
 * Extracts the sequence version from the "SV <n>" item of a new-layout
 * ID line.
 *
 * After trimming, the item must start with "SV", whitespace and a run of
 * digits. No case-insensitivity flag is passed to this pattern.
 *
 * @param theToken       item holding "SV <version>"
 * @param theLineNumber  line number used for error reporting
 * @param theField       receives SequenceVersion on success
 * @return true on success; false after logging ERROR_INVALID_ID_LINE
 */
bool IDLineParser::getSequenceVersion(String const & theToken, long theLineNumber, IDLine & theField )
{
    // *** added by CQ -- adaptation to the new EMBL ID line format -- 20060915
    // *** modified by RM -- 20060918

    //
    // Pattern: ^SV\s+(\d+)
    //
    static Regex theVersionPattern( "^SV\\s+(\\d+)" );

    Match       theResult;
    std::string theTrimmed = trim( theToken );

    if (regexSearch(theTrimmed, theResult, theVersionPattern)) {
        // Group 1 holds the digits of the version number.
        theField.SequenceVersion = atoi( theResult.str(1).c_str() );
        return true;
    }

    logError(theLineNumber,
             ERROR_INVALID_ID_LINE,
             "missing Sequence Version");
    return false;
}

/**
 * Derives the Circular flag from the topology item of a new-layout ID line.
 *
 * Circular is set to true when the trimmed item equals "circular" ignoring
 * case, and to false for any other text. Nothing is validated or logged, so
 * this function never fails and theLineNumber is not used.
 *
 * @param theToken       item holding the topology
 * @param theLineNumber  unused
 * @param theField       receives Circular
 * @return always true
 */
bool IDLineParser::getTopology(String const & theToken, long theLineNumber, IDLine & theField)
{
    // *** added by CQ -- adaptation to the new EMBL ID line format -- 20060915
    // *** modified by RM -- 20060918
    //
    const String theTopology = trim( theToken );
    const bool   isCircular  = ( strCaseCmp( theTopology.c_str(), "circular" ) == 0 );

    theField.Circular = isCircular;

    return true;
}

}
}
