/**++
 *   
 *   LICENSE
 *   -------
 *   
 *   Copyright (c) 2004 Renato Mancuso
 *   All rights reserved.
 *   
 *   Redistribution and use in source and binary forms, with or without modification, are 
 *   permitted provided that the following conditions are met:
 *   
 *   - Redistributions of source code must retain the above copyright notice, this list 
 *     of conditions and the following disclaimer.
 *   
 *   - Redistributions in binary form must reproduce the above copyright notice, this list
 *     of conditions and the following disclaimer in the documentation and/or other materials 
 *     provided with the distribution.
 *   
 *   - Neither the name of Renato Mancuso nor the names of its contributors may be used to 
 *     endorse or promote products derived from this software without specific prior written 
 *     permission.
 *   
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS 
 *   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
 *   AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 
 *   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
 *   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 
 *   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 
 *   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *   
--**/


#include "ASLineParser.hpp"
#include "ParserErrors.hpp"
#include "ParserWarnings.hpp"
#include "StringUtils.hpp"
#include "StringTokenizer.hpp"
#include "Regex.hpp"

#include <cstdlib>
#include <cstring>
#include <cassert>

namespace OpenEMBL
{
namespace Phoenix
{

    //
    // Constructs an AS line parser.
    //
    // Both arguments are handed over, unchanged, to the ItemParserImpl<ASLine>
    // base class; the constructor itself performs no other work.
    //
    //  pParserCtx    parser context forwarded to the base class
    //  pItemHandler  handler for parsed ASLine items, forwarded to the base class
    //
    ASLineParser::ASLineParser(IParserCtx* pParserCtx, IItemHandler<ASLine>* pItemHandler)
        : ItemParserImpl<ASLine>(pParserCtx, pItemHandler)
    {
    }

    //
    // Parses a run of consecutive AS lines, starting at the current line.
    //
    //  - no current line:            logs ERROR_EOF_FOUND, consumes nothing
    //  - current line is not "AS":   logs ERROR_INVALID_AS_LINE and skips that line
    //  - otherwise:                  every consecutive AS line is passed to
    //                                processLine(); the loop stops at the first
    //                                line that is not an AS line, or when no
    //                                line is left
    //
    METHODIMP ASLineParser::parse()
    {
        long   theStartLineNumber = currentLineNumber();
        LPCSTR theStartLine       = currentLine();

        if (NULL == theStartLine)
        {
            // Nothing left to read.
            logError(theStartLineNumber,
                     ERROR_EOF_FOUND,
                     NULL );
        }
        else if (!isASLine(theStartLine))
        {
            // Wrong line type: report it and step over it.
            logError(theStartLineNumber,
                     ERROR_INVALID_AS_LINE,
                     "line does not start with AS");
            nextLine();
        }
        else
        {
            //
            // The current line is known to be an AS line on entry to each
            // iteration, so it is processed first and tested afterwards.
            //
            do
            {
                processLine(getLineContent(currentLine()), currentLineNumber());
                nextLine();
            }
            while (NULL != currentLine() && isASLine(currentLine()));
        }
    }

    //
    // Tells whether a line begins with the AS line prefix: the two letters
    // "AS" followed by three blanks.
    //
    //  theLine  NUL-terminated line text; must not be NULL (asserted)
    //
    // Returns true when the first five characters match the prefix exactly.
    //
    bool ASLineParser::isASLine(char const * theLine)
    {
        const char thePrefix[] = "AS   ";

        assert(theLine != NULL);

        // sizeof includes the terminating NUL, which is not part of the prefix.
        return strncmp(theLine, thePrefix, sizeof(thePrefix) - 1) == 0;
    }

    //
    // Returns the text of a line with its first five characters removed
    // (the width of the "AS   " prefix tested by isASLine()).
    //
    //  theLine  full line text; must hold at least five characters, otherwise
    //           std::out_of_range is thrown by the string library
    //
    std::string ASLineParser::getLineContent(std::string const & theLine)
    {
        const std::string::size_type thePrefixLength = 5;

        return std::string(theLine, thePrefixLength);
    }

    void ASLineParser::processLine(const std::string & theLine, long theLineNumber)
    {
        //===========================================================================
        //  line format is:
        //
        //  AS   localrange primaryacc [sourcerange] [C]
        //
        //      primaryacc must either be an EMBL/GenBank/DDBJ acc# (sequence version is mandatory) or a
        //      Trace archive ID
        //
        //      sourcerange is mandatory if primaryacc is an EMBL/GenBank/DDBJ acc#, but can be
        //      substituted by the single token 'not_available' if primaryacc is a TRACE archive ID
        //
        //  ex: (output will not be in columns!)
        //
        //                 1         2         3         4         5         6         7         8
        //        12345678901234567890123456789012345678901234567890123456789012345678901234567890
        //          5            21         1          29                 1       21 + 2
        //        <---><------------------->.<--------------------------->.<--------------------->
        //        AH   TPA_SPAN              PRIMARY_IDENTIFIER            PRIMARY_SPAN   COMP
        //        AS   1-34                  AB00123.56                    301-334        C
        //        AS   40-69                 TI12345679                    not_available
        //
        //===========================================================================

        StringTokenizer theTokenizer(" \t");
        StringList      theTokens;

        theTokenizer.tokenize(theLine, theTokens);

        ASLine theField;

        if (!getTPASpan(theTokens, theLineNumber, theField))
            return;

        if (!getPrimaryIdentifier(theTokens, theLineNumber, theField))
            return;

        if (!getPrimarySpan(theTokens, theLineNumber, theField))
            return;

        if (!getComplement(theTokens, theLineNumber, theField))
            return;

        if (theTokens.size() > 4)
        {
            logError(theLineNumber,
                     ERROR_AS_LINE_INVALID_CHARACTERS_AT_END_OF_LINE,
                     theTokens[4].c_str());

            return;
        }

        notifyParsed(theLineNumber, theField);
    }

    //
    // Reads the TPA span, i.e. the first token of the line, into
    // theField.TPASpan.
    //
    //  theTokens      tokens of the line content
    //  theLineNumber  line number used when reporting errors
    //  theField       receives TPASpan.First and TPASpan.Last on success
    //
    // Returns true on success. Returns false after logging
    // ERROR_AS_LINE_MISSING_TPA_SPAN when there is no token at all, or
    // ERROR_AS_LINE_BAD_TPA_SPAN_FORMAT when the token is not of the form
    // <digits>-<digits>.
    //
    bool ASLineParser::getTPASpan(
                StringList const &  theTokens, 
                long                theLineNumber, 
                ASLine &            theField)
    {
        // No tokens at all: there is no span to read.
        if (theTokens.size() < 1)
        {
            logError(theLineNumber, ERROR_AS_LINE_MISSING_TPA_SPAN, NULL);
            return false;
        }

        //
        // Pattern is: ^(\d+)-(\d+)$
        // (function-local static, so it is constructed only once)
        //
        static Regex theSpanPattern("^(\\d+)-(\\d+)$");
        Match        theSpanMatch;

        if (regexSearch(theTokens[0], theSpanMatch, theSpanPattern))
        {
            // Capture group 1 is the start of the span, group 2 its end.
            const long theFirst = atol(theSpanMatch.str(1).c_str());
            const long theLast  = atol(theSpanMatch.str(2).c_str());

            theField.TPASpan.First = theFirst;
            theField.TPASpan.Last  = theLast;

            return true;
        }

        logError(theLineNumber,
                 ERROR_AS_LINE_BAD_TPA_SPAN_FORMAT,
                 theTokens[0].c_str());

        return false;
    }

    //
    // Copies the primary identifier, i.e. the second token of the line, into
    // theField.PrimaryIdentifier. The token is stored as is; its format is
    // not checked here.
    //
    //  theTokens      tokens of the line content
    //  theLineNumber  line number used when reporting errors
    //  theField       receives PrimaryIdentifier on success
    //
    // Returns true on success; returns false after logging
    // ERROR_AS_LINE_MISSING_PRIMARY_IDENTIFIER when fewer than two tokens
    // are present.
    //
    bool ASLineParser::getPrimaryIdentifier(
                StringList const &  theTokens, 
                long                theLineNumber, 
                ASLine &            theField)
    {
        if (theTokens.size() >= 2)
        {
            theField.PrimaryIdentifier = theTokens[1];
            return true;
        }

        logError(theLineNumber, ERROR_AS_LINE_MISSING_PRIMARY_IDENTIFIER, NULL);

        return false;
    }

    //
    // Reads the primary (source) span, i.e. the third token of the line, into
    // theField.PrimarySpan.
    //
    //  theTokens      tokens of the line content
    //  theLineNumber  line number used when reporting errors
    //  theField       receives PrimarySpan.First and PrimarySpan.Last when the
    //                 token is a numeric range
    //
    // Returns true on success. The token 'not_available' (compared with
    // strCaseCmp) is accepted as is and leaves theField.PrimarySpan untouched.
    // Returns false after logging ERROR_AS_LINE_MISSING_PRIMARY_SPAN when fewer
    // than three tokens are present, or ERROR_AS_LINE_BAD_PRIMARY_SPAN_FORMAT
    // when the token is not of the form <digits>-<digits>.
    //
    bool ASLineParser::getPrimarySpan(
                StringList const &  theTokens, 
                long                theLineNumber, 
                ASLine &            theField)
    {
        if (theTokens.size() < 3)
        {
            logError(theLineNumber,
                     ERROR_AS_LINE_MISSING_PRIMARY_SPAN,
                     NULL);

            return false;
        }

        // Placeholder used when no source range exists; nothing to parse.
        if (0 == strCaseCmp("not_available", theTokens[2].c_str()))
            return true;

        //
        // Regex is: ^(\d+)-(\d+)$
        //
        static Regex theRegex("^(\\d+)-(\\d+)$");
        Match       theMatches;

        //
        // The primary span is the THIRD token (index 2). Matching index 0
        // would re-read the TPA span: the TPA values would end up in
        // PrimarySpan and a malformed primary span would go unnoticed.
        //
        if (!regexSearch(theTokens[2], theMatches, theRegex))
        {
            logError(theLineNumber,
                     ERROR_AS_LINE_BAD_PRIMARY_SPAN_FORMAT,
                     theTokens[2].c_str());
            return false;
        }

        // Capture group 1 is the start of the span, group 2 its end.
        theField.PrimarySpan.First = atol(theMatches.str(1).c_str());
        theField.PrimarySpan.Last  = atol(theMatches.str(2).c_str());

        return true;
    }

    //
    // Handles the optional complement marker, i.e. the fourth token of the line.
    //
    //  theTokens      tokens of the line content
    //  theLineNumber  line number used when reporting errors
    //  theField       Complement is set to true when the marker is present;
    //                 it is not written otherwise
    //
    // Returns true when the marker is absent or compares equal to "C" under
    // strCaseCmp; returns false after logging
    // ERROR_AS_LINE_INVALID_COMPLEMENT_FORMAT for any other fourth token.
    //
    bool ASLineParser::getComplement(
                StringList const &  theTokens, 
                long                theLineNumber, 
                ASLine &            theField)
    {
        if (theTokens.size() >= 4)
        {
            if (0 != strCaseCmp("C", theTokens[3].c_str()))
            {
                logError(theLineNumber,
                         ERROR_AS_LINE_INVALID_COMPLEMENT_FORMAT,
                         theTokens[3].c_str());

                return false;
            }

            theField.Complement = true;
        }

        // A missing marker is not an error.
        return true;
    }
    


}
}
