/******************************************************************************
* Module    :   Regular Expression Matching.
*
* Author    :   John W. M. Stevens.
******************************************************************************/

#include    "compiler.h"

#include    "sets.h"
#include    "regexp.h"
#include    "utils.h"

/*  Define sub expression list. */
typedef struct  {
    char        *Start;         /*  Start of sub-string in source.  */
    char        *End;           /*  End of sub-string in source.  */
    char        *SubStr;        /*  Pointer to sub-string.          */
} SUB_EXPR;
static  SUB_EXPR    SubExprs[MAX_SUB_EXPRS + 1];

static  CASE_CMP    CmpCase = CASE_SENSITIVE;

/*-----------------------------------------------------------------------------
| Routine   :   ExtSubStrs() --- Extract the sub-strings from the string to
|               match against.
-----------------------------------------------------------------------------*/

static
void    ExtSubStrs(void)
{
    register    int             i;
    register    int             StrLen;
    auto        char            *Str;
    extern      FILE            *ErrFile;

    /*  Extract all sub strings from the source string. */
    for (i = 0; i <= MAX_SUB_EXPRS; i++)
    {
        /*  Report run time error.  */
        if (SubExprs[i].Start == NULL || SubExprs[i].End == NULL)
            continue;
        else if (SubExprs[i].Start > SubExprs[i].End)
        {
            fprintf(ErrFile,
                    "%s %d : Error - sub expression #%d extraction failed.\n",
                    __FILE__,
                    __LINE__,
                    i);
            exit( 1 );
        }

        /*  Determine length of string. */
        for (StrLen = 0, Str = SubExprs[i].Start;
             Str != SubExprs[i].End;
             StrLen++, Str++
            )
             ;

        /*  Allocate space for the string.  */
        if ((Str = (char *) calloc(1, StrLen + 1)) == NULL)
        {
            fprintf(ErrFile,
                    "%s %d : Out of memory.\n",
                    __FILE__,
                    __LINE__);
            exit( 1 );
        }

        /*  Copy the string to the new space.   */
        MemCopy(Str, SubExprs[i].Start, StrLen);
        Str[StrLen] = '\0';

        /*  Add string to sub expression string buffer. */
        SubExprs[i].SubStr = Str;
    }
}

/*-----------------------------------------------------------------------------
| Routine   :   ReStrCmp() --- Regular expression string compare.
|
| Inputs    :   Str     - Pointer to string to attempt to match.
|               ReStr   - Pointer to regular expression string.
|
| Returns   :   Returns zero for no match, non-zero for a match.
-----------------------------------------------------------------------------*/

static
int     ReStrCmp(char   *Str,
                 char   *ReStr)
{
    /*  While not end of regular expression string, compare characters. */
    while ( *ReStr )
    {
        /*  If the characters do not match, return no match.    */
        if (CmpCase == IGN_CASE)
        {
            /*  Compare characters regardless of case.  */
            if (tolower( *ReStr ) != tolower( *Str ))
                return( 0 );

            /*  They match.  Increment pointers to next characters. */
            ReStr++;
            Str++;
        }
        else if (*ReStr++ != *Str++)
            return( 0 );
    }
    return( 1 );
}

/*-----------------------------------------------------------------------------
| Routine   :   ReData() --- Evaluate a regular expression data node.
|
| Inputs    :   SrcStart    - Start of source string (for left anchor
|                             comparisons).
|               Str         - Pointer to string to attempt to match.
|               ReExpr      - Pointer to regular expression node.
| Outputs   :   Str         - Pointer to string after matched portion, or
|                             unchanged since input, if no match.
|               Srch        - Pointer to search description structure.
|
| Returns   :   Returns 1 for a match, 0 for no match.
-----------------------------------------------------------------------------*/

static
int     ReData(char         *SrcStart,
               char         **Str,
               REG_EXP_NODE *ReExpr)
{
    register    int     Match;
    extern      FILE    *ErrFile;

    /*  Attempt match.  */
    switch ( ReExpr->NodeType )
    {
    case DATA_STRING:
        /*  Compare string. */
        if ((Match = ReStrCmp(*Str, ReExpr->data.MatchStr)) != 0)
        {
            /*  Bump the source string pointer. */
            (*Str) += strlen( ReExpr->data.MatchStr );
        }
        break;
    case DATA_SET:
        /*  Is the current character a member of the set?  (Don't
        *   forget that negated sets are already negated at this
        *   point.)
        */
        if ((Match = InSet(ReExpr->data.CSet, (unsigned char) **Str)) != 0)
        {
            /*  Bump source string pointer. */
            (*Str)++;
        }
        break;
    case DATA_ANY:
        /*  Match any character except end of line. */
        if ((Match = **Str && **Str != '\n') != 0)
        {
            /*  Bump source string pointer. */
            (*Str)++;
        }
        break;
    case DATA_LEFT_ANCHOR:
        /*  Is this the start of the line?  */
        Match = *Str == SrcStart;
        break;
    case DATA_RIGHT_ANCHOR:
        /*  Is the current character an end of line character?  */
        Match = **Str == '\n' || **Str == '\0';
        break;
    default:
        fprintf(ErrFile,
                "%s %d : Error - illegal regular expression node type.\n",
                __FILE__,
                __LINE__);
        exit( 1 );
    }

    /*  Return match state. */
    return( Match );
}

/*-----------------------------------------------------------------------------
| Routine   :   ReOp() --- Execute a regular expression operator.
|
| Inputs    :   SrcStart    - Start of source string (for left anchor
|                             comparisons).
|               Str         - Pointer to string to attempt to match.
|               ReExpr      - Pointer to regular expression node.
| Outputs   :   Str         - Pointer to string after matched portion, or
|                             unchanged since input, if no match.
|               Cont        - Current regular expression node pointer.
|
| Returns   :   Returns 1 for a match, 0 for no match.
-----------------------------------------------------------------------------*/

static
int     ReOp(char           *SrcStart,
             char           **Str,
             REG_EXP_NODE   *ReExpr,
             REG_EXP_NODE   **Cont)
{
    register    int             i;
    register    int             Match;
    auto        char            *ChkPt;
    auto        REG_EXP_NODE    *Tmp;
    auto        REG_EXP_NODE    *Node;
    extern      FILE            *ErrFile;

    /*  Do operations.  */
    do
    {
        /*  Attempt match.  */
        switch ( ReExpr->NodeType )
        {
        case OP_OR:
            /*  Save check point for backtracking.  */
            ChkPt = *Str;

            /*  Attempt left branch first.  */
            if ((Match = ReOp(SrcStart,
                              Str,
                              ReExpr->Left,
                              &Node)) == 0)
            {
                /*  Attempt to match the right hand branch. */
                *Str = ChkPt;
                Match = ReOp(SrcStart, Str, ReExpr->Right, &Node);
            }

            /*  Get end of OR.  */
            ReExpr = Node;
            break;
        case DATA_SPAN:
            /*  Scan for at least the minimum number of characters. */
            for (Match = 1, i = 0; i < ReExpr->MinSpan; i++)
                if (**Str == '\0' || **Str == '\n')
                {
                    Match = 0;
                    break;
                }
            if (! Match)
                break;

            /*  Skip over all control node types to see if there is
            *   a terminating expression for this span.
            */
            for (Tmp = ReExpr->Right;
                 Tmp &&
                 (Tmp->NodeType == OP_L_PAREN   ||
                  Tmp->NodeType == OP_R_PAREN   ||
                  Tmp->NodeType == END_OR);
                 Tmp = Tmp->Right)
                ;

            /*  If there is no span terminating expression, then we
            *   automatically match to end of line.
            */
            if (Tmp == NULL)
            {
                /*  Scan to end of input string.    */
                for ( ; **Str; ++*Str)
                    ;

                /*  This is a definite match.   */
                Match = 1;
                break;
            }

            /*  We matched at least the minimum number.  Now attempt the
            *   maximum number.
            */
            for (ChkPt = *Str; i < ReExpr->MaxSpan; i++)
            {
                /*  Search forward until a match is found, or the
                *   max number of any character has been spanned.
                */
                *Str = ChkPt;
                if ((Match = ReOp(SrcStart,
                                  Str,
                                  ReExpr->Right,
                                  &Node)) != 0)
                {
                    break;
                }

                /*  We need to allow the routines to look at the
                *   NUL char, as there may be an end of line anchor
                *   in this expression.  HOWEVER, we MUST NOT go
                *   past end of line.
                */
                if ( *ChkPt )
                    ChkPt++;
                else
                    break;
            }

            /*  Check for a match.  */
            Match = i < ReExpr->MaxSpan;
            ReExpr = Node;
            break;
        case OP_ENUM:
            /*  Match the correct number of sub-expressions.    */
            for (i = 0; i < ReExpr->MaxSpan; i++)
            {
                /*  Attempt data match. */
                if ((Match = ReData(SrcStart,
                                    Str,
                                    ReExpr->Left)) == 0)
                {
                    /*  Have we at least the minimum?   */
                    if (i >= ReExpr->MinSpan)
                        Match = 1;
                    break;
                }
            }
            break;
        case OP_L_PAREN:
            /*  Save start of sub-expression.   */
            SubExprs[ReExpr->SubExprNo].Start = *Str;
            Match = 1;
            break;
        case OP_R_PAREN:
            /*  Save start of sub-expression.   */
            SubExprs[ReExpr->SubExprNo].End = *Str;
            Match = 1;
            break;
         default:
            Match = ReData(SrcStart, Str, ReExpr);
            break;
        }

        /*  Check for end.  */
        if (ReExpr != NULL)
            ReExpr = ReExpr->Right;

    } while (Match && ReExpr && ReExpr->NodeType != END_OR);

    /*  Return match.   */
    *Cont = ReExpr;
    return( Match );
}

/*-----------------------------------------------------------------------------
| Routine   :   ReMatch() --- Regular expression match.
|
| Inputs    :   Str         - Pointer to string to attempt to match.
|               Case        - Flag for case sensitivity.
|               ReExpr      - Pointer to regular expression tree root.
| Outputs   :   SubStrs     - Pointer to array of returned strings.
|               NoSubStrs   - Size of returned string array.
|
| Returns   :   Returns zero for no match, non-zero for a match.
-----------------------------------------------------------------------------*/

int     ReMatch(char            *Str,
                CASE_CMP        Case,
                REG_EXP_NODE    *ReExpr,
                char            ***SubStrs)
{
    register    int             i;
    register    int             Match;
    auto        char            *tp;
    auto        char            *Loop;
    auto        REG_EXP_NODE    *Node;

    static      char        *RetStrs[MAX_SUB_EXPRS];
    extern      FILE        *ErrFile;

    /*  Run list and free strings.  */
    for (i = 1; i < MAX_SUB_EXPRS; i++)
    {
        /*  Free previous sub strings.  */
        if (RetStrs[i] != NULL)
            free( RetStrs[i] );

        /*  Initialize the sub expression list. */
        RetStrs[i] = NULL;
        SubExprs[i].Start = SubExprs[i].End = SubExprs[i].SubStr = NULL;
    }

    /*  If the first node is a left anchor, no shifting is necesary.    */
    tp = Str;
    CmpCase = Case;
    if (ReExpr->NodeType == DATA_LEFT_ANCHOR)
        Match = ReOp(Str, &tp, ReExpr->Right, &Node);
    else
    {
        /*  Search forward. */
        for (Loop = Str; ; Loop++)
        {
            /*  Search for a match. */
            tp = Loop;
            if ((Match = ReOp(Str, &tp, ReExpr, &Node)) != 0)
                break;

            /*  Check for end of loop.  */
            if (*Loop == '\0' || *Loop == '\n')
                break;
        }
    }

    /*  Create sub string array. */
    if ( Match )
    {
        /*  Extract sub expression strings. */
        ExtSubStrs();
        for (i = 1; i < MAX_SUB_EXPRS; i++)
            RetStrs[i] = SubExprs[i].SubStr;

        /*  Return sub string array.    */
        *SubStrs = RetStrs;
    }

    /*  Return match true or false. */
    return( Match );
}
