forked from mirrors/gecko-dev
		
	Update to ICU 69.1 by running "update-icu.sh" with "maint/maint-69" as the target. Differential Revision: https://phabricator.services.mozilla.com/D116968
		
			
				
	
	
		
			1795 lines
		
	
	
	
		
			63 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			1795 lines
		
	
	
	
		
			63 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
// © 2016 and later: Unicode, Inc. and others.
 | 
						|
// License & terms of use: http://www.unicode.org/copyright.html
 | 
						|
/*
 | 
						|
**********************************************************************
 | 
						|
*   Copyright (c) 2002-2016, International Business Machines
 | 
						|
*   Corporation and others.  All Rights Reserved.
 | 
						|
**********************************************************************
 | 
						|
*/
 | 
						|
//
 | 
						|
//  rbbitblb.cpp
 | 
						|
//
 | 
						|
 | 
						|
 | 
						|
#include "unicode/utypes.h"
 | 
						|
 | 
						|
#if !UCONFIG_NO_BREAK_ITERATION
 | 
						|
 | 
						|
#include "unicode/unistr.h"
 | 
						|
#include "rbbitblb.h"
 | 
						|
#include "rbbirb.h"
 | 
						|
#include "rbbiscan.h"
 | 
						|
#include "rbbisetb.h"
 | 
						|
#include "rbbidata.h"
 | 
						|
#include "cstring.h"
 | 
						|
#include "uassert.h"
 | 
						|
#include "uvectr32.h"
 | 
						|
#include "cmemory.h"
 | 
						|
 | 
						|
U_NAMESPACE_BEGIN
 | 
						|
 | 
						|
const int32_t kMaxStateFor8BitsTable = 255;
 | 
						|
 | 
						|
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
 | 
						|
        fRB(rb),
 | 
						|
        fTree(*rootNode),
 | 
						|
        fStatus(&status),
 | 
						|
        fDStates(nullptr),
 | 
						|
        fSafeTable(nullptr) {
 | 
						|
    if (U_FAILURE(status)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    // fDStates is UVector<RBBIStateDescriptor *>
 | 
						|
    fDStates = new UVector(status);
 | 
						|
    if (U_SUCCESS(status) && fDStates == nullptr ) {
 | 
						|
        status = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
RBBITableBuilder::~RBBITableBuilder() {
 | 
						|
    int i;
 | 
						|
    for (i=0; i<fDStates->size(); i++) {
 | 
						|
        delete (RBBIStateDescriptor *)fDStates->elementAt(i);
 | 
						|
    }
 | 
						|
    delete fDStates;
 | 
						|
    delete fSafeTable;
 | 
						|
    delete fLookAheadRuleMap;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   RBBITableBuilder::buildForwardTable  -  This is the main function for building
 | 
						|
//                               the DFA state transition table from the RBBI rules parse tree.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void  RBBITableBuilder::buildForwardTable() {
 | 
						|
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    // If there were no rules, just return.  This situation can easily arise
 | 
						|
    //   for the reverse rules.
 | 
						|
    if (fTree==NULL) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    // Walk through the tree, replacing any references to $variables with a copy of the
 | 
						|
    //   parse tree for the substition expression.
 | 
						|
    //
 | 
						|
    fTree = fTree->flattenVariables();
 | 
						|
#ifdef RBBI_DEBUG
 | 
						|
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
 | 
						|
        RBBIDebugPuts("\nParse tree after flattening variable references.");
 | 
						|
        RBBINode::printTree(fTree, TRUE);
 | 
						|
    }
 | 
						|
#endif
 | 
						|
 | 
						|
    //
 | 
						|
    // If the rules contained any references to {bof} 
 | 
						|
    //   add a {bof} <cat> <former root of tree> to the
 | 
						|
    //   tree.  Means that all matches must start out with the 
 | 
						|
    //   {bof} fake character.
 | 
						|
    // 
 | 
						|
    if (fRB->fSetBuilder->sawBOF()) {
 | 
						|
        RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
 | 
						|
        RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
 | 
						|
        // Delete and exit if memory allocation failed.
 | 
						|
        if (bofTop == NULL || bofLeaf == NULL) {
 | 
						|
            *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
            delete bofTop;
 | 
						|
            delete bofLeaf;
 | 
						|
            return;
 | 
						|
        }
 | 
						|
        bofTop->fLeftChild  = bofLeaf;
 | 
						|
        bofTop->fRightChild = fTree;
 | 
						|
        bofLeaf->fParent    = bofTop;
 | 
						|
        bofLeaf->fVal       = 2;      // Reserved value for {bof}.
 | 
						|
        fTree               = bofTop;
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    // Add a unique right-end marker to the expression.
 | 
						|
    //   Appears as a cat-node, left child being the original tree,
 | 
						|
    //   right child being the end marker.
 | 
						|
    //
 | 
						|
    RBBINode *cn = new RBBINode(RBBINode::opCat);
 | 
						|
    // Exit if memory allocation failed.
 | 
						|
    if (cn == NULL) {
 | 
						|
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    cn->fLeftChild = fTree;
 | 
						|
    fTree->fParent = cn;
 | 
						|
    RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
 | 
						|
    // Delete and exit if memory allocation failed.
 | 
						|
    if (cn->fRightChild == NULL) {
 | 
						|
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
        delete cn;
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    cn->fRightChild->fParent = cn;
 | 
						|
    fTree = cn;
 | 
						|
 | 
						|
    //
 | 
						|
    //  Replace all references to UnicodeSets with the tree for the equivalent
 | 
						|
    //      expression.
 | 
						|
    //
 | 
						|
    fTree->flattenSets();
 | 
						|
#ifdef RBBI_DEBUG
 | 
						|
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
 | 
						|
        RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
 | 
						|
        RBBINode::printTree(fTree, TRUE);
 | 
						|
    }
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
    //
 | 
						|
    // calculate the functions nullable, firstpos, lastpos and followpos on
 | 
						|
    // nodes in the parse tree.
 | 
						|
    //    See the algorithm description in Aho.
 | 
						|
    //    Understanding how this works by looking at the code alone will be
 | 
						|
    //       nearly impossible.
 | 
						|
    //
 | 
						|
    calcNullable(fTree);
 | 
						|
    calcFirstPos(fTree);
 | 
						|
    calcLastPos(fTree);
 | 
						|
    calcFollowPos(fTree);
 | 
						|
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
 | 
						|
        RBBIDebugPuts("\n");
 | 
						|
        printPosSets(fTree);
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    //  For "chained" rules, modify the followPos sets
 | 
						|
    //
 | 
						|
    if (fRB->fChainRules) {
 | 
						|
        calcChainedFollowPos(fTree, endMarkerNode);
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    //  BOF (start of input) test fixup.
 | 
						|
    //
 | 
						|
    if (fRB->fSetBuilder->sawBOF()) {
 | 
						|
        bofFixup();
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    // Build the DFA state transition tables.
 | 
						|
    //
 | 
						|
    buildStateTable();
 | 
						|
    mapLookAheadRules();
 | 
						|
    flagAcceptingStates();
 | 
						|
    flagLookAheadStates();
 | 
						|
    flagTaggedStates();
 | 
						|
 | 
						|
    //
 | 
						|
    // Update the global table of rule status {tag} values
 | 
						|
    // The rule builder has a global vector of status values that are common
 | 
						|
    //    for all tables.  Merge the ones from this table into the global set.
 | 
						|
    //
 | 
						|
    mergeRuleStatusVals();
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::calcNullable(RBBINode *n) {
 | 
						|
    if (n == NULL) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    if (n->fType == RBBINode::setRef ||
 | 
						|
        n->fType == RBBINode::endMark ) {
 | 
						|
        // These are non-empty leaf node types.
 | 
						|
        n->fNullable = FALSE;
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
 | 
						|
        // Lookahead marker node.  It's a leaf, so no recursion on children.
 | 
						|
        // It's nullable because it does not match any literal text from the input stream.
 | 
						|
        n->fNullable = TRUE;
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
 | 
						|
    // The node is not a leaf.
 | 
						|
    //  Calculate nullable on its children.
 | 
						|
    calcNullable(n->fLeftChild);
 | 
						|
    calcNullable(n->fRightChild);
 | 
						|
 | 
						|
    // Apply functions from table 3.40 in Aho
 | 
						|
    if (n->fType == RBBINode::opOr) {
 | 
						|
        n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
 | 
						|
    }
 | 
						|
    else if (n->fType == RBBINode::opCat) {
 | 
						|
        n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
 | 
						|
    }
 | 
						|
    else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
 | 
						|
        n->fNullable = TRUE;
 | 
						|
    }
 | 
						|
    else {
 | 
						|
        n->fNullable = FALSE;
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::calcFirstPos(RBBINode *n) {
 | 
						|
    if (n == NULL) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    if (n->fType == RBBINode::leafChar  ||
 | 
						|
        n->fType == RBBINode::endMark   ||
 | 
						|
        n->fType == RBBINode::lookAhead ||
 | 
						|
        n->fType == RBBINode::tag) {
 | 
						|
        // These are non-empty leaf node types.
 | 
						|
        // Note: In order to maintain the sort invariant on the set,
 | 
						|
        // this function should only be called on a node whose set is
 | 
						|
        // empty to start with.
 | 
						|
        n->fFirstPosSet->addElement(n, *fStatus);
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    // The node is not a leaf.
 | 
						|
    //  Calculate firstPos on its children.
 | 
						|
    calcFirstPos(n->fLeftChild);
 | 
						|
    calcFirstPos(n->fRightChild);
 | 
						|
 | 
						|
    // Apply functions from table 3.40 in Aho
 | 
						|
    if (n->fType == RBBINode::opOr) {
 | 
						|
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
 | 
						|
        setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
 | 
						|
    }
 | 
						|
    else if (n->fType == RBBINode::opCat) {
 | 
						|
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
 | 
						|
        if (n->fLeftChild->fNullable) {
 | 
						|
            setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
 | 
						|
        }
 | 
						|
    }
 | 
						|
    else if (n->fType == RBBINode::opStar ||
 | 
						|
             n->fType == RBBINode::opQuestion ||
 | 
						|
             n->fType == RBBINode::opPlus) {
 | 
						|
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::calcLastPos(RBBINode *n) {
 | 
						|
    if (n == NULL) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    if (n->fType == RBBINode::leafChar  ||
 | 
						|
        n->fType == RBBINode::endMark   ||
 | 
						|
        n->fType == RBBINode::lookAhead ||
 | 
						|
        n->fType == RBBINode::tag) {
 | 
						|
        // These are non-empty leaf node types.
 | 
						|
        // Note: In order to maintain the sort invariant on the set,
 | 
						|
        // this function should only be called on a node whose set is
 | 
						|
        // empty to start with.
 | 
						|
        n->fLastPosSet->addElement(n, *fStatus);
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    // The node is not a leaf.
 | 
						|
    //  Calculate lastPos on its children.
 | 
						|
    calcLastPos(n->fLeftChild);
 | 
						|
    calcLastPos(n->fRightChild);
 | 
						|
 | 
						|
    // Apply functions from table 3.40 in Aho
 | 
						|
    if (n->fType == RBBINode::opOr) {
 | 
						|
        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
 | 
						|
        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
 | 
						|
    }
 | 
						|
    else if (n->fType == RBBINode::opCat) {
 | 
						|
        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
 | 
						|
        if (n->fRightChild->fNullable) {
 | 
						|
            setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
 | 
						|
        }
 | 
						|
    }
 | 
						|
    else if (n->fType == RBBINode::opStar     ||
 | 
						|
             n->fType == RBBINode::opQuestion ||
 | 
						|
             n->fType == RBBINode::opPlus) {
 | 
						|
        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::calcFollowPos(RBBINode *n) {
 | 
						|
    if (n == NULL ||
 | 
						|
        n->fType == RBBINode::leafChar ||
 | 
						|
        n->fType == RBBINode::endMark) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    calcFollowPos(n->fLeftChild);
 | 
						|
    calcFollowPos(n->fRightChild);
 | 
						|
 | 
						|
    // Aho rule #1
 | 
						|
    if (n->fType == RBBINode::opCat) {
 | 
						|
        RBBINode *i;   // is 'i' in Aho's description
 | 
						|
        uint32_t     ix;
 | 
						|
 | 
						|
        UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
 | 
						|
 | 
						|
        for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
 | 
						|
            i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
 | 
						|
            setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    // Aho rule #2
 | 
						|
    if (n->fType == RBBINode::opStar ||
 | 
						|
        n->fType == RBBINode::opPlus) {
 | 
						|
        RBBINode   *i;  // again, n and i are the names from Aho's description.
 | 
						|
        uint32_t    ix;
 | 
						|
 | 
						|
        for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
 | 
						|
            i = (RBBINode *)n->fLastPosSet->elementAt(ix);
 | 
						|
            setAdd(i->fFollowPos, n->fFirstPosSet);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
 | 
						|
//                        as roots of a rule to a destination vector.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
 | 
						|
    if (node == NULL || U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    if (node->fRuleRoot) {
 | 
						|
        dest->addElement(node, *fStatus);
 | 
						|
        // Note: rules cannot nest. If we found a rule start node,
 | 
						|
        //       no child node can also be a start node.
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    addRuleRootNodes(dest, node->fLeftChild);
 | 
						|
    addRuleRootNodes(dest, node->fRightChild);
 | 
						|
}
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   calcChainedFollowPos.    Modify the previously calculated followPos sets
 | 
						|
//                            to implement rule chaining.  NOT described by Aho
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
 | 
						|
 | 
						|
    UVector         leafNodes(*fStatus);
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    // get a list all leaf nodes
 | 
						|
    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    // Collect all leaf nodes that can start matches for rules
 | 
						|
    // with inbound chaining enabled, which is the union of the 
 | 
						|
    // firstPosition sets from each of the rule root nodes.
 | 
						|
    
 | 
						|
    UVector ruleRootNodes(*fStatus);
 | 
						|
    addRuleRootNodes(&ruleRootNodes, tree);
 | 
						|
 | 
						|
    UVector matchStartNodes(*fStatus);
 | 
						|
    for (int j=0; j<ruleRootNodes.size(); ++j) {
 | 
						|
        RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
 | 
						|
        if (node->fChainIn) {
 | 
						|
            setAdd(&matchStartNodes, node->fFirstPosSet);
 | 
						|
        }
 | 
						|
    }
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    int32_t  endNodeIx;
 | 
						|
    int32_t  startNodeIx;
 | 
						|
 | 
						|
    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
 | 
						|
        RBBINode *endNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
 | 
						|
 | 
						|
        // Identify leaf nodes that correspond to overall rule match positions.
 | 
						|
        // These include the endMarkNode in their followPos sets.
 | 
						|
        //
 | 
						|
        // Note: do not consider other end marker nodes, those that are added to
 | 
						|
        //       look-ahead rules. These can't chain; a match immediately stops
 | 
						|
        //       further matching. This leaves exactly one end marker node, the one
 | 
						|
        //       at the end of the complete tree.
 | 
						|
 | 
						|
        if (!endNode->fFollowPos->contains(endMarkNode)) {
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        // We've got a node that can end a match.
 | 
						|
 | 
						|
        // !!LBCMNoChain implementation:  If this node's val correspond to
 | 
						|
        // the Line Break $CM char class, don't chain from it.
 | 
						|
        // TODO:  Remove this. !!LBCMNoChain is deprecated, and is not used
 | 
						|
        //        by any of the standard ICU rules.
 | 
						|
        if (fRB->fLBCMNoChain) {
 | 
						|
            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
 | 
						|
            if (c != -1) {
 | 
						|
                // c == -1 occurs with sets containing only the {eof} marker string.
 | 
						|
                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
 | 
						|
                if (cLBProp == U_LB_COMBINING_MARK) {
 | 
						|
                    continue;
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        // Now iterate over the nodes that can start a match, looking for ones
 | 
						|
        //   with the same char class as our ending node.
 | 
						|
        RBBINode *startNode;
 | 
						|
        for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
 | 
						|
            startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
 | 
						|
            if (startNode->fType != RBBINode::leafChar) {
 | 
						|
                continue;
 | 
						|
            }
 | 
						|
 | 
						|
            if (endNode->fVal == startNode->fVal) {
 | 
						|
                // The end val (character class) of one possible match is the
 | 
						|
                //   same as the start of another.
 | 
						|
 | 
						|
                // Add all nodes from the followPos of the start node to the
 | 
						|
                //  followPos set of the end node, which will have the effect of
 | 
						|
                //  letting matches transition from a match state at endNode
 | 
						|
                //  to the second char of a match starting with startNode.
 | 
						|
                setAdd(endNode->fFollowPos, startNode->fFollowPos);
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
 | 
						|
//                Do an swizzle similar to chaining, modifying the followPos set of
 | 
						|
//                the bofNode to include the followPos nodes from other {bot} nodes
 | 
						|
//                scattered through the tree.
 | 
						|
//
 | 
						|
//                This function has much in common with calcChainedFollowPos().
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::bofFixup() {
 | 
						|
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    //   The parse tree looks like this ...
 | 
						|
    //         fTree root  --->       <cat>
 | 
						|
    //                               /     \       .
 | 
						|
    //                            <cat>   <#end node>
 | 
						|
    //                           /     \  .
 | 
						|
    //                     <bofNode>   rest
 | 
						|
    //                               of tree
 | 
						|
    //
 | 
						|
    //    We will be adding things to the followPos set of the <bofNode>
 | 
						|
    //
 | 
						|
    RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
 | 
						|
    U_ASSERT(bofNode->fType == RBBINode::leafChar);
 | 
						|
    U_ASSERT(bofNode->fVal == 2);
 | 
						|
 | 
						|
    // Get all nodes that can be the start a match of the user-written rules
 | 
						|
    //  (excluding the fake bofNode)
 | 
						|
    //  We want the nodes that can start a match in the
 | 
						|
    //     part labeled "rest of tree"
 | 
						|
    // 
 | 
						|
    UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
 | 
						|
 | 
						|
    RBBINode *startNode;
 | 
						|
    int       startNodeIx;
 | 
						|
    for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
 | 
						|
        startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
 | 
						|
        if (startNode->fType != RBBINode::leafChar) {
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        if (startNode->fVal == bofNode->fVal) {
 | 
						|
            //  We found a leaf node corresponding to a {bof} that was
 | 
						|
            //    explicitly written into a rule.
 | 
						|
            //  Add everything from the followPos set of this node to the
 | 
						|
            //    followPos set of the fake bofNode at the start of the tree.
 | 
						|
            //  
 | 
						|
            setAdd(bofNode->fFollowPos, startNode->fFollowPos);
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   buildStateTable()    Determine the set of runtime DFA states and the
 | 
						|
//                        transition tables for these states, by the algorithm
 | 
						|
//                        of fig. 3.44 in Aho.
 | 
						|
//
 | 
						|
//                        Most of the comments are quotes of Aho's psuedo-code.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::buildStateTable() {
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    RBBIStateDescriptor *failState;
 | 
						|
    // Set it to NULL to avoid uninitialized warning
 | 
						|
    RBBIStateDescriptor *initialState = NULL; 
 | 
						|
    //
 | 
						|
    // Add a dummy state 0 - the stop state.  Not from Aho.
 | 
						|
    int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
 | 
						|
    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
 | 
						|
    if (failState == NULL) {
 | 
						|
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
        goto ExitBuildSTdeleteall;
 | 
						|
    }
 | 
						|
    failState->fPositions = new UVector(*fStatus);
 | 
						|
    if (failState->fPositions == NULL) {
 | 
						|
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
    }
 | 
						|
    if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
 | 
						|
        goto ExitBuildSTdeleteall;
 | 
						|
    }
 | 
						|
    fDStates->addElement(failState, *fStatus);
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        goto ExitBuildSTdeleteall;
 | 
						|
    }
 | 
						|
 | 
						|
    // initially, the only unmarked state in Dstates is firstpos(root),
 | 
						|
    //       where toot is the root of the syntax tree for (r)#;
 | 
						|
    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
 | 
						|
    if (initialState == NULL) {
 | 
						|
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
    }
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        goto ExitBuildSTdeleteall;
 | 
						|
    }
 | 
						|
    initialState->fPositions = new UVector(*fStatus);
 | 
						|
    if (initialState->fPositions == NULL) {
 | 
						|
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
    }
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        goto ExitBuildSTdeleteall;
 | 
						|
    }
 | 
						|
    setAdd(initialState->fPositions, fTree->fFirstPosSet);
 | 
						|
    fDStates->addElement(initialState, *fStatus);
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        goto ExitBuildSTdeleteall;
 | 
						|
    }
 | 
						|
 | 
						|
    // while there is an unmarked state T in Dstates do begin
 | 
						|
    for (;;) {
 | 
						|
        RBBIStateDescriptor *T = NULL;
 | 
						|
        int32_t              tx;
 | 
						|
        for (tx=1; tx<fDStates->size(); tx++) {
 | 
						|
            RBBIStateDescriptor *temp;
 | 
						|
            temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
 | 
						|
            if (temp->fMarked == FALSE) {
 | 
						|
                T = temp;
 | 
						|
                break;
 | 
						|
            }
 | 
						|
        }
 | 
						|
        if (T == NULL) {
 | 
						|
            break;
 | 
						|
        }
 | 
						|
 | 
						|
        // mark T;
 | 
						|
        T->fMarked = TRUE;
 | 
						|
 | 
						|
        // for each input symbol a do begin
 | 
						|
        int32_t  a;
 | 
						|
        for (a = 1; a<=lastInputSymbol; a++) {
 | 
						|
            // let U be the set of positions that are in followpos(p)
 | 
						|
            //    for some position p in T
 | 
						|
            //    such that the symbol at position p is a;
 | 
						|
            UVector    *U = NULL;
 | 
						|
            RBBINode   *p;
 | 
						|
            int32_t     px;
 | 
						|
            for (px=0; px<T->fPositions->size(); px++) {
 | 
						|
                p = (RBBINode *)T->fPositions->elementAt(px);
 | 
						|
                if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
 | 
						|
                    if (U == NULL) {
 | 
						|
                        U = new UVector(*fStatus);
 | 
						|
                        if (U == NULL) {
 | 
						|
                        	*fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
                        	goto ExitBuildSTdeleteall;
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                    setAdd(U, p->fFollowPos);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            // if U is not empty and not in DStates then
 | 
						|
            int32_t  ux = 0;
 | 
						|
            UBool    UinDstates = FALSE;
 | 
						|
            if (U != NULL) {
 | 
						|
                U_ASSERT(U->size() > 0);
 | 
						|
                int  ix;
 | 
						|
                for (ix=0; ix<fDStates->size(); ix++) {
 | 
						|
                    RBBIStateDescriptor *temp2;
 | 
						|
                    temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
 | 
						|
                    if (setEquals(U, temp2->fPositions)) {
 | 
						|
                        delete U;
 | 
						|
                        U  = temp2->fPositions;
 | 
						|
                        ux = ix;
 | 
						|
                        UinDstates = TRUE;
 | 
						|
                        break;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                // Add U as an unmarked state to Dstates
 | 
						|
                if (!UinDstates)
 | 
						|
                {
 | 
						|
                    RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
 | 
						|
                    if (newState == NULL) {
 | 
						|
                    	*fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
                    }
 | 
						|
                    if (U_FAILURE(*fStatus)) {
 | 
						|
                        goto ExitBuildSTdeleteall;
 | 
						|
                    }
 | 
						|
                    newState->fPositions = U;
 | 
						|
                    fDStates->addElement(newState, *fStatus);
 | 
						|
                    if (U_FAILURE(*fStatus)) {
 | 
						|
                        return;
 | 
						|
                    }
 | 
						|
                    ux = fDStates->size()-1;
 | 
						|
                }
 | 
						|
 | 
						|
                // Dtran[T, a] := U;
 | 
						|
                T->fDtran->setElementAt(ux, a);
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
    return;
 | 
						|
    // delete local pointers only if error occured.
 | 
						|
ExitBuildSTdeleteall:
 | 
						|
    delete initialState;
 | 
						|
    delete failState;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
 * mapLookAheadRules
 | 
						|
 *
 | 
						|
 */
 | 
						|
void RBBITableBuilder::mapLookAheadRules() {
 | 
						|
    fLookAheadRuleMap =  new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
 | 
						|
    if (fLookAheadRuleMap == nullptr) {
 | 
						|
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
    }
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
 | 
						|
 | 
						|
    for (int32_t n=0; n<fDStates->size(); n++) {
 | 
						|
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
 | 
						|
        int32_t laSlotForState = 0;
 | 
						|
 | 
						|
        // Establish the look-ahead slot for this state, if the state covers
 | 
						|
        // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
 | 
						|
 | 
						|
        // If any of the look-ahead nodes already have a slot assigned, use it,
 | 
						|
        // otherwise assign a new one.
 | 
						|
 | 
						|
        bool sawLookAheadNode = false;
 | 
						|
        for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
 | 
						|
            RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
 | 
						|
            if (node->fType != RBBINode::NodeType::lookAhead) {
 | 
						|
                continue;
 | 
						|
            }
 | 
						|
            sawLookAheadNode = true;
 | 
						|
            int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
 | 
						|
            U_ASSERT(ruleNum < fLookAheadRuleMap->size());
 | 
						|
            U_ASSERT(ruleNum > 0);
 | 
						|
            int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
 | 
						|
            if (laSlot != 0) {
 | 
						|
                if (laSlotForState == 0) {
 | 
						|
                    laSlotForState = laSlot;
 | 
						|
                } else {
 | 
						|
                    // TODO: figure out if this can fail, change to setting an error code if so.
 | 
						|
                    U_ASSERT(laSlot == laSlotForState);
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
        if (!sawLookAheadNode) {
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        if (laSlotForState == 0) {
 | 
						|
            laSlotForState = ++fLASlotsInUse;
 | 
						|
        }
 | 
						|
 | 
						|
        // For each look ahead node covered by this state,
 | 
						|
        // set the mapping from the node's rule number to the look ahead slot.
 | 
						|
        // There can be multiple nodes/rule numbers going to the same la slot.
 | 
						|
 | 
						|
        for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
 | 
						|
            RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
 | 
						|
            if (node->fType != RBBINode::NodeType::lookAhead) {
 | 
						|
                continue;
 | 
						|
            }
 | 
						|
            int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
 | 
						|
            int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
 | 
						|
            (void)existingVal;
 | 
						|
            U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
 | 
						|
            fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   flagAcceptingStates    Identify accepting states.
 | 
						|
//                          First get a list of all of the end marker nodes.
 | 
						|
//                          Then, for each state s,
 | 
						|
//                              if s contains one of the end marker nodes in its list of tree positions then
 | 
						|
//                                  s is an accepting state.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void     RBBITableBuilder::flagAcceptingStates() {
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    UVector     endMarkerNodes(*fStatus);
 | 
						|
    RBBINode    *endMarker;
 | 
						|
    int32_t     i;
 | 
						|
    int32_t     n;
 | 
						|
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    for (i=0; i<endMarkerNodes.size(); i++) {
 | 
						|
        endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
 | 
						|
        for (n=0; n<fDStates->size(); n++) {
 | 
						|
            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
 | 
						|
            if (sd->fPositions->indexOf(endMarker) >= 0) {
 | 
						|
                // Any non-zero value for fAccepting means this is an accepting node.
 | 
						|
                // The value is what will be returned to the user as the break status.
 | 
						|
                // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
 | 
						|
 | 
						|
                if (sd->fAccepting==0) {
 | 
						|
                    // State hasn't been marked as accepting yet.  Do it now.
 | 
						|
                    sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
 | 
						|
                    if (sd->fAccepting == 0) {
 | 
						|
                        sd->fAccepting = ACCEPTING_UNCONDITIONAL;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
 | 
						|
                    // Both lookahead and non-lookahead accepting for this state.
 | 
						|
                    // Favor the look-ahead, because a look-ahead match needs to
 | 
						|
                    // immediately stop the run-time engine. First match, not longest.
 | 
						|
                    sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
 | 
						|
                }
 | 
						|
                // implicit else:
 | 
						|
                // if sd->fAccepting already had a value other than 0 or 1, leave it be.
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//    flagLookAheadStates   Very similar to flagAcceptingStates, above.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void     RBBITableBuilder::flagLookAheadStates() {
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    UVector     lookAheadNodes(*fStatus);
 | 
						|
    RBBINode    *lookAheadNode;
 | 
						|
    int32_t     i;
 | 
						|
    int32_t     n;
 | 
						|
 | 
						|
    fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    for (i=0; i<lookAheadNodes.size(); i++) {
 | 
						|
        lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
 | 
						|
        U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
 | 
						|
 | 
						|
        for (n=0; n<fDStates->size(); n++) {
 | 
						|
            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
 | 
						|
            int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
 | 
						|
            if (positionsIdx >= 0) {
 | 
						|
                U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
 | 
						|
                uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
 | 
						|
                U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
 | 
						|
                // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
 | 
						|
                //     printf("%s:%d Bingo. sd->fLookAhead:%d   lookaheadSlot:%d\n",
 | 
						|
                //            __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
 | 
						|
                // }
 | 
						|
                sd->fLookAhead = lookaheadSlot;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//    flagTaggedStates
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void     RBBITableBuilder::flagTaggedStates() {
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    UVector     tagNodes(*fStatus);
 | 
						|
    RBBINode    *tagNode;
 | 
						|
    int32_t     i;
 | 
						|
    int32_t     n;
 | 
						|
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
 | 
						|
        tagNode = (RBBINode *)tagNodes.elementAt(i);
 | 
						|
 | 
						|
        for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
 | 
						|
            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
 | 
						|
            if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
 | 
						|
                sortedAdd(&sd->fTagVals, tagNode->fVal);
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//  mergeRuleStatusVals
 | 
						|
//
 | 
						|
//      Update the global table of rule status {tag} values
 | 
						|
//      The rule builder has a global vector of status values that are common
 | 
						|
//      for all tables.  Merge the ones from this table into the global set.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void  RBBITableBuilder::mergeRuleStatusVals() {
 | 
						|
    //
 | 
						|
    //  The basic outline of what happens here is this...
 | 
						|
    //
 | 
						|
    //    for each state in this state table
 | 
						|
    //       if the status tag list for this state is in the global statuses list
 | 
						|
    //           record where and
 | 
						|
    //           continue with the next state
 | 
						|
    //       else
 | 
						|
    //           add the tag list for this state to the global list.
 | 
						|
    //
 | 
						|
    int i;
 | 
						|
    int n;
 | 
						|
 | 
						|
    // Pre-set a single tag of {0} into the table.
 | 
						|
    //   We will need this as a default, for rule sets with no explicit tagging.
 | 
						|
    if (fRB->fRuleStatusVals->size() == 0) {
 | 
						|
        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
 | 
						|
        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
 | 
						|
    }
 | 
						|
 | 
						|
    //    For each state
 | 
						|
    for (n=0; n<fDStates->size(); n++) {
 | 
						|
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
 | 
						|
        UVector *thisStatesTagValues = sd->fTagVals;
 | 
						|
        if (thisStatesTagValues == NULL) {
 | 
						|
            // No tag values are explicitly associated with this state.
 | 
						|
            //   Set the default tag value.
 | 
						|
            sd->fTagsIdx = 0;
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        // There are tag(s) associated with this state.
 | 
						|
        //   fTagsIdx will be the index into the global tag list for this state's tag values.
 | 
						|
        //   Initial value of -1 flags that we haven't got it set yet.
 | 
						|
        sd->fTagsIdx = -1;
 | 
						|
        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
 | 
						|
        int32_t  nextTagGroupStart = 0;
 | 
						|
 | 
						|
        // Loop runs once per group of tags in the global list
 | 
						|
        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
 | 
						|
            thisTagGroupStart = nextTagGroupStart;
 | 
						|
            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
 | 
						|
            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
 | 
						|
                // The number of tags for this state is different from
 | 
						|
                //    the number of tags in this group from the global list.
 | 
						|
                //    Continue with the next group from the global list.
 | 
						|
                continue;
 | 
						|
            }
 | 
						|
            // The lengths match, go ahead and compare the actual tag values
 | 
						|
            //    between this state and the group from the global list.
 | 
						|
            for (i=0; i<thisStatesTagValues->size(); i++) {
 | 
						|
                if (thisStatesTagValues->elementAti(i) !=
 | 
						|
                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
 | 
						|
                    // Mismatch.
 | 
						|
                    break;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            if (i == thisStatesTagValues->size()) {
 | 
						|
                // We found a set of tag values in the global list that match
 | 
						|
                //   those for this state.  Use them.
 | 
						|
                sd->fTagsIdx = thisTagGroupStart;
 | 
						|
                break;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        if (sd->fTagsIdx == -1) {
 | 
						|
            // No suitable entry in the global tag list already.  Add one
 | 
						|
            sd->fTagsIdx = fRB->fRuleStatusVals->size();
 | 
						|
            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
 | 
						|
            for (i=0; i<thisStatesTagValues->size(); i++) {
 | 
						|
                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//  sortedAdd  Add a value to a vector of sorted values (ints).
 | 
						|
//             Do not replicate entries; if the value is already there, do not
 | 
						|
//                add a second one.
 | 
						|
//             Lazily create the vector if it does not already exist.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
 | 
						|
    int32_t i;
 | 
						|
 | 
						|
    if (*vector == NULL) {
 | 
						|
        *vector = new UVector(*fStatus);
 | 
						|
    }
 | 
						|
    if (*vector == NULL || U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    UVector *vec = *vector;
 | 
						|
    int32_t  vSize = vec->size();
 | 
						|
    for (i=0; i<vSize; i++) {
 | 
						|
        int32_t valAtI = vec->elementAti(i);
 | 
						|
        if (valAtI == val) {
 | 
						|
            // The value is already in the vector.  Don't add it again.
 | 
						|
            return;
 | 
						|
        }
 | 
						|
        if (valAtI > val) {
 | 
						|
            break;
 | 
						|
        }
 | 
						|
    }
 | 
						|
    vec->insertElementAt(val, i, *fStatus);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//  setAdd     Set operation on UVector
 | 
						|
//             dest = dest union source
 | 
						|
//             Elements may only appear once and must be sorted.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
 | 
						|
    int32_t destOriginalSize = dest->size();
 | 
						|
    int32_t sourceSize       = source->size();
 | 
						|
    int32_t di           = 0;
 | 
						|
    MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
 | 
						|
    void **destPtr, **sourcePtr;
 | 
						|
    void **destLim, **sourceLim;
 | 
						|
 | 
						|
    if (destOriginalSize > destArray.getCapacity()) {
 | 
						|
        if (destArray.resize(destOriginalSize) == NULL) {
 | 
						|
            return;
 | 
						|
        }
 | 
						|
    }
 | 
						|
    destPtr = destArray.getAlias();
 | 
						|
    destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
 | 
						|
 | 
						|
    if (sourceSize > sourceArray.getCapacity()) {
 | 
						|
        if (sourceArray.resize(sourceSize) == NULL) {
 | 
						|
            return;
 | 
						|
        }
 | 
						|
    }
 | 
						|
    sourcePtr = sourceArray.getAlias();
 | 
						|
    sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
 | 
						|
 | 
						|
    // Avoid multiple "get element" calls by getting the contents into arrays
 | 
						|
    (void) dest->toArray(destPtr);
 | 
						|
    (void) source->toArray(sourcePtr);
 | 
						|
 | 
						|
    dest->setSize(sourceSize+destOriginalSize, *fStatus);
 | 
						|
 | 
						|
    while (sourcePtr < sourceLim && destPtr < destLim) {
 | 
						|
        if (*destPtr == *sourcePtr) {
 | 
						|
            dest->setElementAt(*sourcePtr++, di++);
 | 
						|
            destPtr++;
 | 
						|
        }
 | 
						|
        // This check is required for machines with segmented memory, like i5/OS.
 | 
						|
        // Direct pointer comparison is not recommended.
 | 
						|
        else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
 | 
						|
            dest->setElementAt(*destPtr++, di++);
 | 
						|
        }
 | 
						|
        else { /* *sourcePtr < *destPtr */
 | 
						|
            dest->setElementAt(*sourcePtr++, di++);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    // At most one of these two cleanup loops will execute
 | 
						|
    while (destPtr < destLim) {
 | 
						|
        dest->setElementAt(*destPtr++, di++);
 | 
						|
    }
 | 
						|
    while (sourcePtr < sourceLim) {
 | 
						|
        dest->setElementAt(*sourcePtr++, di++);
 | 
						|
    }
 | 
						|
 | 
						|
    dest->setSize(di, *fStatus);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//  setEqual    Set operation on UVector.
 | 
						|
//              Compare for equality.
 | 
						|
//              Elements must be sorted.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
 | 
						|
    return a->equals(*b);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
 | 
						|
//                 for each node in the tree.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
#ifdef RBBI_DEBUG
 | 
						|
void RBBITableBuilder::printPosSets(RBBINode *n) {
 | 
						|
    if (n==NULL) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    printf("\n");
 | 
						|
    RBBINode::printNodeHeader();
 | 
						|
    RBBINode::printNode(n);
 | 
						|
    RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
 | 
						|
 | 
						|
    RBBIDebugPrintf("         firstpos:  ");
 | 
						|
    printSet(n->fFirstPosSet);
 | 
						|
 | 
						|
    RBBIDebugPrintf("         lastpos:   ");
 | 
						|
    printSet(n->fLastPosSet);
 | 
						|
 | 
						|
    RBBIDebugPrintf("         followpos: ");
 | 
						|
    printSet(n->fFollowPos);
 | 
						|
 | 
						|
    printPosSets(n->fLeftChild);
 | 
						|
    printPosSets(n->fRightChild);
 | 
						|
}
 | 
						|
#endif
 | 
						|
 | 
						|
//
 | 
						|
//    findDuplCharClassFrom()
 | 
						|
//
 | 
						|
bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
 | 
						|
    int32_t numStates = fDStates->size();
 | 
						|
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
 | 
						|
 | 
						|
    for (; categories->first < numCols-1; categories->first++) {
 | 
						|
        // Note: dictionary & non-dictionary columns cannot be merged.
 | 
						|
        //       The limitSecond value prevents considering mixed pairs.
 | 
						|
        //       Dictionary categories are >= DictCategoriesStart.
 | 
						|
        //       Non dict categories are   <  DictCategoriesStart.
 | 
						|
        int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
 | 
						|
            fRB->fSetBuilder->getDictCategoriesStart() : numCols;
 | 
						|
        for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
 | 
						|
            // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
 | 
						|
            uint16_t table_base = 0;
 | 
						|
            uint16_t table_dupl = 1;
 | 
						|
            for (int32_t state=0; state<numStates; state++) {
 | 
						|
                RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
 | 
						|
                table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
 | 
						|
                table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
 | 
						|
                if (table_base != table_dupl) {
 | 
						|
                    break;
 | 
						|
                }
 | 
						|
            }
 | 
						|
            if (table_base == table_dupl) {
 | 
						|
                return true;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
    return false;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
//
 | 
						|
//    removeColumn()
 | 
						|
//
 | 
						|
void RBBITableBuilder::removeColumn(int32_t column) {
 | 
						|
    int32_t numStates = fDStates->size();
 | 
						|
    for (int32_t state=0; state<numStates; state++) {
 | 
						|
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
 | 
						|
        U_ASSERT(column < sd->fDtran->size());
 | 
						|
        sd->fDtran->removeElementAt(column);
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * findDuplicateState
 | 
						|
 */
 | 
						|
bool RBBITableBuilder::findDuplicateState(IntPair *states) {
 | 
						|
    int32_t numStates = fDStates->size();
 | 
						|
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
 | 
						|
 | 
						|
    for (; states->first<numStates-1; states->first++) {
 | 
						|
        RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
 | 
						|
        for (states->second=states->first+1; states->second<numStates; states->second++) {
 | 
						|
            RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
 | 
						|
            if (firstSD->fAccepting != duplSD->fAccepting ||
 | 
						|
                firstSD->fLookAhead != duplSD->fLookAhead ||
 | 
						|
                firstSD->fTagsIdx   != duplSD->fTagsIdx) {
 | 
						|
                continue;
 | 
						|
            }
 | 
						|
            bool rowsMatch = true;
 | 
						|
            for (int32_t col=0; col < numCols; ++col) {
 | 
						|
                int32_t firstVal = firstSD->fDtran->elementAti(col);
 | 
						|
                int32_t duplVal = duplSD->fDtran->elementAti(col);
 | 
						|
                if (!((firstVal == duplVal) ||
 | 
						|
                        ((firstVal == states->first || firstVal == states->second) &&
 | 
						|
                        (duplVal  == states->first || duplVal  == states->second)))) {
 | 
						|
                    rowsMatch = false;
 | 
						|
                    break;
 | 
						|
                }
 | 
						|
            }
 | 
						|
            if (rowsMatch) {
 | 
						|
                return true;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
    return false;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
 | 
						|
    int32_t numStates = fSafeTable->size();
 | 
						|
 | 
						|
    for (; states->first<numStates-1; states->first++) {
 | 
						|
        UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
 | 
						|
        for (states->second=states->first+1; states->second<numStates; states->second++) {
 | 
						|
            UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
 | 
						|
            bool rowsMatch = true;
 | 
						|
            int32_t numCols = firstRow->length();
 | 
						|
            for (int32_t col=0; col < numCols; ++col) {
 | 
						|
                int32_t firstVal = firstRow->charAt(col);
 | 
						|
                int32_t duplVal = duplRow->charAt(col);
 | 
						|
                if (!((firstVal == duplVal) ||
 | 
						|
                        ((firstVal == states->first || firstVal == states->second) &&
 | 
						|
                        (duplVal  == states->first || duplVal  == states->second)))) {
 | 
						|
                    rowsMatch = false;
 | 
						|
                    break;
 | 
						|
                }
 | 
						|
            }
 | 
						|
            if (rowsMatch) {
 | 
						|
                return true;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
    return false;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
void RBBITableBuilder::removeState(IntPair duplStates) {
 | 
						|
    const int32_t keepState = duplStates.first;
 | 
						|
    const int32_t duplState = duplStates.second;
 | 
						|
    U_ASSERT(keepState < duplState);
 | 
						|
    U_ASSERT(duplState < fDStates->size());
 | 
						|
 | 
						|
    RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
 | 
						|
    fDStates->removeElementAt(duplState);
 | 
						|
    delete duplSD;
 | 
						|
 | 
						|
    int32_t numStates = fDStates->size();
 | 
						|
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
 | 
						|
    for (int32_t state=0; state<numStates; ++state) {
 | 
						|
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
 | 
						|
        for (int32_t col=0; col<numCols; col++) {
 | 
						|
            int32_t existingVal = sd->fDtran->elementAti(col);
 | 
						|
            int32_t newVal = existingVal;
 | 
						|
            if (existingVal == duplState) {
 | 
						|
                newVal = keepState;
 | 
						|
            } else if (existingVal > duplState) {
 | 
						|
                newVal = existingVal - 1;
 | 
						|
            }
 | 
						|
            sd->fDtran->setElementAt(newVal, col);
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
void RBBITableBuilder::removeSafeState(IntPair duplStates) {
 | 
						|
    const int32_t keepState = duplStates.first;
 | 
						|
    const int32_t duplState = duplStates.second;
 | 
						|
    U_ASSERT(keepState < duplState);
 | 
						|
    U_ASSERT(duplState < fSafeTable->size());
 | 
						|
 | 
						|
    fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
 | 
						|
                                              // and will auto-delete the removed element.
 | 
						|
    int32_t numStates = fSafeTable->size();
 | 
						|
    for (int32_t state=0; state<numStates; ++state) {
 | 
						|
        UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
 | 
						|
        int32_t numCols = sd->length();
 | 
						|
        for (int32_t col=0; col<numCols; col++) {
 | 
						|
            int32_t existingVal = sd->charAt(col);
 | 
						|
            int32_t newVal = existingVal;
 | 
						|
            if (existingVal == duplState) {
 | 
						|
                newVal = keepState;
 | 
						|
            } else if (existingVal > duplState) {
 | 
						|
                newVal = existingVal - 1;
 | 
						|
            }
 | 
						|
            sd->setCharAt(col, static_cast<char16_t>(newVal));
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/*
 | 
						|
 * RemoveDuplicateStates
 | 
						|
 */
 | 
						|
int32_t RBBITableBuilder::removeDuplicateStates() {
 | 
						|
    IntPair dupls = {3, 0};
 | 
						|
    int32_t numStatesRemoved = 0;
 | 
						|
 | 
						|
    while (findDuplicateState(&dupls)) {
 | 
						|
        // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
 | 
						|
        removeState(dupls);
 | 
						|
        ++numStatesRemoved;
 | 
						|
    }
 | 
						|
    return numStatesRemoved;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   getTableSize()    Calculate the size of the runtime form of this
 | 
						|
//                     state transition table.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
int32_t  RBBITableBuilder::getTableSize() const {
 | 
						|
    int32_t    size = 0;
 | 
						|
    int32_t    numRows;
 | 
						|
    int32_t    numCols;
 | 
						|
    int32_t    rowSize;
 | 
						|
 | 
						|
    if (fTree == NULL) {
 | 
						|
        return 0;
 | 
						|
    }
 | 
						|
 | 
						|
    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
 | 
						|
 | 
						|
    numRows = fDStates->size();
 | 
						|
    numCols = fRB->fSetBuilder->getNumCharCategories();
 | 
						|
 | 
						|
    if (use8BitsForTable()) {
 | 
						|
        rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
 | 
						|
    } else {
 | 
						|
        rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
 | 
						|
    }
 | 
						|
    size   += numRows * rowSize;
 | 
						|
    return size;
 | 
						|
}
 | 
						|
 | 
						|
bool RBBITableBuilder::use8BitsForTable() const {
 | 
						|
    return fDStates->size() <= kMaxStateFor8BitsTable;
 | 
						|
}
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   exportTable()    export the state transition table in the format required
 | 
						|
//                    by the runtime engine.  getTableSize() bytes of memory
 | 
						|
//                    must be available at the output address "where".
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::exportTable(void *where) {
 | 
						|
    RBBIStateTable    *table = (RBBIStateTable *)where;
 | 
						|
    uint32_t           state;
 | 
						|
    int                col;
 | 
						|
 | 
						|
    if (U_FAILURE(*fStatus) || fTree == NULL) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
 | 
						|
    if (catCount > 0x7fff ||
 | 
						|
        fDStates->size() > 0x7fff) {
 | 
						|
        *fStatus = U_BRK_INTERNAL_ERROR;
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    table->fNumStates = fDStates->size();
 | 
						|
    table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
 | 
						|
    table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
 | 
						|
    table->fFlags     = 0;
 | 
						|
    if (use8BitsForTable()) {
 | 
						|
        table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
 | 
						|
        table->fFlags  |= RBBI_8BITS_ROWS;
 | 
						|
    } else {
 | 
						|
        table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
 | 
						|
    }
 | 
						|
    if (fRB->fLookAheadHardBreak) {
 | 
						|
        table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
 | 
						|
    }
 | 
						|
    if (fRB->fSetBuilder->sawBOF()) {
 | 
						|
        table->fFlags  |= RBBI_BOF_REQUIRED;
 | 
						|
    }
 | 
						|
 | 
						|
    for (state=0; state<table->fNumStates; state++) {
 | 
						|
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
 | 
						|
        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
 | 
						|
        if (use8BitsForTable()) {
 | 
						|
            U_ASSERT (sd->fAccepting <= 255);
 | 
						|
            U_ASSERT (sd->fLookAhead <= 255);
 | 
						|
            U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
 | 
						|
            RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
 | 
						|
            r8->fAccepting = sd->fAccepting;
 | 
						|
            r8->fLookAhead = sd->fLookAhead;
 | 
						|
            r8->fTagsIdx   = sd->fTagsIdx;
 | 
						|
            for (col=0; col<catCount; col++) {
 | 
						|
                U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
 | 
						|
                r8->fNextState[col] = sd->fDtran->elementAti(col);
 | 
						|
            }
 | 
						|
        } else {
 | 
						|
            U_ASSERT (sd->fAccepting <= 0xffff);
 | 
						|
            U_ASSERT (sd->fLookAhead <= 0xffff);
 | 
						|
            U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
 | 
						|
            row->r16.fAccepting = sd->fAccepting;
 | 
						|
            row->r16.fLookAhead = sd->fLookAhead;
 | 
						|
            row->r16.fTagsIdx   = sd->fTagsIdx;
 | 
						|
            for (col=0; col<catCount; col++) {
 | 
						|
                row->r16.fNextState[col] = sd->fDtran->elementAti(col);
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
 *   Synthesize a safe state table from the main state table.
 | 
						|
 */
 | 
						|
void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
 | 
						|
    // The safe table creation has three steps:
 | 
						|
 | 
						|
    // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
 | 
						|
    // following the pair do not depend on context or state before the pair. To test
 | 
						|
    // whether a pair is safe, run it through the main forward state table, starting
 | 
						|
    // from each state. If the the final state is the same, no matter what the starting state,
 | 
						|
    // the pair is safe.
 | 
						|
    //
 | 
						|
    // 2. Build a state table that recognizes the safe pairs. It's similar to their
 | 
						|
    // forward table, with a column for each input character [class], and a row for
 | 
						|
    // each state. Row 1 is the start state, and row 0 is the stop state. Initially
 | 
						|
    // create an additional state for each input character category; being in
 | 
						|
    // one of these states means that the character has been seen, and is potentially
 | 
						|
    // the first of a pair. In each of these rows, the entry for the second character
 | 
						|
    // of a safe pair is set to the stop state (0), indicating that a match was found.
 | 
						|
    // All other table entries are set to the state corresponding the current input
 | 
						|
    // character, allowing that charcter to be the of a start following pair.
 | 
						|
    //
 | 
						|
    // Because the safe rules are to be run in reverse, moving backwards in the text,
 | 
						|
    // the first and second pair categories are swapped when building the table.
 | 
						|
    //
 | 
						|
    // 3. Compress the table. There are typically many rows (states) that are
 | 
						|
    // equivalent - that have zeroes (match completed) in the same columns -
 | 
						|
    // and can be folded together.
 | 
						|
 | 
						|
    // Each safe pair is stored as two UChars in the safePair string.
 | 
						|
    UnicodeString safePairs;
 | 
						|
 | 
						|
    int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
 | 
						|
    int32_t numStates = fDStates->size();
 | 
						|
 | 
						|
    for (int32_t c1=0; c1<numCharClasses; ++c1) {
 | 
						|
        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
 | 
						|
            int32_t wantedEndState = -1;
 | 
						|
            int32_t endState = 0;
 | 
						|
            for (int32_t startState = 1; startState < numStates; ++startState) {
 | 
						|
                RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
 | 
						|
                int32_t s2 = startStateD->fDtran->elementAti(c1);
 | 
						|
                RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
 | 
						|
                endState = s2StateD->fDtran->elementAti(c2);
 | 
						|
                if (wantedEndState < 0) {
 | 
						|
                    wantedEndState = endState;
 | 
						|
                } else {
 | 
						|
                    if (wantedEndState != endState) {
 | 
						|
                        break;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
            }
 | 
						|
            if (wantedEndState == endState) {
 | 
						|
                safePairs.append((char16_t)c1);
 | 
						|
                safePairs.append((char16_t)c2);
 | 
						|
                // printf("(%d, %d) ", c1, c2);
 | 
						|
            }
 | 
						|
        }
 | 
						|
        // printf("\n");
 | 
						|
    }
 | 
						|
 | 
						|
    // Populate the initial safe table.
 | 
						|
    // The table as a whole is UVector<UnicodeString>
 | 
						|
    // Each row is represented by a UnicodeString, being used as a Vector<int16>.
 | 
						|
    // Row 0 is the stop state.
 | 
						|
    // Row 1 is the start sate.
 | 
						|
    // Row 2 and beyond are other states, initially one per char class, but
 | 
						|
    //   after initial construction, many of the states will be combined, compacting the table.
 | 
						|
    // The String holds the nextState data only. The four leading fields of a row, fAccepting,
 | 
						|
    // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
 | 
						|
 | 
						|
    U_ASSERT(fSafeTable == nullptr);
 | 
						|
    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
 | 
						|
    for (int32_t row=0; row<numCharClasses + 2; ++row) {
 | 
						|
        fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
 | 
						|
    }
 | 
						|
 | 
						|
    // From the start state, each input char class transitions to the state for that input.
 | 
						|
    UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
 | 
						|
    for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
 | 
						|
        // Note: +2 for the start & stop state.
 | 
						|
        startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
 | 
						|
    }
 | 
						|
 | 
						|
    // Initially make every other state table row look like the start state row,
 | 
						|
    for (int32_t row=2; row<numCharClasses+2; ++row) {
 | 
						|
        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
 | 
						|
        rowState = startState;   // UnicodeString assignment, copies contents.
 | 
						|
    }
 | 
						|
 | 
						|
    // Run through the safe pairs, set the next state to zero when pair has been seen.
 | 
						|
    // Zero being the stop state, meaning we found a safe point.
 | 
						|
    for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
 | 
						|
        int32_t c1 = safePairs.charAt(pairIdx);
 | 
						|
        int32_t c2 = safePairs.charAt(pairIdx + 1);
 | 
						|
 | 
						|
        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
 | 
						|
        rowState.setCharAt(c1, 0);
 | 
						|
    }
 | 
						|
 | 
						|
    // Remove duplicate or redundant rows from the table.
 | 
						|
    IntPair states = {1, 0};
 | 
						|
    while (findDuplicateSafeState(&states)) {
 | 
						|
        // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
 | 
						|
        removeSafeState(states);
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   getSafeTableSize()    Calculate the size of the runtime form of this
 | 
						|
//                         safe state table.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
int32_t  RBBITableBuilder::getSafeTableSize() const {
 | 
						|
    int32_t    size = 0;
 | 
						|
    int32_t    numRows;
 | 
						|
    int32_t    numCols;
 | 
						|
    int32_t    rowSize;
 | 
						|
 | 
						|
    if (fSafeTable == nullptr) {
 | 
						|
        return 0;
 | 
						|
    }
 | 
						|
 | 
						|
    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
 | 
						|
 | 
						|
    numRows = fSafeTable->size();
 | 
						|
    numCols = fRB->fSetBuilder->getNumCharCategories();
 | 
						|
 | 
						|
    if (use8BitsForSafeTable()) {
 | 
						|
        rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
 | 
						|
    } else {
 | 
						|
        rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
 | 
						|
    }
 | 
						|
    size   += numRows * rowSize;
 | 
						|
    return size;
 | 
						|
}
 | 
						|
 | 
						|
bool RBBITableBuilder::use8BitsForSafeTable() const {
 | 
						|
    return fSafeTable->size() <= kMaxStateFor8BitsTable;
 | 
						|
}
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   exportSafeTable()   export the state transition table in the format required
 | 
						|
//                       by the runtime engine.  getTableSize() bytes of memory
 | 
						|
//                       must be available at the output address "where".
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
void RBBITableBuilder::exportSafeTable(void *where) {
 | 
						|
    RBBIStateTable    *table = (RBBIStateTable *)where;
 | 
						|
    uint32_t           state;
 | 
						|
    int                col;
 | 
						|
 | 
						|
    if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
 | 
						|
    if (catCount > 0x7fff ||
 | 
						|
            fSafeTable->size() > 0x7fff) {
 | 
						|
        *fStatus = U_BRK_INTERNAL_ERROR;
 | 
						|
        return;
 | 
						|
    }
 | 
						|
 | 
						|
    table->fNumStates = fSafeTable->size();
 | 
						|
    table->fFlags     = 0;
 | 
						|
    if (use8BitsForSafeTable()) {
 | 
						|
        table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
 | 
						|
        table->fFlags  |= RBBI_8BITS_ROWS;
 | 
						|
    } else {
 | 
						|
        table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
 | 
						|
    }
 | 
						|
 | 
						|
    for (state=0; state<table->fNumStates; state++) {
 | 
						|
        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
 | 
						|
        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
 | 
						|
        if (use8BitsForSafeTable()) {
 | 
						|
            RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
 | 
						|
            r8->fAccepting = 0;
 | 
						|
            r8->fLookAhead = 0;
 | 
						|
            r8->fTagsIdx    = 0;
 | 
						|
            for (col=0; col<catCount; col++) {
 | 
						|
                U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
 | 
						|
                r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
 | 
						|
            }
 | 
						|
        } else {
 | 
						|
            row->r16.fAccepting = 0;
 | 
						|
            row->r16.fLookAhead = 0;
 | 
						|
            row->r16.fTagsIdx    = 0;
 | 
						|
            for (col=0; col<catCount; col++) {
 | 
						|
                row->r16.fNextState[col] = rowString->charAt(col);
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   printSet    Debug function.   Print the contents of a UVector
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
#ifdef RBBI_DEBUG
 | 
						|
void RBBITableBuilder::printSet(UVector *s) {
 | 
						|
    int32_t  i;
 | 
						|
    for (i=0; i<s->size(); i++) {
 | 
						|
        const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
 | 
						|
        RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("\n");
 | 
						|
}
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   printStates    Debug Function.  Dump the fully constructed state transition table.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
#ifdef RBBI_DEBUG
 | 
						|
void RBBITableBuilder::printStates() {
 | 
						|
    int     c;    // input "character"
 | 
						|
    int     n;    // state number
 | 
						|
 | 
						|
    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
 | 
						|
    RBBIDebugPrintf("      | Acc  LA    Tag");
 | 
						|
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
 | 
						|
        RBBIDebugPrintf(" %3d", c);
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("\n");
 | 
						|
    RBBIDebugPrintf("      |---------------");
 | 
						|
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
 | 
						|
        RBBIDebugPrintf("----");
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("\n");
 | 
						|
 | 
						|
    for (n=0; n<fDStates->size(); n++) {
 | 
						|
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
 | 
						|
        RBBIDebugPrintf("  %3d | " , n);
 | 
						|
        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
 | 
						|
        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
 | 
						|
            RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
 | 
						|
        }
 | 
						|
        RBBIDebugPrintf("\n");
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("\n\n");
 | 
						|
}
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   printSafeTable    Debug Function.  Dump the fully constructed safe table.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
#ifdef RBBI_DEBUG
 | 
						|
void RBBITableBuilder::printReverseTable() {
 | 
						|
    int     c;    // input "character"
 | 
						|
    int     n;    // state number
 | 
						|
 | 
						|
    RBBIDebugPrintf("    Safe Reverse Table \n");
 | 
						|
    if (fSafeTable == nullptr) {
 | 
						|
        RBBIDebugPrintf("   --- nullptr ---\n");
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
 | 
						|
    RBBIDebugPrintf("      | Acc  LA    Tag");
 | 
						|
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
 | 
						|
        RBBIDebugPrintf(" %2d", c);
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("\n");
 | 
						|
    RBBIDebugPrintf("      |---------------");
 | 
						|
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
 | 
						|
        RBBIDebugPrintf("---");
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("\n");
 | 
						|
 | 
						|
    for (n=0; n<fSafeTable->size(); n++) {
 | 
						|
        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
 | 
						|
        RBBIDebugPrintf("  %3d | " , n);
 | 
						|
        RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
 | 
						|
        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
 | 
						|
            RBBIDebugPrintf(" %2d", rowString->charAt(c));
 | 
						|
        }
 | 
						|
        RBBIDebugPrintf("\n");
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("\n\n");
 | 
						|
}
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   printRuleStatusTable    Debug Function.  Dump the common rule status table
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
#ifdef RBBI_DEBUG
 | 
						|
void RBBITableBuilder::printRuleStatusTable() {
 | 
						|
    int32_t  thisRecord = 0;
 | 
						|
    int32_t  nextRecord = 0;
 | 
						|
    int      i;
 | 
						|
    UVector  *tbl = fRB->fRuleStatusVals;
 | 
						|
 | 
						|
    RBBIDebugPrintf("index |  tags \n");
 | 
						|
    RBBIDebugPrintf("-------------------\n");
 | 
						|
 | 
						|
    while (nextRecord < tbl->size()) {
 | 
						|
        thisRecord = nextRecord;
 | 
						|
        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
 | 
						|
        RBBIDebugPrintf("%4d   ", thisRecord);
 | 
						|
        for (i=thisRecord+1; i<nextRecord; i++) {
 | 
						|
            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
 | 
						|
        }
 | 
						|
        RBBIDebugPrintf("\n");
 | 
						|
    }
 | 
						|
    RBBIDebugPrintf("\n\n");
 | 
						|
}
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   RBBIStateDescriptor     Methods.  This is a very struct-like class
 | 
						|
//                           Most access is directly to the fields.
 | 
						|
//
 | 
						|
//-----------------------------------------------------------------------------
 | 
						|
 | 
						|
RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
 | 
						|
    fMarked    = FALSE;
 | 
						|
    fAccepting = 0;
 | 
						|
    fLookAhead = 0;
 | 
						|
    fTagsIdx   = 0;
 | 
						|
    fTagVals   = NULL;
 | 
						|
    fPositions = NULL;
 | 
						|
    fDtran     = NULL;
 | 
						|
 | 
						|
    fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
 | 
						|
    if (U_FAILURE(*fStatus)) {
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    if (fDtran == NULL) {
 | 
						|
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
 | 
						|
                                           //   It is indexed by input symbols, and will
 | 
						|
                                           //   hold  the next state number for each
 | 
						|
                                           //   symbol.
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
RBBIStateDescriptor::~RBBIStateDescriptor() {
 | 
						|
    delete       fPositions;
 | 
						|
    delete       fDtran;
 | 
						|
    delete       fTagVals;
 | 
						|
    fPositions = NULL;
 | 
						|
    fDtran     = NULL;
 | 
						|
    fTagVals   = NULL;
 | 
						|
}
 | 
						|
 | 
						|
U_NAMESPACE_END
 | 
						|
 | 
						|
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
 |