/* * Copyright 2001-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id: RegularExpression.cpp 225473 2005-07-27 07:09:16Z dbertoni $ */ // --------------------------------------------------------------------------- // Includes // --------------------------------------------------------------------------- #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include XERCES_CPP_NAMESPACE_BEGIN // --------------------------------------------------------------------------- // Static member data initialization // --------------------------------------------------------------------------- const unsigned int RegularExpression::MARK_PARENS = 1; const unsigned int RegularExpression::IGNORE_CASE = 2; const unsigned int RegularExpression::SINGLE_LINE = 4; const unsigned int RegularExpression::MULTIPLE_LINE = 8; const unsigned int RegularExpression::EXTENDED_COMMENT = 16; const unsigned int RegularExpression::USE_UNICODE_CATEGORY = 32; const unsigned int RegularExpression::UNICODE_WORD_BOUNDARY = 64; const unsigned int RegularExpression::PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 128; const unsigned int RegularExpression::PROHIBIT_FIXED_STRING_OPTIMIZATION = 256; const unsigned int RegularExpression::XMLSCHEMA_MODE = 512; const unsigned int RegularExpression::SPECIAL_COMMA = 1024; const unsigned short RegularExpression::WT_IGNORE = 0; const unsigned short 
RegularExpression::WT_LETTER = 1; const unsigned short RegularExpression::WT_OTHER = 2; RangeToken* RegularExpression::fWordRange = 0; static void localCleanup() { RegularExpression::staticCleanup(); } static XMLRegisterCleanup WordRangeCleanup; bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2) { if (ch1 >= 0x10000) { XMLCh string1[2]; XMLCh string2[2]; RegxUtil::decomposeToSurrogates(ch1, string1[0], string1[1]); if (ch2 >= 0x10000) { RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]); } else { // XMLString::compareNIString is broken, because it assume the // two strings must be of the same length. Note that two strings // of different length could compare as equal, because there is no // guarantee that a Unicode code point that is encoded in UTF-16 as // a surrogate pair does not have a case mapping to a code point // that is not in the surrogate range. Just to be safe, we pad the // shorter string with a space, which cannot hvae a case mapping. 
string2[0] = (XMLCh)ch2; string2[1] = chSpace; } return (0==XMLString::compareNIString(string1, string2, 2)); } else if (ch2 >= 0x10000) { const XMLCh string1[2] = { (XMLCh)ch1, chSpace }; XMLCh string2[2]; RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]); return (0==XMLString::compareNIString(string1, string2, 2)); } else { const XMLCh char1 = (XMLCh)ch1; const XMLCh char2 = (XMLCh)ch2; return (0==XMLString::compareNIString(&char1, &char2, 1)); } } // --------------------------------------------------------------------------- // RegularExpression::Context: Constructors and Destructor // --------------------------------------------------------------------------- RegularExpression::Context::Context(MemoryManager* const manager) : fAdoptMatch(false) , fStart(0) , fLimit(0) , fLength(0) , fSize(0) , fStringMaxLen(0) , fOffsets(0) , fMatch(0) , fString(0) , fMemoryManager(manager) { } RegularExpression::Context::~Context() { if (fOffsets) fMemoryManager->deallocate(fOffsets);//delete [] fOffsets; if (fAdoptMatch) delete fMatch; } // --------------------------------------------------------------------------- // RegularExpression::Context: Public methods // --------------------------------------------------------------------------- void RegularExpression::Context::reset(const XMLCh* const string , const int stringLen , const int start , const int limit , const int noClosures) { fString = string; fStringMaxLen = stringLen; fStart = start; fLimit = limit; fLength = fLimit - fStart; if (fAdoptMatch) delete fMatch; fMatch = 0; if (fSize != noClosures) { if (fOffsets) fMemoryManager->deallocate(fOffsets);//delete [] fOffsets; fOffsets = (int*) fMemoryManager->allocate(noClosures * sizeof(int));//new int[noClosures]; } fSize = noClosures; for (int i = 0; i< fSize; i++) fOffsets[i] = -1; } bool RegularExpression::Context::nextCh(XMLInt32& ch, int& offset, const short direction) { ch = fString[offset]; if (RegxUtil::isHighSurrogate(ch)) { if ((offset + 1 < fLimit) && 
(direction > 0) && RegxUtil::isLowSurrogate(fString[offset+1])) { ch = RegxUtil::composeFromSurrogate(ch, fString[++offset]); } else return false; } else if (RegxUtil::isLowSurrogate(ch)) { if ((offset - 1 >= 0) && (direction <= 0) && RegxUtil::isHighSurrogate(fString[offset-1])) { ch = RegxUtil::composeFromSurrogate(fString[--offset], ch); } else return false; } return true; } // --------------------------------------------------------------------------- // RegularExpression: Constructors and Destructors // --------------------------------------------------------------------------- typedef JanitorMemFunCall CleanupType; RegularExpression::RegularExpression(const char* const pattern, MemoryManager* const manager) :fHasBackReferences(false), fFixedStringOnly(false), fNoGroups(0), fMinLength(0), fNoClosures(0), fOptions(0), fBMPattern(0), fPattern(0), fFixedString(0), fOperations(0), fTokenTree(0), fFirstChar(0), fOpFactory(manager), fTokenFactory(0), fMemoryManager(manager) { CleanupType cleanup(this, &RegularExpression::cleanUp); try { XMLCh* tmpBuf = XMLString::transcode(pattern, fMemoryManager); ArrayJanitor janBuf(tmpBuf, fMemoryManager); setPattern(tmpBuf); } catch(const OutOfMemoryException&) { cleanup.release(); throw; } cleanup.release(); } RegularExpression::RegularExpression(const char* const pattern, const char* const options, MemoryManager* const manager) :fHasBackReferences(false), fFixedStringOnly(false), fNoGroups(0), fMinLength(0), fNoClosures(0), fOptions(0), fBMPattern(0), fPattern(0), fFixedString(0), fOperations(0), fTokenTree(0), fFirstChar(0), fOpFactory(manager), fTokenFactory(0), fMemoryManager(manager) { CleanupType cleanup(this, &RegularExpression::cleanUp); try { XMLCh* tmpBuf = XMLString::transcode(pattern, fMemoryManager); ArrayJanitor janBuf(tmpBuf, fMemoryManager); XMLCh* tmpOptions = XMLString::transcode(options, fMemoryManager); ArrayJanitor janOps(tmpOptions, fMemoryManager); setPattern(tmpBuf, tmpOptions); } catch(const 
OutOfMemoryException&) { cleanup.release(); throw; } cleanup.release(); } RegularExpression::RegularExpression(const XMLCh* const pattern, MemoryManager* const manager) :fHasBackReferences(false), fFixedStringOnly(false), fNoGroups(0), fMinLength(0), fNoClosures(0), fOptions(0), fBMPattern(0), fPattern(0), fFixedString(0), fOperations(0), fTokenTree(0), fFirstChar(0), fOpFactory(manager), fTokenFactory(0), fMemoryManager(manager) { CleanupType cleanup(this, &RegularExpression::cleanUp); try { setPattern(pattern); } catch(const OutOfMemoryException&) { cleanup.release(); throw; } cleanup.release(); } RegularExpression::RegularExpression(const XMLCh* const pattern, const XMLCh* const options, MemoryManager* const manager) :fHasBackReferences(false), fFixedStringOnly(false), fNoGroups(0), fMinLength(0), fNoClosures(0), fOptions(0), fBMPattern(0), fPattern(0), fFixedString(0), fOperations(0), fTokenTree(0), fFirstChar(0), fOpFactory(manager), fTokenFactory(0), fMemoryManager(manager) { CleanupType cleanup(this, &RegularExpression::cleanUp); try { setPattern(pattern, options); } catch(const OutOfMemoryException&) { cleanup.release(); throw; } cleanup.release(); } RegularExpression::~RegularExpression() { cleanUp(); } // --------------------------------------------------------------------------- // RegularExpression: Setter methods // --------------------------------------------------------------------------- void RegularExpression::setPattern(const XMLCh* const pattern, const XMLCh* const options) { fTokenFactory = new (fMemoryManager) TokenFactory(fMemoryManager); fOptions = parseOptions(options); fPattern = XMLString::replicate(pattern, fMemoryManager); // the following construct causes an error in an Intel 7.1 32 bit compiler for // red hat linux 7.2 // (when an exception is thrown the wrong object is deleted) //RegxParser* regxParser = isSet(fOptions, XMLSCHEMA_MODE) // ? 
new (fMemoryManager) ParserForXMLSchema(fMemoryManager) // : new (fMemoryManager) RegxParser(fMemoryManager); RegxParser* regxParser; if (isSet(fOptions, XMLSCHEMA_MODE)) { regxParser = new (fMemoryManager) ParserForXMLSchema(fMemoryManager); } else { regxParser = new (fMemoryManager) RegxParser(fMemoryManager); } if (regxParser) { regxParser->setTokenFactory(fTokenFactory); } Janitor janRegxParser(regxParser); fTokenTree = regxParser->parse(fPattern, fOptions); fNoGroups = regxParser->getNoParen(); fHasBackReferences = regxParser->hasBackReferences(); prepare(); } // --------------------------------------------------------------------------- // RegularExpression: Matching methods // --------------------------------------------------------------------------- bool RegularExpression::matches(const char* const expression , MemoryManager* const manager) { XMLCh* tmpBuf = XMLString::transcode(expression, manager); ArrayJanitor janBuf(tmpBuf, manager); return matches(tmpBuf, 0, XMLString::stringLen(tmpBuf), 0, manager); } bool RegularExpression::matches(const char* const expression, const int start, const int end , MemoryManager* const manager) { XMLCh* tmpBuf = XMLString::transcode(expression, manager); ArrayJanitor janBuf(tmpBuf, manager); return matches(tmpBuf, start, end, 0, manager); } bool RegularExpression::matches(const char* const expression, Match* const match , MemoryManager* const manager) { XMLCh* tmpBuf = XMLString::transcode(expression, manager); ArrayJanitor janBuf(tmpBuf, manager); return matches(tmpBuf, 0, XMLString::stringLen(tmpBuf), match, manager); } bool RegularExpression::matches(const char* const expression, const int start, const int end, Match* const pMatch , MemoryManager* const manager) { XMLCh* tmpBuf = XMLString::transcode(expression, manager); ArrayJanitor janBuf(tmpBuf, manager); return matches(tmpBuf, start, end, pMatch, manager); } // --------------------------------------------------------------------------- // RegularExpression: 
Matching methods - Wide char version // --------------------------------------------------------------------------- bool RegularExpression::matches(const XMLCh* const expression, MemoryManager* const manager) { return matches(expression, 0, XMLString::stringLen(expression), 0, manager); } bool RegularExpression::matches(const XMLCh* const expression, const int start, const int end , MemoryManager* const manager) { return matches(expression, start, end, 0, manager); } bool RegularExpression::matches(const XMLCh* const expression, Match* const match , MemoryManager* const manager) { return matches(expression, 0, XMLString::stringLen(expression), match, manager); } bool RegularExpression::matches(const XMLCh* const expression, const int start, const int end, Match* const pMatch , MemoryManager* const manager) { Context context(manager); int strLength = XMLString::stringLen(expression); context.reset(expression, strLength, start, end, fNoClosures); bool adoptMatch = false; Match* lMatch = pMatch; if (lMatch != 0) { lMatch->setNoGroups(fNoGroups); } else if (fHasBackReferences) { lMatch = new (fMemoryManager) Match(fMemoryManager); lMatch->setNoGroups(fNoGroups); adoptMatch = true; } if (context.fAdoptMatch) delete context.fMatch; context.fMatch = lMatch; context.fAdoptMatch = adoptMatch; if (isSet(fOptions, XMLSCHEMA_MODE)) { int matchEnd = match(&context, fOperations, context.fStart, 1); if (matchEnd == context.fLimit) { if (context.fMatch != 0) { context.fMatch->setStartPos(0, context.fStart); context.fMatch->setEndPos(0, matchEnd); } return true; } return false; } /* * If the pattern has only fixed string, use Boyer-Moore */ if (fFixedStringOnly) { int ret = fBMPattern->matches(expression, context.fStart, context.fLimit); if (ret >= 0) { if (context.fMatch != 0) { context.fMatch->setStartPos(0, ret); context.fMatch->setEndPos(0, ret + strLength); } return true; } return false; } /* * If the pattern contains a fixed string, we check with Boyer-Moore * whether the 
text contains the fixed string or not. If not found * return false */ if (fFixedString != 0) { int ret = fBMPattern->matches(expression, context.fStart, context.fLimit); if (ret < 0) { // No match return false; } } int limit = context.fLimit - fMinLength; int matchStart; int matchEnd = -1; /* * Check whether the expression start with ".*" */ if (fOperations != 0 && fOperations->getOpType() == Op::O_CLOSURE && fOperations->getChild()->getOpType() == Op::O_DOT) { if (isSet(fOptions, SINGLE_LINE)) { matchStart = context.fStart; matchEnd = match(&context, fOperations, matchStart, 1); } else { bool previousIsEOL = true; for (matchStart=context.fStart; matchStart<=limit; matchStart++) { XMLCh ch = expression[matchStart]; if (RegxUtil::isEOLChar(ch)) { previousIsEOL = true; } else { if (previousIsEOL) { if (0 <= (matchEnd = match(&context, fOperations, matchStart, 1))) break; } previousIsEOL = false; } } } } else { /* * Optimization against the first char */ if (fFirstChar != 0) { bool ignoreCase = isSet(fOptions, IGNORE_CASE); RangeToken* range = fFirstChar; if (ignoreCase) range = fFirstChar->getCaseInsensitiveToken(fTokenFactory); for (matchStart=context.fStart; matchStart<=limit; matchStart++) { XMLInt32 ch; if (!context.nextCh(ch, matchStart, 1)) break; if (!range->match(ch)) { continue; } if (0 <= (matchEnd = match(&context,fOperations,matchStart,1))) break; } } else { /* * Straightforward matching */ for (matchStart=context.fStart; matchStart<=limit; matchStart++) { if (0 <= (matchEnd = match(&context,fOperations,matchStart,1))) break; } } } if (matchEnd >= 0) { if (context.fMatch != 0) { context.fMatch->setStartPos(0, matchStart); context.fMatch->setEndPos(0, matchEnd); } return true; } return false; } // --------------------------------------------------------------------------- // RegularExpression: Tokenize methods // --------------------------------------------------------------------------- RefArrayVectorOf* RegularExpression::tokenize(const char* const 
expression) { XMLCh* tmpBuf = XMLString::transcode(expression, fMemoryManager); ArrayJanitor janBuf(tmpBuf, fMemoryManager); return tokenize(tmpBuf, 0, XMLString::stringLen(tmpBuf)); } RefArrayVectorOf* RegularExpression::tokenize(const char* const expression, const int start, const int end) { XMLCh* tmpBuf = XMLString::transcode(expression, fMemoryManager); ArrayJanitor janBuf(tmpBuf, fMemoryManager); return tokenize(tmpBuf, start, end); } // --------------------------------------------------------------------------- // RegularExpression: Tokenize methods - Wide char version // --------------------------------------------------------------------------- RefArrayVectorOf* RegularExpression::tokenize(const XMLCh* const expression) { return tokenize(expression, 0, XMLString::stringLen(expression), 0); } RefArrayVectorOf* RegularExpression::tokenize(const XMLCh* const expression, const int start, const int end) { return tokenize(expression, start, end, 0); } RefArrayVectorOf* RegularExpression::tokenize(const XMLCh* const expression, const int start, const int end, RefVectorOf *subEx){ RefArrayVectorOf* tokenStack = new (fMemoryManager) RefArrayVectorOf(16, true, fMemoryManager); Context context(fMemoryManager); int strLength = XMLString::stringLen(expression); context.reset(expression, strLength, start, end, fNoClosures); Match* lMatch = 0; bool adoptMatch = false; if (subEx || fHasBackReferences) { lMatch = new (fMemoryManager) Match(fMemoryManager); adoptMatch = true; lMatch->setNoGroups(fNoGroups); } if (context.fAdoptMatch) delete context.fMatch; context.fMatch = lMatch; context.fAdoptMatch = adoptMatch; int tokStart = start; int matchStart = start; for (; matchStart <= end; matchStart++) { int matchEnd = match(&context, fOperations, matchStart, 1); if (matchEnd != -1) { if (context.fMatch != 0) { context.fMatch->setStartPos(0, context.fStart); context.fMatch->setEndPos(0, matchEnd); } if (subEx){ subEx->addElement(lMatch); lMatch = new (fMemoryManager) 
Match(*(context.fMatch)); adoptMatch = true; context.fAdoptMatch = adoptMatch; context.fMatch = lMatch; } XMLCh* token; if (tokStart == matchStart){ if (tokStart == strLength){ tokStart--; break; } token = (XMLCh*) fMemoryManager->allocate(sizeof(XMLCh));//new XMLCh[1]; token[0] = chNull; // When you tokenize using zero string, will return each // token in the string. Since the zero string will also // match the start/end characters, resulting in empty // tokens, we ignore them and do not add them to the stack. if (!XMLString::equals(fPattern, &chNull)) tokenStack->addElement(token); else fMemoryManager->deallocate(token);//delete[] token; } else { token = (XMLCh*) fMemoryManager->allocate ( (matchStart + 1 - tokStart) * sizeof(XMLCh) );//new XMLCh[matchStart + 1 - tokStart]; XMLString::subString(token, expression, tokStart, matchStart, fMemoryManager); tokenStack->addElement(token); } tokStart = matchEnd; //decrement matchStart as will increment it at the top of the loop if (matchStart < matchEnd - 1) matchStart = matchEnd - 1; } } XMLCh* token; if (matchStart == tokStart + 1){ token = (XMLCh*) fMemoryManager->allocate(sizeof(XMLCh));//new XMLCh[1]; token[0] = chNull; } else { token = (XMLCh*) fMemoryManager->allocate ( (strLength + 1 - tokStart) * sizeof(XMLCh) );//new XMLCh[strLength + 1 - tokStart]; XMLString::subString(token, expression, tokStart, strLength, fMemoryManager); } if (!XMLString::equals(fPattern, &chNull)) tokenStack->addElement(token); else fMemoryManager->deallocate(token);//delete[] token; return tokenStack; } // ----------------------------------------------------------------------- // RegularExpression: Replace methods // ----------------------------------------------------------------------- XMLCh* RegularExpression::replace(const char* const matchString, const char* const replaceString){ XMLCh* tmpBuf = XMLString::transcode(matchString, fMemoryManager); ArrayJanitor janBuf(tmpBuf, fMemoryManager); XMLCh* tmpBuf2 = 
XMLString::transcode(replaceString, fMemoryManager); ArrayJanitor janBuf2(tmpBuf2, fMemoryManager); return replace(tmpBuf, tmpBuf2, 0, XMLString::stringLen(tmpBuf)); } XMLCh* RegularExpression::replace(const char* const matchString, const char* const replaceString, const int start, const int end){ XMLCh* tmpBuf = XMLString::transcode(matchString, fMemoryManager); ArrayJanitor janBuf(tmpBuf, fMemoryManager); XMLCh* tmpBuf2 = XMLString::transcode(replaceString, fMemoryManager); ArrayJanitor janBuf2(tmpBuf2, fMemoryManager); return replace(tmpBuf, tmpBuf2, start, end); } // --------------------------------------------------------------------------- // RegularExpression: Replace methods - Wide char version // --------------------------------------------------------------------------- XMLCh* RegularExpression::replace(const XMLCh* const matchString, const XMLCh* const replaceString){ return replace(matchString, replaceString, 0, XMLString::stringLen(matchString)); } XMLCh* RegularExpression::replace(const XMLCh* const matchString, const XMLCh* const replaceString, const int start, const int end) { //check if matches zero length string - throw error if so if (matches(XMLUni::fgZeroLenString, fMemoryManager)){ ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, fMemoryManager); } RefVectorOf *subEx = new (fMemoryManager) RefVectorOf(10, true, fMemoryManager); Janitor > janSubEx(subEx); //Call to tokenize with Match vector so that we keep track of the locations //of the subExpression within each of the matches RefArrayVectorOf* tokenStack = tokenize(matchString, start, end, subEx); Janitor > janTokStack(tokenStack); XMLBuffer result(1023, fMemoryManager); int numSubEx = 0; if (subEx && subEx->size() > 0) numSubEx = subEx->elementAt(0)->getNoGroups() - 1; int tokStackSize = tokenStack->size(); const XMLCh* curRepString = XMLString::replicate(replaceString, fMemoryManager); for (int i = 0; i < tokStackSize; i++){ 
result.append(tokenStack->elementAt(i)); if (i != tokStackSize - 1) { //if there are subExpressions, then determine the string we want to //substitute in. if (numSubEx != 0) { fMemoryManager->deallocate((XMLCh*)curRepString); curRepString = subInExp(replaceString, matchString, subEx->elementAt(i)); } result.append(curRepString); } } fMemoryManager->deallocate((XMLCh*)curRepString); return XMLString::replicate(result.getRawBuffer(), fMemoryManager); } // ----------------------------------------------------------------------- // Static initialize and cleanup methods // ----------------------------------------------------------------------- void XMLInitializer::initializeRegularExpression() { RegularExpression::staticInitialize(XMLPlatformUtils::fgMemoryManager); } void RegularExpression::staticInitialize(MemoryManager* memoryManager) { fWordRange = TokenFactory::staticGetRange(fgUniIsWord, false); if (fWordRange == 0) ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Regex_RangeTokenGetError, fgUniIsWord, memoryManager); WordRangeCleanup.registerCleanup(localCleanup); } // --------------------------------------------------------------------------- // RegularExpression: Helpers methods // --------------------------------------------------------------------------- int RegularExpression::getOptionValue(const XMLCh ch) { int ret = 0; switch (ch) { case chLatin_i: ret = IGNORE_CASE; break; case chLatin_m: ret = MULTIPLE_LINE; break; case chLatin_s: ret = SINGLE_LINE; break; case chLatin_x: ret = EXTENDED_COMMENT; break; case chLatin_u: ret = USE_UNICODE_CATEGORY; break; case chLatin_w: ret = UNICODE_WORD_BOUNDARY; break; case chLatin_F: ret = PROHIBIT_FIXED_STRING_OPTIMIZATION; break; case chLatin_H: ret = PROHIBIT_HEAD_CHARACTER_OPTIMIZATION; break; case chLatin_X: ret = XMLSCHEMA_MODE; break; case chComma: ret = SPECIAL_COMMA; break; default: break; } return ret; } int RegularExpression::match(Context* const context, const Op* const operations , int offset, const short 
direction)
{
    // Walks the operation chain starting at 'operations', beginning at
    // 'offset' in context->fString.  Returns the offset reached when the
    // chain is exhausted, or -1 when no match is possible.  'direction' is
    // passed on to the per-operation helpers (> 0 forward, otherwise
    // backward).
    const Op* tmpOp = operations;
    bool ignoreCase = isSet(fOptions, IGNORE_CASE);

    while (true) {

        // End of the chain: everything matched.
        if (tmpOp == 0)
            break;

        if (offset > context->fLimit || offset < context->fStart)
            return -1;

        switch(tmpOp->getOpType()) {
        case Op::O_CHAR:
            if (!matchChar(context, tmpOp->getData(), offset, direction,
                           ignoreCase))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_DOT:
            if (!matchDot(context, offset, direction))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_RANGE:
        case Op::O_NRANGE:
            if (!matchRange(context, tmpOp, offset, direction, ignoreCase))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_ANCHOR:
            if (!matchAnchor(context, tmpOp->getData(), offset))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_BACKREFERENCE:
            if (!matchBackReference(context, tmpOp->getData(), offset,
                                    direction, ignoreCase))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_STRING:
            if (!matchString(context, tmpOp->getLiteral(), offset, direction,
                             ignoreCase))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_CLOSURE:
            {
                // For closures with an id >= 0, fOffsets[id] remembers the
                // offset at which the closure was entered.  Entering it again
                // at the same offset clears the record and continues with
                // the next op instead of trying the child once more.
                XMLInt32 id = tmpOp->getData();
                if (id >= 0) {
                    int prevOffset = context->fOffsets[id];
                    if (prevOffset < 0 || prevOffset != offset) {
                        context->fOffsets[id] = offset;
                    }
                    else {

                        context->fOffsets[id] = -1;
                        tmpOp = tmpOp->getNextOp();
                        break;
                    }
                }

                // Try the child first; fall through to the next op when the
                // child path fails.
                int ret = match(context, tmpOp->getChild(), offset, direction);
                if (id >= 0) {
                    context->fOffsets[id] = -1;
                }

                if (ret >= 0)
                    return ret;

                tmpOp = tmpOp->getNextOp();
            }
            break;
        case Op::O_QUESTION:
            {
                // Child first, then the next op.
                int ret = match(context, tmpOp->getChild(), offset, direction);
                if (ret >= 0)
                    return ret;
                tmpOp = tmpOp->getNextOp();
            }
            break;
        case Op::O_NONGREEDYCLOSURE:
        case Op::O_NONGREEDYQUESTION:
            {
                // Next op first; only when that fails is the child tried.
                int ret = match(context,tmpOp->getNextOp(),offset,direction);
                if (ret >= 0)
                    return ret;
                tmpOp = tmpOp->getChild();
            }
            break;
        case Op::O_UNION:
            {
                return matchUnion(context, tmpOp, offset, direction);
            }
        case Op::O_CAPTURE:
            // Without a Match object (or for data 0) the capture op is
            // simply skipped.
            if (context->fMatch != 0 && tmpOp->getData() != 0)
                return matchCapture(context, tmpOp, offset, direction);
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_LOOKAHEAD:
            // Look-around ops test the child with a fixed direction and
            // leave 'offset' untouched.
            if (0 > match(context, tmpOp->getChild(), offset, 1))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_NEGATIVELOOKAHEAD:
            if (0 <= match(context, tmpOp->getChild(), offset, 1))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_LOOKBEHIND:
            if (0 > match(context, tmpOp->getChild(), offset, -1))
                return - 1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_NEGATIVELOOKBEHIND:
            if (0 <= match(context, tmpOp->getChild(), offset, -1))
                return -1;
            tmpOp = tmpOp->getNextOp();
            break;
        case Op::O_INDEPENDENT:
        case Op::O_MODIFIER:
            {
                // The child's result becomes the new offset; matching then
                // continues with the next op.
                int ret = (tmpOp->getOpType() == Op::O_INDEPENDENT)
                    ? match(context, tmpOp->getChild(), offset, direction)
                    : matchModifier(context, tmpOp, offset, direction);
                if (ret < 0)
                    return ret;
                offset = ret;
                tmpOp = tmpOp->getNextOp();
            }
            break;
        case Op::O_CONDITION:
            if (tmpOp->getRefNo() >= fNoGroups)
                return -1;
            if (matchCondition(context, tmpOp, offset, direction))
                tmpOp = tmpOp->getYesFlow();
            else
                if (tmpOp->getNoFlow() != 0)
                    tmpOp = tmpOp->getNoFlow();
                else
                    tmpOp = tmpOp->getNextOp();
            break;
        }
    }

    return offset;
}

// Matches the single code point 'ch' at 'offset' (forward) or just before
// it (backward).  On success 'offset' is moved past the matched character
// in the direction of travel; on failure it is left unchanged.
bool RegularExpression::matchChar(Context* const context,
                                  const XMLInt32 ch, int& offset,
                                  const short direction, const bool ignoreCase)
{
    int tmpOffset = direction > 0 ? offset : offset - 1;

    if (tmpOffset >= context->fLimit || tmpOffset < 0)
        return false;

    XMLInt32 strCh = 0;

    if (!context->nextCh(strCh, tmpOffset, direction))
        return false;

    bool match = ignoreCase ? matchIgnoreCase(ch, strCh)
                            : (ch == strCh);
    if (!match)
        return false;

    offset = (direction > 0) ? ++tmpOffset : tmpOffset;

    return true;
}

// Matches the '.' operation at 'offset'; 'offset' is updated on success.
bool RegularExpression::matchDot(Context* const context, int& offset,
                                 const short direction)
{
    int tmpOffset = direction > 0 ?
        offset : offset - 1;

    if (tmpOffset >= context->fLimit || tmpOffset < 0)
        return false;

    XMLInt32 strCh = 0;

    if (!context->nextCh(strCh, tmpOffset, direction))
        return false;

    if (!isSet(fOptions, SINGLE_LINE)) {

        // Without SINGLE_LINE a forward '.' rejects end-of-line characters.
        if (direction > 0 && RegxUtil::isEOLChar(strCh))
            return false;

        // NOTE(review): when matching backward this rejects every character
        // that is NOT an end-of-line character, the opposite of the forward
        // rule above - confirm this is intended.
        if (direction <= 0 && !RegxUtil::isEOLChar(strCh) )
            return false;
    }

    offset = (direction > 0) ? ++tmpOffset : tmpOffset;
    return true;
}

// Matches one character against the range token attached to 'op'.  With
// ignoreCase the token's case-insensitive variant is used.  'offset' is
// updated on success only.
bool RegularExpression::matchRange(Context* const context, const Op* const op,
                                   int& offset, const short direction,
                                   const bool ignoreCase)
{
    int tmpOffset = direction > 0 ? offset : offset - 1;

    if (tmpOffset >= context->fLimit || tmpOffset < 0)
        return false;

    XMLInt32 strCh = 0;

    if (!context->nextCh(strCh, tmpOffset, direction))
        return false;

    RangeToken* tok = (RangeToken *) op->getToken();
    bool match = false;

    if (ignoreCase) {
        tok = tok->getCaseInsensitiveToken(fTokenFactory);
    }

    match = tok->match(strCh);

    if (!match)
        return false;

    offset = (direction > 0) ? ++tmpOffset : tmpOffset;

    return true;
}

// Tests the zero-width anchor identified by 'ch' at 'offset'.  Returns true
// when the anchor condition holds; characters that are not handled by the
// switch are treated as satisfied.
bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch,
                                    const int offset)
{
    switch ((XMLCh) ch) {
    case chLatin_A:
        // Only at the start of the range.
        if (offset != context->fStart)
            return false;
        break;
    case chLatin_B:
        // Holds for an empty range, or when the word type at offset is
        // WT_IGNORE or equal to the word type before it.
        if (context->fLength == 0)
            break;
        {
            int after = getWordType(context->fString, context->fStart,
                                    context->fLimit, offset);
            if (after == WT_IGNORE
                || after == getPreviousWordType(context->fString,
                                                context->fStart,
                                                context->fLimit, offset))
                break;
        }
        return false;
    case chLatin_b:
        // Exact negation of the chLatin_B test above.
        if (context->fLength == 0)
            return false;
        {
            int after = getWordType(context->fString, context->fStart,
                                    context->fLimit, offset);
            if (after == WT_IGNORE
                || after == getPreviousWordType(context->fString,
                                                context->fStart
                                                , context->fLimit, offset))
                return false;
        }
        break;
    case chLatin_Z:
    case chDollarSign:
        if ( (XMLCh) ch == chDollarSign && isSet(fOptions, MULTIPLE_LINE)) {
            // '$' with MULTIPLE_LINE: at the limit or in front of any
            // end-of-line character.
            if (!(offset == context->fLimit || (offset < context->fLimit
                && RegxUtil::isEOLChar(context->fString[offset]))))
                return false;
        }
        else {

            // Otherwise: at the limit, or in front of a final end-of-line
            // character or a final CR LF pair.
            if (!(offset == context->fLimit
                || (offset+1 == context->fLimit
                    && RegxUtil::isEOLChar(context->fString[offset]))
                || (offset+2 == context->fLimit
                    && context->fString[offset] == chCR
                    && context->fString[offset+1] == chLF)))
                return false;
        }
        break;
    case chLatin_z:
        // Only at the limit.
        if (offset != context->fLimit)
            return false;
        break;
    case chAt:
    case chCaret:
        if ( (XMLCh) ch == chCaret && !isSet(fOptions, MULTIPLE_LINE)) {

            if (offset != context->fStart)
                return false;
        }
        else {

            // At the start, or directly after an end-of-line character.
            if (!(offset == context->fStart || (offset > context->fStart
                      && RegxUtil::isEOLChar(context->fString[offset-1]))))
                return false;
        }
        break;
    case chOpenAngle:
        // Requires WT_LETTER at offset and WT_OTHER before it.
        if (context->fLength == 0 || offset == context->fLimit)
            return false;

        if (getWordType(context->fString, context->fStart, context->fLimit,
                        offset) != WT_LETTER
            || getPreviousWordType(context->fString, context->fStart,
                                   context->fLimit, offset) != WT_OTHER)
            return false;
        break;
    case chCloseAngle:
        // Requires WT_OTHER at offset and WT_LETTER before it.
        if (context->fLength == 0 || offset == context->fStart)
            return false;

        if (getWordType(context->fString, context->fStart, context->fLimit,
                        offset) != WT_OTHER
            || getPreviousWordType(context->fString, context->fStart,
                                   context->fLimit, offset) != WT_LETTER)
            return false;
        break;
    }

    return true;
}

// Matches the text previously captured by group 'refNo' at 'offset'.
// Throws IllegalArgumentException (Regex_BadRefNo) for a group number
// outside 1 .. fNoGroups-1; returns false when the group has no recorded
// start or end position.
bool RegularExpression::matchBackReference(Context* const context,
                                           const XMLInt32 refNo, int& offset,
                                           const short direction,
                                           const bool ignoreCase)
{
    if (refNo <=0 || refNo >= fNoGroups)
        ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_BadRefNo, fMemoryManager);

    if (context->fMatch->getStartPos(refNo) < 0
        || context->fMatch->getEndPos(refNo) < 0)
        return false;

    int start = context->fMatch->getStartPos(refNo);
    int length = context->fMatch->getEndPos(refNo) - start;
    // When matching backward the compared region ends at 'offset'.
    int tmpOffset = (direction > 0) ? offset : offset - length;

    if (context->fLimit - tmpOffset < length)
        return false;

    bool match = ignoreCase ?
XMLString::regionIMatches(context->fString,tmpOffset, context->fString,start,length) : XMLString::regionMatches(context->fString, tmpOffset, context->fString, start,length); if (!match) return false; offset = (direction > 0) ? offset + length : offset - length; return true; } bool RegularExpression::matchString(Context* const context, const XMLCh* const literal, int& offset, const short direction, const bool ignoreCase) { int length = XMLString::stringLen(literal); int tmpOffset = (direction > 0) ? offset : offset - length; if (context->fLimit - tmpOffset < length) return false; bool match = ignoreCase ? XMLString::regionIMatches(context->fString, tmpOffset, literal, 0, length) : XMLString::regionMatches(context->fString, tmpOffset, literal, 0, length); if (match) { offset = direction > 0 ? offset + length : offset - length; } return match; } int RegularExpression::matchCapture(Context* const context, const Op* const op, int offset, const short direction) { // No check is made for nullness of fMatch as the function is only called if // fMatch is not null. XMLInt32 index = op->getData(); int save = (index > 0) ? 
context->fMatch->getStartPos(index) : context->fMatch->getEndPos(-index); if (index > 0) { context->fMatch->setStartPos(index, offset); int ret = match(context, op->getNextOp(), offset, direction); if (ret < 0) context->fMatch->setStartPos(index, save); return ret; } context->fMatch->setEndPos(-index, offset); int ret = match(context, op->getNextOp(), offset, direction); if (ret < 0) context->fMatch->setEndPos(-index, save); return ret; } bool RegularExpression::matchCondition(Context* const context, const Op* const op, int offset, const short direction) { int refNo = op->getRefNo(); if ( refNo > 0) return (context->fMatch->getStartPos(refNo) >= 0 && context->fMatch->getEndPos(refNo) >= 0); return (0 <= match(context, op->getConditionFlow(), offset, direction)); } int RegularExpression::parseOptions(const XMLCh* const options) { if (options == 0) return 0; int opts = 0; int length = XMLString::stringLen(options); for (int i=0; i < length; i++) { int v = getOptionValue(options[i]); if (v == 0) ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Regex_UnknownOption, options, fMemoryManager); opts |= v; } return opts; } void RegularExpression::compile(const Token* const token) { if (fOperations != 0) return; fNoClosures = 0; fOperations = compile(token, 0, false); } Op* RegularExpression::compile(const Token* const token, Op* const next, const bool reverse) { Op* ret = 0; const unsigned short tokenType = token->getTokenType(); switch(tokenType) { case Token::T_DOT: case Token::T_CHAR: case Token::T_ANCHOR: case Token::T_RANGE: case Token::T_NRANGE: case Token::T_STRING: case Token::T_BACKREFERENCE: case Token::T_EMPTY: ret = compileSingle(token, next, tokenType); break; case Token::T_CONCAT: ret = compileConcat(token, next, reverse); break; case Token::T_UNION: ret = compileUnion(token, next, reverse); break; case Token::T_CLOSURE: case Token::T_NONGREEDYCLOSURE: ret = compileClosure(token, next, reverse, tokenType); break; case Token::T_PAREN: ret = 
compileParenthesis(token, next, reverse); break; case Token::T_LOOKAHEAD: case Token::T_NEGATIVELOOKAHEAD: ret = compileLook(token, next, false, tokenType); break; case Token::T_LOOKBEHIND: case Token::T_NEGATIVELOOKBEHIND: ret = compileLook(token, next, true, tokenType); break; case Token::T_INDEPENDENT: case Token::T_MODIFIERGROUP: ret = compileLook(token, next, reverse, tokenType); break; case Token::T_CONDITION: ret = compileCondition(token, next, reverse); break; default: ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_UnknownTokenType, fMemoryManager); break; // this line to be deleted } return ret; } /* * Helper for Replace. This method prepares the replacement string by substituting * in actual values for parenthesized sub expressions. * * An error will be thrown if: * 1) repString references an undefined subExpression * 2) there is an unescaped chDollar which is not followed by a digit * */ const XMLCh* RegularExpression::subInExp(const XMLCh* const repString, const XMLCh* const origString, const Match* subEx){ int numSubExp = subEx->getNoGroups() - 1; if (numSubExp == 0) return XMLString::replicate(repString, fMemoryManager); bool notEscaped = true; XMLBuffer newString(1023, fMemoryManager); XMLCh indexStr[2]; //holds the string rep of a indexStr[1] = chNull; int index = -1; for (const XMLCh* ptr = repString; *ptr != chNull; ptr++){ if ((*ptr == chDollarSign) && notEscaped) { ptr++; //check that after the $ is a digit if (!XMLString::isDigit(*ptr)){ //invalid replace string - $ must be followed by a digit ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, fMemoryManager); } indexStr[0] = *ptr; //get the digit index = XMLString::parseInt(indexStr, fMemoryManager); //convert it to an int //now check that the index is legal if (index > numSubExp){ ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, fMemoryManager); } int start = subEx->getStartPos(index); int end = subEx->getEndPos(index); //now copy the 
substring into the new string for (int i=start; igetMinLength(); fFirstChar = 0; if (!isSet(fOptions, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) && !isSet(fOptions, XMLSCHEMA_MODE)) { RangeToken* rangeTok = fTokenFactory->createRange(); int result = fTokenTree->analyzeFirstCharacter(rangeTok, fOptions, fTokenFactory); if (result == Token::FC_TERMINAL) { rangeTok->compactRanges(); fFirstChar = rangeTok; } rangeTok->createMap(); if (isSet(fOptions, IGNORE_CASE)) { rangeTok->getCaseInsensitiveToken(fTokenFactory); } } if (fOperations != 0 && fOperations->getNextOp() == 0 && (fOperations->getOpType() == Op::O_STRING || fOperations->getOpType() == Op::O_CHAR) ) { fFixedStringOnly = true; if (fOperations->getOpType() == Op::O_STRING) { fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = XMLString::replicate(fOperations->getLiteral(), fMemoryManager); } else{ XMLInt32 ch = fOperations->getData(); if ( ch >= 0x10000) { // add as constant fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = RegxUtil::decomposeToSurrogates(ch, fMemoryManager); } else { XMLCh* dummyStr = (XMLCh*) fMemoryManager->allocate(2 * sizeof(XMLCh));//new XMLCh[2]; dummyStr[0] = (XMLCh) fOperations->getData(); dummyStr[1] = chNull; fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = dummyStr; } } fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256, isSet(fOptions, IGNORE_CASE), fMemoryManager); } else if (!isSet(fOptions, XMLSCHEMA_MODE) && !isSet(fOptions, PROHIBIT_FIXED_STRING_OPTIMIZATION)) { int fixedOpts = 0; Token* tok = fTokenTree->findFixedString(fOptions, fixedOpts); fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = (tok == 0) ? 
0 : XMLString::replicate(tok->getString(), fMemoryManager); if (fFixedString != 0 && XMLString::stringLen(fFixedString) < 2) { fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = 0; } if (fFixedString != 0) { fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256, isSet(fixedOpts, IGNORE_CASE), fMemoryManager); } } } unsigned short RegularExpression::getCharType(const XMLCh ch) { if (!isSet(fOptions, UNICODE_WORD_BOUNDARY)) { if (isSet(fOptions, USE_UNICODE_CATEGORY)) { if (fWordRange == 0) { fWordRange = fTokenFactory->getRange(fgUniIsWord); if (fWordRange == 0) ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Regex_RangeTokenGetError, fgUniIsWord, fMemoryManager); } return fWordRange->match(ch) ? WT_LETTER : WT_OTHER; } return RegxUtil::isWordChar(ch); } switch (XMLUniCharacter::getType(ch)) { case XMLUniCharacter::UPPERCASE_LETTER: case XMLUniCharacter::LOWERCASE_LETTER: case XMLUniCharacter::TITLECASE_LETTER: case XMLUniCharacter::MODIFIER_LETTER: case XMLUniCharacter::OTHER_LETTER: case XMLUniCharacter::LETTER_NUMBER: case XMLUniCharacter::DECIMAL_DIGIT_NUMBER: case XMLUniCharacter::OTHER_NUMBER: case XMLUniCharacter::COMBINING_SPACING_MARK: return WT_LETTER; case XMLUniCharacter::FORMAT: case XMLUniCharacter::NON_SPACING_MARK: case XMLUniCharacter::ENCLOSING_MARK: return WT_IGNORE; case XMLUniCharacter::CONTROL: switch (ch) { case chHTab: case chLF: case chVTab: case chFF: case chCR: return WT_OTHER; default: return WT_IGNORE; } } return WT_OTHER; } XERCES_CPP_NAMESPACE_END /** * End of file RegularExpression.cpp */