From faa6ee0063a2d3aa7833e3877dec5e50a3bc0127 Mon Sep 17 00:00:00 2001 From: johnjamesmccann <98098904+johnjamesmccann@users.noreply.github.com> Date: Thu, 20 Jan 2022 14:39:09 +0000 Subject: [PATCH 1/5] DTD hot fix SPDX-FileCopyrightText: Portions Copyright 2021 Siemens Modified on 15-Jul-2021 by Siemens and/or its affiliates to fix CVE-2018-1311: Apache Xerces-C use-after-free vulnerability scanning external DTD. Copyright 2021 Siemens. --- DGXMLScanner.cpp | 3578 ++++++++++++++++++++++++++++++++++++++++++++++ IGXMLScanner.cpp | 3275 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 6853 insertions(+) create mode 100644 DGXMLScanner.cpp create mode 100644 IGXMLScanner.cpp diff --git a/DGXMLScanner.cpp b/DGXMLScanner.cpp new file mode 100644 index 000000000..ae8076d5b --- /dev/null +++ b/DGXMLScanner.cpp @@ -0,0 +1,3578 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * $Id$ + */ + +// SPDX-FileCopyrightText: Portions Copyright 2021 Siemens +// Modified on 15-Jul-2021 by Siemens and/or its affiliates to fix CVE-2018-1311: Apache Xerces-C use-after-free vulnerability scanning external DTD. Copyright 2021 Siemens. 
+ +// --------------------------------------------------------------------------- +// Includes +// --------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace XERCES_CPP_NAMESPACE { + + +typedef JanitorMemFunCall CleanupType; +typedef JanitorMemFunCall ReaderMgrResetType; + + +// --------------------------------------------------------------------------- +// DGXMLScanner: Constructors and Destructor +// --------------------------------------------------------------------------- +DGXMLScanner::DGXMLScanner(XMLValidator* const valToAdopt + , GrammarResolver* const grammarResolver + , MemoryManager* const manager) : + + XMLScanner(valToAdopt, grammarResolver, manager) + , fAttrNSList(0) + , fDTDValidator(0) + , fDTDGrammar(0) + , fDTDElemNonDeclPool(0) + , fElemCount(0) + , fAttDefRegistry(0) + , fUndeclaredAttrRegistry(0) +{ + CleanupType cleanup(this, &DGXMLScanner::cleanUp); + + try + { + commonInit(); + } + catch(const OutOfMemoryException&) + { + // Don't cleanup when out of memory, since executing the + // code can cause problems. 
+ cleanup.release(); + + throw; + } + + cleanup.release(); +} + +DGXMLScanner::DGXMLScanner( XMLDocumentHandler* const docHandler + , DocTypeHandler* const docTypeHandler + , XMLEntityHandler* const entityHandler + , XMLErrorReporter* const errHandler + , XMLValidator* const valToAdopt + , GrammarResolver* const grammarResolver + , MemoryManager* const manager) : + + XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager) + , fAttrNSList(0) + , fDTDValidator(0) + , fDTDGrammar(0) + , fDTDElemNonDeclPool(0) + , fElemCount(0) + , fAttDefRegistry(0) + , fUndeclaredAttrRegistry(0) +{ + CleanupType cleanup(this, &DGXMLScanner::cleanUp); + + try + { + commonInit(); + } + catch(const OutOfMemoryException&) + { + // Don't cleanup when out of memory, since executing the + // code can cause problems. + cleanup.release(); + + throw; + } + + cleanup.release(); +} + +DGXMLScanner::~DGXMLScanner() +{ + cleanUp(); +} + +// --------------------------------------------------------------------------- +// XMLScanner: Getter methods +// --------------------------------------------------------------------------- +NameIdPool* DGXMLScanner::getEntityDeclPool() +{ + if(!fGrammar) + return 0; + return ((DTDGrammar*)fGrammar)->getEntityDeclPool(); +} + +const NameIdPool* DGXMLScanner::getEntityDeclPool() const +{ + if(!fGrammar) + return 0; + return ((DTDGrammar*)fGrammar)->getEntityDeclPool(); +} + +// --------------------------------------------------------------------------- +// DGXMLScanner: Main entry point to scan a document +// --------------------------------------------------------------------------- +void DGXMLScanner::scanDocument(const InputSource& src) +{ + // Bump up the sequence id for this parser instance. This will invalidate + // any previous progressive scan tokens. 
+ fSequenceId++; + + ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); + + try + { + // Reset the scanner and its plugged in stuff for a new run. This + // resets all the data structures, creates the initial reader and + // pushes it on the stack, and sets up the base document path. + scanReset(src); + + // If we have a document handler, then call the start document + if (fDocHandler) + fDocHandler->startDocument(); + + // Scan the prolog part, which is everything before the root element + // including the DTD subsets. + scanProlog(); + + // If we got to the end of input, then its not a valid XML file. + // Else, go on to scan the content. + if (fReaderMgr.atEOF()) + { + emitError(XMLErrs::EmptyMainEntity); + } + else + { + // Scan content, and tell it its not an external entity + if (scanContent()) + { + // Do post-parse validation if required + if (fValidate) + { + // We handle ID reference semantics at this level since + // its required by XML 1.0. + checkIDRefs(); + + // Then allow the validator to do any extra stuff it wants +// fValidator->postParseValidation(); + } + + // That went ok, so scan for any miscellaneous stuff + if (!fReaderMgr.atEOF()) + scanMiscellaneous(); + } + } + + // If we have a document handler, then call the end document + if (fDocHandler) + fDocHandler->endDocument(); + } + // NOTE: + // + // In all of the error processing below, the emitError() call MUST come + // before the flush of the reader mgr, or it will fail because it tries + // to find out the position in the XML source of the error. + catch(const XMLErrs::Codes) + { + // This is a 'first failure' exception, so fall through + } + catch(const XMLValid::Codes) + { + // This is a 'first fatal error' type exit, so fall through + } + catch(const XMLException& excToCatch) + { + // Emit the error and catch any user exception thrown from here. Make + // sure in all cases we flush the reader manager. 
+ fInException = true; + try + { + if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) + emitError + ( + XMLErrs::XMLException_Warning + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) + emitError + ( + XMLErrs::XMLException_Fatal + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else + emitError + ( + XMLErrs::XMLException_Error + , excToCatch.getCode() + , excToCatch.getMessage() + ); + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. + resetReaderMgr.release(); + + throw; + } + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. + resetReaderMgr.release(); + + throw; + } +} + + +bool DGXMLScanner::scanNext(XMLPScanToken& token) +{ + // Make sure this token is still legal + if (!isLegalToken(token)) + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager); + + // Find the next token and remember the reader id + XMLSize_t orgReader; + XMLTokens curToken; + + ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); + + bool retVal = true; + + try + { + while (true) + { + // We have to handle any end of entity exceptions that happen here. + // We could be at the end of X nested entities, each of which will + // generate an end of entity exception as we try to move forward. 
+ try + { + curToken = senseNextToken(orgReader); + break; + } + catch(const EndOfEntityException& toCatch) + { + // Send an end of entity reference event + if (fDocHandler) + fDocHandler->endEntityReference(toCatch.getEntity()); + } + } + + if (curToken == Token_CharData) + { + scanCharData(fCDataBuf); + } + else if (curToken == Token_EOF) + { + if (!fElemStack.isEmpty()) + { + const ElemStack::StackElem* topElem = fElemStack.popTop(); + emitError + ( + XMLErrs::EndedWithTagsOnStack + , topElem->fThisElement->getFullName() + ); + } + + retVal = false; + } + else + { + // Its some sort of markup + bool gotData = true; + switch(curToken) + { + case Token_CData : + // Make sure we are within content + if (fElemStack.isEmpty()) + emitError(XMLErrs::CDATAOutsideOfContent); + scanCDSection(); + break; + + case Token_Comment : + scanComment(); + break; + + case Token_EndTag : + scanEndTag(gotData); + break; + + case Token_PI : + scanPI(); + break; + + case Token_StartTag : + if (fDoNamespaces) + scanStartTagNS(gotData); + else + scanStartTag(gotData); + break; + + default : + fReaderMgr.skipToChar(chOpenAngle); + break; + } + + if (orgReader != fReaderMgr.getCurrentReaderNum()) + emitError(XMLErrs::PartialMarkupInEntity); + + // If we hit the end, then do the miscellaneous part + if (!gotData) + { + // Do post-parse validation if required + if (fValidate) + { + // We handle ID reference semantics at this level since + // its required by XML 1.0. + checkIDRefs(); + + // Then allow the validator to do any extra stuff it wants +// fValidator->postParseValidation(); + } + + // That went ok, so scan for any miscellaneous stuff + scanMiscellaneous(); + + if (fDocHandler) + fDocHandler->endDocument(); + } + } + } + // NOTE: + // + // In all of the error processing below, the emitError() call MUST come + // before the flush of the reader mgr, or it will fail because it tries + // to find out the position in the XML source of the error. 
+ catch(const XMLErrs::Codes) + { + // This is a 'first failure' exception, so return failure + retVal = false; + } + catch(const XMLValid::Codes) + { + // This is a 'first fatal error' type exit, so return failure + retVal = false; + } + catch(const XMLException& excToCatch) + { + // Emit the error and catch any user exception thrown from here. Make + // sure in all cases we flush the reader manager. + fInException = true; + try + { + if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) + emitError + ( + XMLErrs::XMLException_Warning + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) + emitError + ( + XMLErrs::XMLException_Fatal + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else + emitError + ( + XMLErrs::XMLException_Error + , excToCatch.getCode() + , excToCatch.getMessage() + ); + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. + resetReaderMgr.release(); + + throw; + } + + retVal = false; + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. + resetReaderMgr.release(); + + throw; + } + + // If we are not at the end, release the object that will + // reset the ReaderMgr. + if (retVal) + resetReaderMgr.release(); + + return retVal; +} + + +// --------------------------------------------------------------------------- +// DGXMLScanner: Private scanning methods +// --------------------------------------------------------------------------- + +// This method will kick off the scanning of the primary content of the +// document, i.e. the elements. +bool DGXMLScanner::scanContent() +{ + // Go into a loop until we hit the end of the root element, or we fall + // out because there is no root element. 
+ // + // We have to do kind of a deeply nested double loop here in order to + // avoid doing the setup/teardown of the exception handler on each + // round. Doing it this way we only do it when an exception actually + // occurs. + bool gotData = true; + bool inMarkup = false; + while (gotData) + { + try + { + while (gotData) + { + // Sense what the next top level token is. According to what + // this tells us, we will call something to handle that kind + // of thing. + XMLSize_t orgReader; + const XMLTokens curToken = senseNextToken(orgReader); + + // Handle character data and end of file specially. Char data + // is not markup so we don't want to handle it in the loop + // below. + if (curToken == Token_CharData) + { + // Scan the character data and call appropriate events. Let + // him use our local character data buffer for efficiency. + scanCharData(fCDataBuf); + continue; + } + else if (curToken == Token_EOF) + { + // The element stack better be empty at this point or we + // ended prematurely before all elements were closed. + if (!fElemStack.isEmpty()) + { + const ElemStack::StackElem* topElem = fElemStack.popTop(); + emitError + ( + XMLErrs::EndedWithTagsOnStack + , topElem->fThisElement->getFullName() + ); + } + + // Its the end of file, so clear the got data flag + gotData = false; + continue; + } + + // We are in some sort of markup now + inMarkup = true; + + // According to the token we got, call the appropriate + // scanning method. 
+ switch(curToken) + { + case Token_CData : + // Make sure we are within content + if (fElemStack.isEmpty()) + emitError(XMLErrs::CDATAOutsideOfContent); + scanCDSection(); + break; + + case Token_Comment : + scanComment(); + break; + + case Token_EndTag : + scanEndTag(gotData); + break; + + case Token_PI : + scanPI(); + break; + + case Token_StartTag : + if (fDoNamespaces) + scanStartTagNS(gotData); + else + scanStartTag(gotData); + break; + + default : + fReaderMgr.skipToChar(chOpenAngle); + break; + } + + if (orgReader != fReaderMgr.getCurrentReaderNum()) + emitError(XMLErrs::PartialMarkupInEntity); + + // And we are back out of markup again + inMarkup = false; + } + } + catch(const EndOfEntityException& toCatch) + { + // If we were in some markup when this happened, then its a + // partial markup error. + if (inMarkup) + emitError(XMLErrs::PartialMarkupInEntity); + + // Send an end of entity reference event + if (fDocHandler) + fDocHandler->endEntityReference(toCatch.getEntity()); + + inMarkup = false; + } + } + + // It went ok, so return success + return true; +} + + +void DGXMLScanner::scanEndTag(bool& gotData) +{ + // Assume we will still have data until proven otherwise. It will only + // ever be false if this is the end of the root element. + gotData = true; + + // Check if the element stack is empty. If so, then this is an unbalanced + // element (i.e. more ends than starts, perhaps because of bad text + // causing one to be skipped.) + if (fElemStack.isEmpty()) + { + emitError(XMLErrs::MoreEndThanStartTags); + fReaderMgr.skipPastChar(chCloseAngle); + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager); + } + + // Pop the stack of the element we are supposed to be ending. Remember + // that we don't own this. The stack just keeps them and reuses them. + unsigned int uriId = (fDoNamespaces) + ? fElemStack.getCurrentURI() : fEmptyNamespaceId; + + // Pop the stack of the element we are supposed to be ending. 
Remember + // that we don't own this. The stack just keeps them and reuses them. + const ElemStack::StackElem* topElem = fElemStack.popTop(); + XMLElementDecl *tempElement = topElem->fThisElement; + + // See if it was the root element, to avoid multiple calls below + const bool isRoot = fElemStack.isEmpty(); + + // Make sure that its the end of the element that we expect + if (!fReaderMgr.skippedStringLong(tempElement->getFullName())) + { + emitError + ( + XMLErrs::ExpectedEndOfTagX + , tempElement->getFullName() + ); + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Make sure we are back on the same reader as where we started + if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum()) + emitError(XMLErrs::PartialTagMarkupError); + + // Skip optional whitespace + fReaderMgr.skipPastSpaces(); + + // Make sure we find the closing bracket + if (!fReaderMgr.skippedChar(chCloseAngle)) + { + emitError + ( + XMLErrs::UnterminatedEndTag + , topElem->fThisElement->getFullName() + ); + } + + // If validation is enabled, then lets pass him the list of children and + // this element and let him validate it. + if (fValidate) + { + + // + // XML1.0-3rd + // Validity Constraint: + // The declaration matches EMPTY and the element has no content (not even + // entity references, comments, PIs or white space). + // + if ( (topElem->fCommentOrPISeen) && + (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Empty)) + { + fValidator->emitError + ( + XMLValid::EmptyElemHasContent + , topElem->fThisElement->getFullName() + ); + } + + // + // XML1.0-3rd + // Validity Constraint: + // + // The declaration matches children and the sequence of child elements + // belongs to the language generated by the regular expression in the + // content model, with optional white space, comments and PIs + // (i.e. 
markup matching production [27] Misc) between the start-tag and + // the first child element, between child elements, or between the last + // child element and the end-tag. + // + // Note that + // a CDATA section containing only white space or + // a reference to an entity whose replacement text is character references + // expanding to white space do not match the nonterminal S, and hence + // cannot appear in these positions; however, + // a reference to an internal entity with a literal value consisting + // of character references expanding to white space does match S, + // since its replacement text is the white space resulting from expansion + // of the character references. + // + if ( (topElem->fReferenceEscaped) && + (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Children)) + { + fValidator->emitError + ( + XMLValid::ElemChildrenHasInvalidWS + , topElem->fThisElement->getFullName() + ); + } + + XMLSize_t failure; + bool res = fValidator->checkContent + ( + topElem->fThisElement + , topElem->fChildren + , topElem->fChildCount + , &failure + ); + + if (!res) + { + // One of the elements is not valid for the content. NOTE that + // if no children were provided but the content model requires + // them, it comes back with a zero value. But we cannot use that + // to index the child array in this case, and have to put out a + // special message. 
+ if (!topElem->fChildCount) + { + fValidator->emitError + ( + XMLValid::EmptyNotValidForContent + , topElem->fThisElement->getFormattedContentModel() + ); + } + else if (failure >= topElem->fChildCount) + { + fValidator->emitError + ( + XMLValid::NotEnoughElemsForCM + , topElem->fThisElement->getFormattedContentModel() + ); + } + else + { + fValidator->emitError + ( + XMLValid::ElementNotValidForContent + , topElem->fChildren[failure]->getRawName() + , topElem->fThisElement->getFormattedContentModel() + ); + } + } + } + + // If we have a doc handler, tell it about the end tag + if (fDocHandler) + { + fDocHandler->endElement + ( + *topElem->fThisElement + , uriId + , isRoot + , (fDoNamespaces) + ? topElem->fThisElement->getElementName()->getPrefix() + : XMLUni::fgZeroLenString + ); + } + + // If this was the root, then done with content + gotData = !isRoot; +} + + +// This method handles the high level logic of scanning the DOCType +// declaration. This calls the DTDScanner and kicks off both the scanning of +// the internal subset and the scanning of the external subset, if any. +// +// When we get here the 'resetDocType(); + + // There must be some space after DOCTYPE + bool skippedSomething; + fReaderMgr.skipPastSpaces(skippedSomething); + if (!skippedSomething) + { + emitError(XMLErrs::ExpectedWhitespace); + + // Just skip the Doctype declaration and return + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Get a buffer for the root element + XMLBufBid bbRootName(&fBufMgr); + + // Get a name from the input, which should be the name of the root + // element of the upcoming content. + int colonPosition; + bool validName = fDoNamespaces ? 
fReaderMgr.getQName(bbRootName.getBuffer(), &colonPosition) : + fReaderMgr.getName(bbRootName.getBuffer()); + if (!validName) + { + if (bbRootName.isEmpty()) + emitError(XMLErrs::NoRootElemInDOCTYPE); + else + emitError(XMLErrs::InvalidRootElemInDOCTYPE, bbRootName.getRawBuffer()); + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Store the root element name for later check + setRootElemName(bbRootName.getRawBuffer()); + + // This element obviously is not going to exist in the element decl + // pool yet, but we need to call docTypeDecl. So force it into + // the element decl pool, marked as being there because it was in + // the DOCTYPE. Later, when its declared, the status will be updated. + // + // Only do this if we are not reusing the validator! If we are reusing, + // then look it up instead. It has to exist! + MemoryManager* const rootDeclMgr = + fUseCachedGrammar ? fMemoryManager : fGrammarPoolMemoryManager; + + DTDElementDecl* rootDecl = new (rootDeclMgr) DTDElementDecl + ( + bbRootName.getRawBuffer() + , fEmptyNamespaceId + , DTDElementDecl::Any + , rootDeclMgr + ); + + Janitor rootDeclJanitor(rootDecl); + rootDecl->setCreateReason(DTDElementDecl::AsRootElem); + rootDecl->setExternalElemDeclaration(true); + if(!fUseCachedGrammar) + { + fGrammar->putElemDecl(rootDecl); + rootDeclJanitor.release(); + } else + { + // put this in the undeclared pool so it gets deleted... + XMLElementDecl* elemDecl = fDTDElemNonDeclPool->getByKey(bbRootName.getRawBuffer()); + if (elemDecl) + { + rootDecl->setId(elemDecl->getId()); + } + else + { + rootDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)rootDecl)); + rootDeclJanitor.release(); + } + } + + // Skip any spaces after the name + fReaderMgr.skipPastSpaces(); + + // And now if we are looking at a >, then we are done. It is not + // required to have an internal or external subset, though why you + // would not escapes me. 
+ if (fReaderMgr.skippedChar(chCloseAngle)) { + + // If we have a doc type handler and advanced callbacks are enabled, + // call the doctype event. + if (fDocTypeHandler) + fDocTypeHandler->doctypeDecl(*rootDecl, 0, 0, false); + return; + } + + // either internal/external subset + if (fValScheme == Val_Auto && !fValidate) + fValidate = true; + + bool hasIntSubset = false; + bool hasExtSubset = false; + XMLCh* sysId = 0; + XMLCh* pubId = 0; + + DTDScanner dtdScanner + ( + (DTDGrammar*) fGrammar + , fDocTypeHandler + , fGrammarPoolMemoryManager + , fMemoryManager + ); + dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); + + // If the next character is '[' then we have no external subset cause + // there is no system id, just the opening character of the internal + // subset. Else, has to be an id. + // + // Just look at the next char, don't eat it. + if (fReaderMgr.peekNextChar() == chOpenSquare) + { + hasIntSubset = true; + } + else + { + // Indicate we have an external subset + hasExtSubset = true; + fHasNoDTD = false; + + // Get buffers for the ids + XMLBufBid bbPubId(&fBufMgr); + XMLBufBid bbSysId(&fBufMgr); + + // Get the external subset id + if (!dtdScanner.scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), DTDScanner::IDType_External)) + { + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Get copies of the ids we got + pubId = XMLString::replicate(bbPubId.getRawBuffer(), fMemoryManager); + sysId = XMLString::replicate(bbSysId.getRawBuffer(), fMemoryManager); + + // Skip spaces and check again for the opening of an internal subset + fReaderMgr.skipPastSpaces(); + + // Just look at the next char, don't eat it. 
+ if (fReaderMgr.peekNextChar() == chOpenSquare) { + hasIntSubset = true; + } + } + + // Insure that the ids get cleaned up, if they got allocated + ArrayJanitor janSysId(sysId, fMemoryManager); + ArrayJanitor janPubId(pubId, fMemoryManager); + + // If we have a doc type handler and advanced callbacks are enabled, + // call the doctype event. + if (fDocTypeHandler) + fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset, hasExtSubset); + + // Ok, if we had an internal subset, we are just past the [ character + // and need to parse that first. + if (hasIntSubset) + { + // Eat the opening square bracket + fReaderMgr.getNextChar(); + + checkInternalDTD(hasExtSubset, sysId, pubId); + + // And try to scan the internal subset. If we fail, try to recover + // by skipping forward tot he close angle and returning. + if (!dtdScanner.scanInternalSubset()) + { + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Do a sanity check that some expanded PE did not propogate out of + // the doctype. This could happen if it was terminated early by bad + // syntax. + if (fReaderMgr.getReaderDepth() > 1) + { + emitError(XMLErrs::PEPropogated); + + // Ask the reader manager to pop back down to the main level + fReaderMgr.cleanStackBackTo(1); + } + + fReaderMgr.skipPastSpaces(); + } + + // And that should leave us at the closing > of the DOCTYPE line + if (!fReaderMgr.skippedChar(chCloseAngle)) + { + // Do a special check for the common scenario of an extra ] char at + // the end. This is easy to recover from. + if (fReaderMgr.skippedChar(chCloseSquare) + && fReaderMgr.skippedChar(chCloseAngle)) + { + emitError(XMLErrs::ExtraCloseSquare); + } + else + { + emitError(XMLErrs::UnterminatedDOCTYPE); + fReaderMgr.skipPastChar(chCloseAngle); + } + } + + // If we had an external subset, then we need to deal with that one + // next. If we are reusing the validator, then don't scan it. 
+ if (hasExtSubset) { + + InputSource* srcUsed=0; + Janitor janSrc(srcUsed); + // If we had an internal subset and we're using the cached grammar, it + // means that the ignoreCachedDTD is set, so we ignore the cached + // grammar + if (fUseCachedGrammar && !hasIntSubset) + { + srcUsed = resolveSystemId(sysId, pubId); + if (srcUsed) { + janSrc.reset(srcUsed); + Grammar* grammar = fGrammarResolver->getGrammar(srcUsed->getSystemId()); + + if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) { + + fDTDGrammar = (DTDGrammar*) grammar; + fGrammar = fDTDGrammar; + fValidator->setGrammar(fGrammar); + // If we don't report at least the external subset boundaries, + // an advanced document handler cannot know when the DTD end, + // since we've already sent a doctype decl that indicates there's + // there's an external subset. + if (fDocTypeHandler) + { + fDocTypeHandler->startExtSubset(); + fDocTypeHandler->endExtSubset(); + } + + return; + } + } + } + + if (fLoadExternalDTD || fValidate) + { + // And now create a reader to read this entity + XMLReader* reader; + if(srcUsed) { + reader = fReaderMgr.createReader + ( + *srcUsed + , false + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , XMLReader::Source_External + , fCalculateSrcOfs + , fLowWaterMark + ); + } + else { + reader = fReaderMgr.createReader + ( + sysId + , pubId + , false + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , XMLReader::Source_External + , srcUsed + , fCalculateSrcOfs + , fLowWaterMark + , fDisableDefaultEntityResolution + ); + janSrc.reset(srcUsed); + } + // If it failed then throw an exception + if (!reader) + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenDTD, srcUsed ? 
srcUsed->getSystemId() : sysId, fMemoryManager); + + if (fToCacheGrammar) { + + unsigned int stringId = fGrammarResolver->getStringPool()->addOrFind(srcUsed->getSystemId()); + const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(stringId); + + fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); + ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); + fGrammarResolver->putGrammar(fGrammar); + } + + // In order to make the processing work consistently, we have to + // make this look like an external entity. So create an entity + // decl and fill it in and push it with the reader, as happens + // with an external entity. Put a janitor on it to insure it gets + // cleaned up. The reader manager does not adopt them. + const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; + DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); + declDTD->setSystemId(sysId); + declDTD->setIsExternal(true); + + // Mark this one as a throw at end + reader->setThrowAtEnd(true); + + // And push it onto the stack, with its pseudo name + fReaderMgr.pushReader(reader, declDTD); + + // Tell it its not in an include section + dtdScanner.scanExtSubsetDecl(false, true); + } + } +} + +bool DGXMLScanner::scanStartTag(bool& gotData) +{ + // Assume we will still have data until proven otherwise. It will only + // ever be false if this is the root and its empty. + gotData = true; + + // Get the QName. In this case, we are not doing namespaces, so we just + // use it as is and don't have to break it into parts. 
+ + bool validName = fReaderMgr.getName(fQNameBuf); + if (!validName) + { + if (fQNameBuf.isEmpty()) + emitError(XMLErrs::ExpectedElementName); + else + emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer()); + fReaderMgr.skipToChar(chOpenAngle); + return false; + } + + // Assume it won't be an empty tag + bool isEmpty = false; + + // See if its the root element + const bool isRoot = fElemStack.isEmpty(); + + // Lets try to look up the element in the validator's element decl pool + // We can pass bogus values for the URI id and the base name. We know that + // this can only be called if we are doing a DTD style validator and that + // he will only look at the QName. + // + // We *do not* tell him to fault in a decl if he does not find one - NG. + bool wasAdded = false; + const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer(); + + XMLElementDecl* elemDecl = fGrammar->getElemDecl + ( + fEmptyNamespaceId + , 0 + , qnameRawBuf + , Grammar::TOP_LEVEL_SCOPE + ); + // look in the undeclared pool: + if(!elemDecl) + { + elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf); + } + if(!elemDecl) + { + wasAdded = true; + elemDecl = new (fMemoryManager) DTDElementDecl + ( + qnameRawBuf + , fEmptyNamespaceId + , DTDElementDecl::Any + , fMemoryManager + ); + elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl)); + } + + if (fValidate) { + + if (wasAdded) + { + // This is to tell the reuse Validator that this element was + // faulted-in, was not an element in the validator pool originally + elemDecl->setCreateReason(XMLElementDecl::JustFaultIn); + + fValidator->emitError + ( + XMLValid::ElementNotDefined + , qnameRawBuf + ); + } + // If its not marked declared, then emit an error + else if (!elemDecl->isDeclared()) + { + fValidator->emitError + ( + XMLValid::ElementNotDefined + , qnameRawBuf + ); + } + + + fValidator->validateElement(elemDecl); + } + + // Expand the element stack and add the new element + fElemStack.addLevel(elemDecl, 
fReaderMgr.getCurrentReaderNum()); + + // If this is the first element and we are validating, check the root + // element. + if (isRoot) + { + fRootGrammar = fGrammar; + + if (fValidate) + { + // If a DocType exists, then check if it matches the root name there. + if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName)) + fValidator->emitError(XMLValid::RootElemNotLikeDocType); + } + } + else if (fValidate) + { + // If the element stack is not empty, then add this element as a + // child of the previous top element. If its empty, this is the root + // elem and is not the child of anything. + fElemStack.addChild(elemDecl->getElementName(), true); + } + + // Skip any whitespace after the name + fReaderMgr.skipPastSpaces(); + + // We loop until we either see a /> or >, handling attribute/value + // pairs until we get there. + XMLSize_t attCount = 0; + XMLSize_t curAttListSize = fAttrList->size(); + wasAdded = false; + + fElemCount++; + + while (true) + { + // And get the next non-space character + XMLCh nextCh = fReaderMgr.peekNextChar(); + + // If the next character is not a slash or closed angle bracket, + // then it must be whitespace, since whitespace is required + // between the end of the last attribute and the name of the next + // one. + if (attCount) + { + if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) + { + if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) + { + // Ok, skip by them and peek another char + fReaderMgr.skipPastSpaces(); + nextCh = fReaderMgr.peekNextChar(); + } + else + { + // Emit the error but keep on going + emitError(XMLErrs::ExpectedWhitespace); + } + } + } + + // Ok, here we first check for any of the special case characters. + // If its not one, then we do the normal case processing, which + // assumes that we've hit an attribute value, Otherwise, we do all + // the special case checks. 
+ if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) + { + // Assume its going to be an attribute, so get a name from + // the input. + + validName = fReaderMgr.getName(fAttNameBuf); + if (!validName) + { + if (fAttNameBuf.isEmpty()) + emitError(XMLErrs::ExpectedAttrName); + else + emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer()); + fReaderMgr.skipPastChar(chCloseAngle); + return false; + } + + // And next must be an equal sign + if (!scanEq()) + { + static const XMLCh tmpList[] = + { + chSingleQuote, chDoubleQuote, chCloseAngle + , chOpenAngle, chForwardSlash, chNull + }; + + emitError(XMLErrs::ExpectedEqSign); + + // Try to sync back up by skipping forward until we either + // hit something meaningful. + const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); + + if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) + { + // Jump back to top for normal processing of these + continue; + } + else if ((chFound == chSingleQuote) + || (chFound == chDoubleQuote) + || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) + { + // Just fall through assuming that the value is to follow + } + else if (chFound == chOpenAngle) + { + // Assume a malformed tag and that new one is starting + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + return false; + } + else + { + // Something went really wrong + return false; + } + } + + // See if this attribute is declared for this element. If we are + // not validating of course it will not be at first, but we will + // fault it into the pool (to avoid lots of redundant errors.) + XMLCh * namePtr = fAttNameBuf.getRawBuffer(); + XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr); + + // Skip any whitespace before the value and then scan the att + // value. This will come back normalized with entity refs and + // char refs expanded. 
+ fReaderMgr.skipPastSpaces(); + if (!scanAttValue(attDef, namePtr, fAttValueBuf)) + { + static const XMLCh tmpList[] = + { + chCloseAngle, chOpenAngle, chForwardSlash, chNull + }; + + emitError(XMLErrs::ExpectedAttrValue); + + // It failed, so lets try to get synced back up. We skip + // forward until we find some whitespace or one of the + // chars in our list. + const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); + + if ((chFound == chCloseAngle) + || (chFound == chForwardSlash) + || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) + { + // Just fall through and process this attribute, though + // the value will be "". + } + else if (chFound == chOpenAngle) + { + // Assume a malformed tag and that new one is starting + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + return false; + } + else + { + // Something went really wrong + return false; + } + } + + // Add this attribute to the attribute list that we use to + // pass them to the handler. We reuse its existing elements + // but expand it as required. + // Note that we want to this first since this will + // make a copy of the namePtr; we can then make use of + // that copy in the hashtable lookup that checks + // for duplicates. This will mean we may have to update + // the type of the XMLAttr later. 
+ XMLAttr* curAtt; + const XMLCh* attrValue = fAttValueBuf.getRawBuffer(); + + if (attCount >= curAttListSize) { + curAtt = new (fMemoryManager) XMLAttr(fMemoryManager); + fAttrList->addElement(curAtt); + } + else { + curAtt = fAttrList->elementAt(attCount); + } + + curAtt->setSpecified(true); + + // NO NAMESPACE CODE + { + curAtt->set( + 0, namePtr, XMLUni::fgZeroLenString, XMLUni::fgZeroLenString + , (attDef)?attDef->getType():XMLAttDef::CData + ); + + // now need to prepare for duplicate detection + if (attDef) { + unsigned int *curCountPtr = fAttDefRegistry->get(attDef); + if (!curCountPtr) { + curCountPtr = getNewUIntPtr(); + *curCountPtr = fElemCount; + fAttDefRegistry->put(attDef, curCountPtr); + } + else if (*curCountPtr < fElemCount) { + *curCountPtr = fElemCount; + } + else { + emitError( + XMLErrs::AttrAlreadyUsedInSTag + , attDef->getFullName(), elemDecl->getFullName() + ); + } + } + else + { + // reset namePtr so it refers to newly-allocated memory + namePtr = (XMLCh *)curAtt->getQName(); + if (!fUndeclaredAttrRegistry->putIfNotPresent(namePtr, 0)) + { + emitError( + XMLErrs::AttrAlreadyUsedInSTag + , namePtr, elemDecl->getFullName() + ); + } + } + } + + if (fValidate) + { + if (attDef) { + // Let the validator pass judgement on the attribute value + fValidator->validateAttrValue( + attDef, fAttValueBuf.getRawBuffer(), false, elemDecl + ); + } + else + { + fValidator->emitError + ( + XMLValid::AttNotDefinedForElement + , fAttNameBuf.getRawBuffer(), qnameRawBuf + ); + } + } + + // must set the newly-minted value on the XMLAttr: + curAtt->setValue(attrValue); + attCount++; + + // And jump back to the top of the loop + continue; + } + + // It was some special case character so do all of the checks and + // deal with it. 
+ if (!nextCh) + ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); + + if (nextCh == chForwardSlash) + { + fReaderMgr.getNextChar(); + isEmpty = true; + if (!fReaderMgr.skippedChar(chCloseAngle)) + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + break; + } + else if (nextCh == chCloseAngle) + { + fReaderMgr.getNextChar(); + break; + } + else if (nextCh == chOpenAngle) + { + // Check for this one specially, since its going to be common + // and it is kind of auto-recovering since we've already hit the + // next open bracket, which is what we would have seeked to (and + // skipped this whole tag.) + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + break; + } + else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) + { + // Check for this one specially, which is probably a missing + // attribute name, e.g. ="value". Just issue expected name + // error and eat the quoted string, then jump back to the + // top again. + emitError(XMLErrs::ExpectedAttrName); + fReaderMgr.getNextChar(); + fReaderMgr.skipQuotedString(nextCh); + fReaderMgr.skipPastSpaces(); + continue; + } + } + + if(attCount) + { + // clean up after ourselves: + // clear the map used to detect duplicate attributes + fUndeclaredAttrRegistry->removeAll(); + } + + // Now lets get the fAttrList filled in. This involves faulting in any + // defaulted and fixed attributes and normalizing the values of any that + // we got explicitly. + // + // We update the attCount value with the total number of attributes, but + // it goes in with the number of values we got during the raw scan of + // explictly provided attrs above. + attCount = buildAttList(attCount, elemDecl, *fAttrList); + + // If we have a document handler, then tell it about this start tag. We + // don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send + // any prefix since its just one big name if we are not doing namespaces. 
+ unsigned int uriId = fEmptyNamespaceId; + if (fDocHandler) + { + fDocHandler->startElement + ( + *elemDecl + , uriId + , 0 + , *fAttrList + , attCount + , isEmpty + , isRoot + ); + } + + // If empty, validate content right now if we are validating and then + // pop the element stack top. Else, we have to update the current stack + // top's namespace mapping elements. + if (isEmpty) + { + // If validating, then insure that its legal to have no content + if (fValidate) + { + XMLSize_t failure; + bool res = fValidator->checkContent(elemDecl, 0, 0, &failure); + if (!res) + { + fValidator->emitError + ( + XMLValid::ElementNotValidForContent + , qnameRawBuf + , elemDecl->getFormattedContentModel() + ); + } + } + + // Pop the element stack back off since it'll never be used now + fElemStack.popTop(); + + // If the elem stack is empty, then it was an empty root + if (isRoot) + gotData = false; + } + + return true; +} + + +bool DGXMLScanner::scanStartTagNS(bool& gotData) +{ + // Assume we will still have data until proven otherwise. It will only + // ever be false if this is the root and its empty. + gotData = true; + + // Get the QName. In this case, we are not doing namespaces, so we just + // use it as is and don't have to break it into parts. + + int colonPosition; + bool validName = fReaderMgr.getQName(fQNameBuf, &colonPosition); + if (!validName) + { + if (fQNameBuf.isEmpty()) + emitError(XMLErrs::ExpectedElementName); + else + emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer()); + fReaderMgr.skipToChar(chOpenAngle); + return false; + } + + // Assume it won't be an empty tag + bool isEmpty = false; + + // See if its the root element + const bool isRoot = fElemStack.isEmpty(); + + // Lets try to look up the element in the validator's element decl pool + // We can pass bogus values for the URI id and the base name. We know that + // this can only be called if we are doing a DTD style validator and that + // he will only look at the QName. 
+ // + // We *do not* tell him to fault in a decl if he does not find one - NG. + bool wasAdded = false; + const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer(); + + XMLElementDecl* elemDecl = fGrammar->getElemDecl + ( + fEmptyNamespaceId + , 0 + , qnameRawBuf + , Grammar::TOP_LEVEL_SCOPE + ); + // look in the undeclared pool: + if(!elemDecl) + { + elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf); + } + if(!elemDecl) + { + wasAdded = true; + elemDecl = new (fMemoryManager) DTDElementDecl + ( + qnameRawBuf + , fEmptyNamespaceId + , DTDElementDecl::Any + , fMemoryManager + ); + elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl)); + } + + if (fValidate) { + + if (wasAdded) + { + // This is to tell the reuse Validator that this element was + // faulted-in, was not an element in the validator pool originally + elemDecl->setCreateReason(XMLElementDecl::JustFaultIn); + + fValidator->emitError + ( + XMLValid::ElementNotDefined + , qnameRawBuf + ); + } + // If its not marked declared, then emit an error + else if (!elemDecl->isDeclared()) + { + fValidator->emitError + ( + XMLValid::ElementNotDefined + , qnameRawBuf + ); + } + + + fValidator->validateElement(elemDecl); + } + + // Expand the element stack and add the new element + fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum()); + + // If this is the first element and we are validating, check the root + // element. + if (isRoot) + { + fRootGrammar = fGrammar; + + if (fValidate) + { + // If a DocType exists, then check if it matches the root name there. + if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName)) + fValidator->emitError(XMLValid::RootElemNotLikeDocType); + } + } + else if (fValidate) + { + // If the element stack is not empty, then add this element as a + // child of the previous top element. If its empty, this is the root + // elem and is not the child of anything. 
+ fElemStack.addChild(elemDecl->getElementName(), true); + } + + // Skip any whitespace after the name + fReaderMgr.skipPastSpaces(); + + // We loop until we either see a /> or >, handling attribute/value + // pairs until we get there. + XMLSize_t attCount = 0; + XMLSize_t curAttListSize = fAttrList->size(); + wasAdded = false; + + fElemCount++; + + while (true) + { + // And get the next non-space character + XMLCh nextCh = fReaderMgr.peekNextChar(); + + // If the next character is not a slash or closed angle bracket, + // then it must be whitespace, since whitespace is required + // between the end of the last attribute and the name of the next + // one. + if (attCount) + { + if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) + { + if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) + { + // Ok, skip by them and peek another char + fReaderMgr.skipPastSpaces(); + nextCh = fReaderMgr.peekNextChar(); + } + else + { + // Emit the error but keep on going + emitError(XMLErrs::ExpectedWhitespace); + } + } + } + + // Ok, here we first check for any of the special case characters. + // If its not one, then we do the normal case processing, which + // assumes that we've hit an attribute value, Otherwise, we do all + // the special case checks. + if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) + { + // Assume its going to be an attribute, so get a name from + // the input. 
+ + validName = fReaderMgr.getQName(fAttNameBuf, &colonPosition); + if (!validName) + { + if (fAttNameBuf.isEmpty()) + emitError(XMLErrs::ExpectedAttrName); + else + emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer()); + fReaderMgr.skipPastChar(chCloseAngle); + return false; + } + + // And next must be an equal sign + if (!scanEq()) + { + static const XMLCh tmpList[] = + { + chSingleQuote, chDoubleQuote, chCloseAngle + , chOpenAngle, chForwardSlash, chNull + }; + + emitError(XMLErrs::ExpectedEqSign); + + // Try to sync back up by skipping forward until we either + // hit something meaningful. + const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); + + if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) + { + // Jump back to top for normal processing of these + continue; + } + else if ((chFound == chSingleQuote) + || (chFound == chDoubleQuote) + || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) + { + // Just fall through assuming that the value is to follow + } + else if (chFound == chOpenAngle) + { + // Assume a malformed tag and that new one is starting + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + return false; + } + else + { + // Something went really wrong + return false; + } + } + + // See if this attribute is declared for this element. If we are + // not validating of course it will not be at first, but we will + // fault it into the pool (to avoid lots of redundant errors.) + XMLCh * namePtr = fAttNameBuf.getRawBuffer(); + XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr); + + // Skip any whitespace before the value and then scan the att + // value. This will come back normalized with entity refs and + // char refs expanded. 
+ fReaderMgr.skipPastSpaces(); + if (!scanAttValue(attDef, namePtr, fAttValueBuf)) + { + static const XMLCh tmpList[] = + { + chCloseAngle, chOpenAngle, chForwardSlash, chNull + }; + + emitError(XMLErrs::ExpectedAttrValue); + + // It failed, so lets try to get synced back up. We skip + // forward until we find some whitespace or one of the + // chars in our list. + const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); + + if ((chFound == chCloseAngle) + || (chFound == chForwardSlash) + || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) + { + // Just fall through and process this attribute, though + // the value will be "". + } + else if (chFound == chOpenAngle) + { + // Assume a malformed tag and that new one is starting + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + return false; + } + else + { + // Something went really wrong + return false; + } + } + + // Add this attribute to the attribute list that we use to + // pass them to the handler. We reuse its existing elements + // but expand it as required. + // Note that we want to this first since this will + // make a copy of the namePtr; we can then make use of + // that copy in the hashtable lookup that checks + // for duplicates. This will mean we may have to update + // the type of the XMLAttr later. + XMLAttr* curAtt; + const XMLCh* attrValue = fAttValueBuf.getRawBuffer(); + + if (attCount >= curAttListSize) { + curAtt = new (fMemoryManager) XMLAttr(fMemoryManager); + fAttrList->addElement(curAtt); + } + else { + curAtt = fAttrList->elementAt(attCount); + } + + curAtt->setSpecified(true); + // DO NAMESPACES + { + curAtt->set( + fEmptyNamespaceId, namePtr, XMLUni::fgZeroLenString + , (attDef)? 
attDef->getType() : XMLAttDef::CData + ); + + // each attribute has the prefix:suffix="value" + const XMLCh* attPrefix = curAtt->getPrefix(); + const XMLCh* attLocalName = curAtt->getName(); + + if (attPrefix && *attPrefix) { + if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) { + curAtt->setURIId(fXMLNamespaceId); + } + else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) { + curAtt->setURIId(fXMLNSNamespaceId); + updateNSMap(attPrefix, attLocalName, attrValue); + } + else { + fAttrNSList->addElement(curAtt); + } + } + else if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName)) + { + updateNSMap(attPrefix, XMLUni::fgZeroLenString, attrValue); + } + + // NOTE: duplicate attribute check will be done, when we map + // namespaces to all attributes + if (attDef) { + unsigned int *curCountPtr = fAttDefRegistry->get(attDef); + if (!curCountPtr) { + curCountPtr = getNewUIntPtr(); + *curCountPtr = fElemCount; + fAttDefRegistry->put(attDef, curCountPtr); + } + else if (*curCountPtr < fElemCount) { + *curCountPtr = fElemCount; + } + } + } + + if (fValidate) + { + if (attDef) { + // Let the validator pass judgement on the attribute value + fValidator->validateAttrValue( + attDef, fAttValueBuf.getRawBuffer(), false, elemDecl + ); + } + else + { + fValidator->emitError + ( + XMLValid::AttNotDefinedForElement + , fAttNameBuf.getRawBuffer(), qnameRawBuf + ); + } + } + + // must set the newly-minted value on the XMLAttr: + curAtt->setValue(attrValue); + attCount++; + + // And jump back to the top of the loop + continue; + } + + // It was some special case character so do all of the checks and + // deal with it. 
+ if (!nextCh) + ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); + + if (nextCh == chForwardSlash) + { + fReaderMgr.getNextChar(); + isEmpty = true; + if (!fReaderMgr.skippedChar(chCloseAngle)) + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + break; + } + else if (nextCh == chCloseAngle) + { + fReaderMgr.getNextChar(); + break; + } + else if (nextCh == chOpenAngle) + { + // Check for this one specially, since its going to be common + // and it is kind of auto-recovering since we've already hit the + // next open bracket, which is what we would have seeked to (and + // skipped this whole tag.) + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + break; + } + else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) + { + // Check for this one specially, which is probably a missing + // attribute name, e.g. ="value". Just issue expected name + // error and eat the quoted string, then jump back to the + // top again. + emitError(XMLErrs::ExpectedAttrName); + fReaderMgr.getNextChar(); + fReaderMgr.skipQuotedString(nextCh); + fReaderMgr.skipPastSpaces(); + continue; + } + } + + // Make an initial pass through the list and find any xmlns attributes. + if (attCount) + scanAttrListforNameSpaces(fAttrList, attCount, elemDecl); + + if(attCount) + { + // clean up after ourselves: + // clear the map used to detect duplicate attributes + fUndeclaredAttrRegistry->removeAll(); + } + + // Now lets get the fAttrList filled in. This involves faulting in any + // defaulted and fixed attributes and normalizing the values of any that + // we got explicitly. + // + // We update the attCount value with the total number of attributes, but + // it goes in with the number of values we got during the raw scan of + // explictly provided attrs above. + attCount = buildAttList(attCount, elemDecl, *fAttrList); + + // If we have a document handler, then tell it about this start tag. 
We + // don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send + // any prefix since its just one big name if we are not doing namespaces. + if (fDocHandler) + { + unsigned int uriId = resolvePrefix + ( + elemDecl->getElementName()->getPrefix() + , ElemStack::Mode_Element + ); + + fDocHandler->startElement + ( + *elemDecl + , uriId + , elemDecl->getElementName()->getPrefix() + , *fAttrList + , attCount + , isEmpty + , isRoot + ); + } + + // If empty, validate content right now if we are validating and then + // pop the element stack top. Else, we have to update the current stack + // top's namespace mapping elements. + if (isEmpty) + { + // If validating, then insure that its legal to have no content + if (fValidate) + { + XMLSize_t failure; + bool res = fValidator->checkContent(elemDecl, 0, 0, &failure); + if (!res) + { + fValidator->emitError + ( + XMLValid::ElementNotValidForContent + , qnameRawBuf + , elemDecl->getFormattedContentModel() + ); + } + } + + // Pop the element stack back off since it'll never be used now + fElemStack.popTop(); + + // If the elem stack is empty, then it was an empty root + if (isRoot) + gotData = false; + } + + return true; +} + +// --------------------------------------------------------------------------- +// DGXMLScanner: Grammar preparsing +// --------------------------------------------------------------------------- +Grammar* DGXMLScanner::loadGrammar(const InputSource& src + , const short grammarType + , const bool toCache) +{ + Grammar* loadedGrammar = 0; + + ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); + + try + { + fGrammarResolver->cacheGrammarFromParse(false); + fGrammarResolver->useCachedGrammarInParse(false); + fRootGrammar = 0; + + if (fValScheme == Val_Auto) { + fValidate = true; + } + + // Reset some status flags + fInException = false; + fStandalone = false; + fErrorCount = 0; + fHasNoDTD = true; + + if (grammarType == Grammar::DTDGrammarType) { + loadedGrammar = 
loadDTDGrammar(src, toCache); + } + } + // NOTE: + // + // In all of the error processing below, the emitError() call MUST come + // before the flush of the reader mgr, or it will fail because it tries + // to find out the position in the XML source of the error. + catch(const XMLErrs::Codes) + { + // This is a 'first failure' exception, so fall through + } + catch(const XMLValid::Codes) + { + // This is a 'first fatal error' type exit, so fall through + } + catch(const XMLException& excToCatch) + { + // Emit the error and catch any user exception thrown from here. Make + // sure in all cases we flush the reader manager. + fInException = true; + try + { + if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) + emitError + ( + XMLErrs::XMLException_Warning + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) + emitError + ( + XMLErrs::XMLException_Fatal + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else + emitError + ( + XMLErrs::XMLException_Error + , excToCatch.getCode() + , excToCatch.getMessage() + ); + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. + resetReaderMgr.release(); + + throw; + } + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. 
+ resetReaderMgr.release(); + + throw; + } + + return loadedGrammar; +} + +Grammar* DGXMLScanner::loadDTDGrammar(const InputSource& src, + const bool toCache) +{ + // Reset the validators + fDTDValidator->reset(); + if (fValidatorFromUser) + fValidator->reset(); + + fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager); + fGrammarResolver->putGrammar(fDTDGrammar); + fGrammar = fDTDGrammar; + fValidator->setGrammar(fGrammar); + + // And for all installed handlers, send reset events. This gives them + // a chance to flush any cached data. + if (fDocHandler) + fDocHandler->resetDocument(); + if (fEntityHandler) + fEntityHandler->resetEntities(); + if (fErrorReporter) + fErrorReporter->resetErrors(); + + // Clear out the id reference list + resetValidationContext(); + + if (toCache) { + + unsigned int sysId = fGrammarResolver->getStringPool()->addOrFind(src.getSystemId()); + const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(sysId); + + fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); + ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); + fGrammarResolver->putGrammar(fGrammar); + } + + // Handle the creation of the XML reader object for this input source. + // This will provide us with transcoding and basic lexing services. + XMLReader* newReader = fReaderMgr.createReader + ( + src + , false + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , XMLReader::Source_External + , fCalculateSrcOfs + , fLowWaterMark + ); + if (!newReader) { + if (src.getIssueFatalErrorIfNotFound()) + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); + else + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); + } + + // In order to make the processing work consistently, we have to + // make this look like an external entity. 
So create an entity + // decl and fill it in and push it with the reader, as happens + // with an external entity. Put a janitor on it to insure it gets + // cleaned up. The reader manager does not adopt them. + const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; + DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); + declDTD->setSystemId(src.getSystemId()); + declDTD->setIsExternal(true); + + // Mark this one as a throw at end + newReader->setThrowAtEnd(true); + + // And push it onto the stack, with its pseudo name + fReaderMgr.pushReader(newReader, declDTD); + + // If we have a doc type handler and advanced callbacks are enabled, + // call the doctype event. + if (fDocTypeHandler) { + + // Create a dummy root + DTDElementDecl* rootDecl = new (fGrammarPoolMemoryManager) DTDElementDecl + ( + gDTDStr + , fEmptyNamespaceId + , DTDElementDecl::Any + , fGrammarPoolMemoryManager + ); + rootDecl->setCreateReason(DTDElementDecl::AsRootElem); + rootDecl->setExternalElemDeclaration(true); + Janitor janSrc(rootDecl); + + fDocTypeHandler->doctypeDecl(*rootDecl, src.getPublicId(), src.getSystemId(), false, true); + } + + // Create DTDScanner + DTDScanner dtdScanner + ( + (DTDGrammar*)fGrammar + , fDocTypeHandler + , fGrammarPoolMemoryManager + , fMemoryManager + ); + dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); + + // Tell it its not in an include section + dtdScanner.scanExtSubsetDecl(false, true); + + if (fValidate) { + // validate the DTD scan so far + fValidator->preContentValidation(false, true); + } + + if (toCache) + fGrammarResolver->cacheGrammars(); + + return fDTDGrammar; +} + + +// --------------------------------------------------------------------------- +// DGXMLScanner: Private helper methods +// --------------------------------------------------------------------------- +// This method handles the common initialization, to avoid having to do +// it redundantly in multiple constructors. 
+void DGXMLScanner::commonInit() +{ + // And we need one for the raw attribute scan. This just stores key/ + // value string pairs (prior to any processing.) + fAttrNSList = new (fMemoryManager) ValueVectorOf(8, fMemoryManager); + + // Create the Validator and init them + fDTDValidator = new (fMemoryManager) DTDValidator(); + initValidator(fDTDValidator); + fDTDElemNonDeclPool = new (fMemoryManager) NameIdPool(29, 128, fMemoryManager); + fAttDefRegistry = new (fMemoryManager) RefHashTableOf + ( + 131, false, fMemoryManager + ); + fUndeclaredAttrRegistry = new (fMemoryManager) Hash2KeysSetOf(7, fMemoryManager); + + if (fValidator) + { + if (!fValidator->handlesDTD()) + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); + } + else + { + fValidator = fDTDValidator; + } +} + +void DGXMLScanner::cleanUp() +{ + delete fAttrNSList; + delete fDTDValidator; + delete fDTDElemNonDeclPool; + delete fAttDefRegistry; + delete fUndeclaredAttrRegistry; +} + + +// This method is called from scanStartTagNS() to build up the list of +// XMLAttr objects that will be passed out in the start tag callout. We +// get the key/value pairs from the raw scan of explicitly provided attrs, +// which have not been normalized. And we get the element declaration from +// which we will get any defaulted or fixed attribute defs and add those +// in as well. +XMLSize_t +DGXMLScanner::buildAttList(const XMLSize_t attCount + , XMLElementDecl* elemDecl + , RefVectorOf& toFill) +{ + // Ask the element to clear the 'provided' flag on all of the att defs + // that it owns, and to return us a boolean indicating whether it has + // any defs. + const bool hasDefs = elemDecl->hasAttDefs(); + + // If there are no expliclitily provided attributes and there are no + // defined attributes for the element, the we don't have anything to do. + // So just return zero in this case. 
+ if (!hasDefs && !attCount) + return 0; + + // Keep up with how many attrs we end up with total + XMLSize_t retCount = attCount; + + // And get the current size of the output vector. This lets us use + // existing elements until we fill it, then start adding new ones. + const XMLSize_t curAttListSize = toFill.size(); + + // Ok, so lets get an enumerator for the attributes of this element + // and run through them for well formedness and validity checks. But + // make sure that we had any attributes before we do it, since the list + // would have have gotten faulted in anyway. + if (hasDefs) + { + XMLAttDefList& attDefList = elemDecl->getAttDefList(); + for(XMLSize_t i=0; iget(&curDef); + if (!attCountPtr || *attCountPtr < fElemCount) + { // did not occur + const XMLAttDef::DefAttTypes defType = curDef.getDefaultType(); + + if (fValidate) + { + // If we are validating and its required, then an error + if (defType == XMLAttDef::Required) + { + fValidator->emitError + ( + XMLValid::RequiredAttrNotProvided + , curDef.getFullName() + ); + } + else if ((defType == XMLAttDef::Default) || + (defType == XMLAttDef::Fixed) ) + { + if (fStandalone && curDef.isExternal()) + { + // XML 1.0 Section 2.9 + // Document is standalone, so attributes must not be defaulted. 
+ fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName()); + } + } + } + + // Fault in the value if needed, and bump the att count + if ((defType == XMLAttDef::Default) + || (defType == XMLAttDef::Fixed)) + { + // Let the validator pass judgement on the attribute value + if (fValidate) + { + fValidator->validateAttrValue + ( + &curDef + , curDef.getValue() + , false + , elemDecl + ); + } + + XMLAttr* curAtt; + if (retCount >= curAttListSize) + { + if (fDoNamespaces) + { + curAtt = new (fMemoryManager) XMLAttr + ( + fEmptyNamespaceId + , curDef.getFullName() + , curDef.getValue() + , curDef.getType() + , false + , fMemoryManager + ); + } + else + { + curAtt = new (fMemoryManager) XMLAttr + ( + 0 + , curDef.getFullName() + , XMLUni::fgZeroLenString + , curDef.getValue() + , curDef.getType() + , false + , fMemoryManager + ); + } + + fAttrList->addElement(curAtt); + } + else + { + curAtt = fAttrList->elementAt(retCount); + if (fDoNamespaces) + { + curAtt->set + ( + fEmptyNamespaceId + , curDef.getFullName() + , curDef.getValue() + , curDef.getType() + ); + } + else + { + curAtt->set + ( + 0 + , curDef.getFullName() + , XMLUni::fgZeroLenString + , curDef.getValue() + , curDef.getType() + ); + } + curAtt->setSpecified(false); + } + + if (fDoNamespaces) + { + // Map the new attribute's prefix to a URI id and store + // that in the attribute object. + const XMLCh* attPrefix = curAtt->getPrefix(); + if (attPrefix && *attPrefix) { + curAtt->setURIId + ( + resolvePrefix(attPrefix, ElemStack::Mode_Attribute) + ); + } + } + + retCount++; + } + } + } + } + + return retCount; +} + + +// This method will reset the scanner data structures, and related plugged +// in stuff, for a new scan session. We get the input source for the primary +// XML entity, create the reader for it, and push it on the stack so that +// upon successful return from here we are ready to go. 
+void DGXMLScanner::scanReset(const InputSource& src) +{ + + // This call implicitly tells us that we are going to reuse the scanner + // if it was previously used. So tell the validator to reset itself. + // + // But, if the fUseCacheGrammar flag is set, then don't reset it. + // + // NOTE: The ReaderMgr is flushed on the way out, because that is + // required to insure that files are closed. + fGrammarResolver->cacheGrammarFromParse(fToCacheGrammar); + fGrammarResolver->useCachedGrammarInParse(fUseCachedGrammar); + + fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager); + fGrammarResolver->putGrammar(fDTDGrammar); + fGrammar = fDTDGrammar; + fRootGrammar = 0; + fValidator->setGrammar(fGrammar); + + // Reset validation + fValidate = (fValScheme == Val_Always) ? true : false; + + // And for all installed handlers, send reset events. This gives them + // a chance to flush any cached data. + if (fDocHandler) + fDocHandler->resetDocument(); + if (fEntityHandler) + fEntityHandler->resetEntities(); + if (fErrorReporter) + fErrorReporter->resetErrors(); + + // Clear out the id reference list + resetValidationContext(); + + // Reset the Root Element Name + fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName; + fRootElemName = 0; + + // Reset the element stack, and give it the latest ids for the special + // URIs it has to know about. + fElemStack.reset + ( + fEmptyNamespaceId + , fUnknownNamespaceId + , fXMLNamespaceId + , fXMLNSNamespaceId + ); + + // Reset some status flags + fInException = false; + fStandalone = false; + fErrorCount = 0; + fHasNoDTD = true; + + // Reset the validators + fDTDValidator->reset(); + fDTDValidator->setErrorReporter(fErrorReporter); + if (fValidatorFromUser) + fValidator->reset(); + + // Handle the creation of the XML reader object for this input source. + // This will provide us with transcoding and basic lexing services. 
+ XMLReader* newReader = fReaderMgr.createReader + ( + src + , true + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , XMLReader::Source_External + , fCalculateSrcOfs + , fLowWaterMark + ); + + if (!newReader) { + if (src.getIssueFatalErrorIfNotFound()) + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); + else + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); + } + + // Push this read onto the reader manager + fReaderMgr.pushReader(newReader, 0); + + // and reset security-related things if necessary: + if(fSecurityManager != 0) + { + fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit(); + fEntityExpansionCount = 0; + } + if(fUIntPoolRowTotal >= 32) + { // 8 KB tied up with validating attributes... + fAttDefRegistry->removeAll(); + recreateUIntPool(); + } + else + { + // note that this will implicitly reset the values of the hashtables, + // though their buckets will still be tied up + resetUIntPool(); + } + fUndeclaredAttrRegistry->removeAll(); + fAttrNSList->removeAllElements(); +} + + +// This method is called between markup in content. It scans for character +// data that is sent to the document handler. It watches for any markup +// characters that would indicate that the character data has ended. It also +// handles expansion of general and character entities. +// +// sendData() is a local static helper for this method which handles some +// code that must be done in three different places here. +void DGXMLScanner::sendCharData(XMLBuffer& toSend) +{ + // If no data in the buffer, then nothing to do + if (toSend.isEmpty()) + return; + + // We do different things according to whether we are validating or + // not. If not, its always just characters; else, it depends on the + // current element's content model. 
+ if (fValidate) + { + // Get the raw data we need for the callback + const XMLCh* const rawBuf = toSend.getRawBuffer(); + const XMLSize_t len = toSend.getLen(); + + // And see if the current element is a 'Children' style content model + const ElemStack::StackElem* topElem = fElemStack.topElement(); + + // Get the character data opts for the current element + XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts(); + + if (charOpts == XMLElementDecl::NoCharData) + { + // They definitely cannot handle any type of char data + fValidator->emitError(XMLValid::NoCharDataInCM); + } + else if (fReaderMgr.getCurrentReader()->isAllSpaces(rawBuf, len)) + { + // Its all spaces. So, if they can take spaces, then send it + // as ignorable whitespace. If they can handle any char data + // send it as characters. + if (charOpts == XMLElementDecl::SpacesOk) { + if (fDocHandler) + fDocHandler->ignorableWhitespace(rawBuf, len, false); + } + else if (charOpts == XMLElementDecl::AllCharData) + { + if (fDocHandler) + fDocHandler->docCharacters(rawBuf, len, false); + } + } + else + { + // If they can take any char data, then send it. Otherwise, they + // can only handle whitespace and can't handle this stuff so + // issue an error. + if (charOpts == XMLElementDecl::AllCharData) + { + if (fDocHandler) + fDocHandler->docCharacters(rawBuf, len, false); + } + else + { + fValidator->emitError(XMLValid::NoCharDataInCM); + } + } + } + else + { + // Always assume its just char data if not validating + if (fDocHandler) + fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false); + } + + // Reset buffer + toSend.reset(); +} + + + +// This method is called with a key/value string pair that represents an +// xmlns="yyy" or xmlns:xxx="yyy" attribute. This method will update the +// current top of the element stack based on this data. We know that when +// we get here, that it is one of these forms, so we don't bother confirming +// it. 
+// +// But we have to ensure +// 1. xxx is not xmlns +// 2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa +// 3. yyy is not XMLUni::fgXMLNSURIName +// 4. if xxx is not null, then yyy cannot be an empty string. +void DGXMLScanner::updateNSMap(const XMLCh* const attrPrefix + , const XMLCh* const attrLocalName + , const XMLCh* const attrValue) +{ + // We either have the default prefix (""), or we point it into the attr + // name parameter. Note that the xmlns is not the prefix we care about + // here. To us, the 'prefix' is really the local part of the attrName + // parameter. + // + // Check 1. xxx is not xmlns + // 2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa + // 3. yyy is not XMLUni::fgXMLNSURIName + // 4. if xxx is not null, then yyy cannot be an empty string. + if (attrPrefix && *attrPrefix) { + + if (XMLString::equals(attrLocalName, XMLUni::fgXMLNSString)) + emitError(XMLErrs::NoUseOfxmlnsAsPrefix); + else if (XMLString::equals(attrLocalName, XMLUni::fgXMLString)) { + if (!XMLString::equals(attrValue, XMLUni::fgXMLURIName)) + emitError(XMLErrs::PrefixXMLNotMatchXMLURI); + } + + if (!attrValue) + emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName); + else if(!*attrValue && fXMLVersion == XMLReader::XMLV1_0) + emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName); + } + + if (XMLString::equals(attrValue, XMLUni::fgXMLNSURIName)) + emitError(XMLErrs::NoUseOfxmlnsURI); + else if (XMLString::equals(attrValue, XMLUni::fgXMLURIName)) { + if (!XMLString::equals(attrLocalName, XMLUni::fgXMLString)) + emitError(XMLErrs::XMLURINotMatchXMLPrefix); + } + + // Ok, we have to get the unique id for the attribute value, which is the + // URI that this value should be mapped to. The validator has the + // namespace string pool, so we ask him to find or add this new one. Then + // we ask the element stack to add this prefix to URI Id mapping. 
+ fElemStack.addPrefix + ( + attrLocalName + , fURIStringPool->addOrFind(attrValue) + ); +} + +void DGXMLScanner::scanAttrListforNameSpaces(RefVectorOf* theAttrList, XMLSize_t attCount, + XMLElementDecl* elemDecl) +{ + // Map prefixes to uris + for (XMLSize_t i=0; i < fAttrNSList->size(); i++) { + XMLAttr* providedAttr = fAttrNSList->elementAt(i); + providedAttr->setURIId( + resolvePrefix(providedAttr->getPrefix(), ElemStack::Mode_Attribute) + ); + } + + fAttrNSList->removeAllElements(); + + // Decide if to use hash table to do duplicate checking + bool toUseHashTable = false; + + setAttrDupChkRegistry(attCount, toUseHashTable); + for (XMLSize_t index = 0; index < attCount; index++) + { + // check for duplicate namespace attributes: + // by checking for qualified names with the same local part and with prefixes + // which have been bound to namespace names that are identical. + XMLAttr* curAttr = theAttrList->elementAt(index); + if (!toUseHashTable) + { + XMLAttr* loopAttr; + for (XMLSize_t attrIndex=0; attrIndex < index; attrIndex++) { + loopAttr = theAttrList->elementAt(attrIndex); + if (loopAttr->getURIId() == curAttr->getURIId() && + XMLString::equals(loopAttr->getName(), curAttr->getName())) { + emitError( + XMLErrs::AttrAlreadyUsedInSTag, curAttr->getName() + , elemDecl->getFullName() + ); + } + } + } + else + { + if (fAttrDupChkRegistry->containsKey((void*)curAttr->getName(), curAttr->getURIId())) + { + emitError( + XMLErrs::AttrAlreadyUsedInSTag + , curAttr->getName(), elemDecl->getFullName() + ); + } + + fAttrDupChkRegistry->put((void*)curAttr->getName(), curAttr->getURIId(), curAttr); + } + } +} + +InputSource* DGXMLScanner::resolveSystemId(const XMLCh* const sysId + ,const XMLCh* const pubId) +{ + //Normalize sysId + XMLBufBid nnSys(&fBufMgr); + XMLBuffer& normalizedSysId = nnSys.getBuffer(); + XMLString::removeChar(sysId, 0xFFFF, normalizedSysId); + const XMLCh* normalizedURI = normalizedSysId.getRawBuffer(); + + // Create a buffer for expanding the 
normalized system id + XMLBufBid bbSys(&fBufMgr); + XMLBuffer& expSysId = bbSys.getBuffer(); + + // Allow the entity handler to expand the system id if they choose + // to do so. + InputSource* srcToFill = 0; + if (fEntityHandler) + { + if (!fEntityHandler->expandSystemId(normalizedURI, expSysId)) + expSysId.set(normalizedURI); + + ReaderMgr::LastExtEntityInfo lastInfo; + fReaderMgr.getLastExtEntityInfo(lastInfo); + XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity, + expSysId.getRawBuffer(), 0, pubId, lastInfo.systemId, + &fReaderMgr); + srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier); + } + else + { + expSysId.set(normalizedURI); + } + + // If they didn't create a source via the entity handler, then we + // have to create one on our own. + if (!srcToFill) + { + if (fDisableDefaultEntityResolution) + return srcToFill; + + ReaderMgr::LastExtEntityInfo lastInfo; + fReaderMgr.getLastExtEntityInfo(lastInfo); + + XMLURL urlTmp(fMemoryManager); + if ((!urlTmp.setURL(lastInfo.systemId, expSysId.getRawBuffer(), urlTmp)) || + (urlTmp.isRelative())) + { + if (!fStandardUriConformant) + { + XMLBufBid ddSys(&fBufMgr); + XMLBuffer& resolvedSysId = ddSys.getBuffer(); + XMLUri::normalizeURI(expSysId.getRawBuffer(), resolvedSysId); + + srcToFill = new (fMemoryManager) LocalFileInputSource + ( + lastInfo.systemId + , resolvedSysId.getRawBuffer() + , fMemoryManager + ); + } + else + ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager); + } + else + { + if (fStandardUriConformant && urlTmp.hasInvalidChar()) + ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager); + srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager); + } + } + + return srcToFill; +} + +// --------------------------------------------------------------------------- +// DGXMLScanner: Private parsing methods +// --------------------------------------------------------------------------- 
+bool DGXMLScanner::scanAttValue( const XMLAttDef* const attDef + , const XMLCh *const attrName + , XMLBuffer& toFill) +{ + enum States + { + InWhitespace + , InContent + }; + + // Get the type and name + const XMLAttDef::AttTypes type = (attDef) + ?attDef->getType() + :XMLAttDef::CData; + + // Reset the target buffer + toFill.reset(); + + // Get the next char which must be a single or double quote + XMLCh quoteCh; + if (!fReaderMgr.skipIfQuote(quoteCh)) + return false; + + // We have to get the current reader because we have to ignore closing + // quotes until we hit the same reader again. + const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum(); + + // check to see if it's a tokenized type that is declared externally + bool isAttTokenizedExternal = (attDef) + ?attDef->isExternal() && (type == XMLAttDef::ID || + type == XMLAttDef::IDRef || + type == XMLAttDef::IDRefs || + type == XMLAttDef::Entity || + type == XMLAttDef::Entities || + type == XMLAttDef::NmToken || + type == XMLAttDef::NmTokens) + :false; + + // Loop until we get the attribute value. Note that we use a double + // loop here to avoid the setup/teardown overhead of the exception + // handler on every round. + XMLCh nextCh; + XMLCh secondCh = 0; + States curState = InContent; + bool firstNonWS = false; + bool gotLeadingSurrogate = false; + bool escaped; + while (true) + { + try + { + while(true) + { + nextCh = fReaderMgr.getNextChar(); + + if (!nextCh) + ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); + + // Check for our ending quote in the same entity + if (nextCh == quoteCh) + { + if (curReader == fReaderMgr.getCurrentReaderNum()) + return true; + + // Watch for spillover into a previous entity + if (curReader > fReaderMgr.getCurrentReaderNum()) + { + emitError(XMLErrs::PartialMarkupInEntity); + return false; + } + } + + // Check for an entity ref now, before we let it affect our + // whitespace normalization logic below. 
We ignore the empty flag + // in this one. + escaped = false; + if (nextCh == chAmpersand) + { + if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned) + { + gotLeadingSurrogate = false; + continue; + } + } + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + { + // Deal with surrogate pairs + // Its a leading surrogate. If we already got one, then + // issue an error, else set leading flag to make sure that + // we look for a trailing next time. + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); + else + gotLeadingSurrogate = true; + } + else + { + // If its a trailing surrogate, make sure that we are + // prepared for that. Else, its just a regular char so make + // sure that we were not expected a trailing surrogate. + if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) + { + // Its trailing, so make sure we were expecting it + if (!gotLeadingSurrogate) + emitError(XMLErrs::Unexpected2ndSurrogateChar); + } + else + { + // Its just a char, so make sure we were not expecting a + // trailing surrogate. + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); + + // Its got to at least be a valid XML character + if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) + { + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + , fMemoryManager + ); + emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); + } + } + gotLeadingSurrogate = false; + } + + // If its not escaped, then make sure its not a < character, which + // is not allowed in attribute values. + if (!escaped && (nextCh == chOpenAngle)) + emitError(XMLErrs::BracketInAttrValue, attrName); + + // If the attribute is a CDATA type we do simple replacement of + // tabs and new lines with spaces, if the character is not escaped + // by way of a char ref. + // + // Otherwise, we do the standard non-CDATA normalization of + // compressing whitespace to single spaces and getting rid of leading + // and trailing whitespace. 
+ if (type == XMLAttDef::CData) + { + if (!escaped) + { + if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D)) + { + // Check Validity Constraint for Standalone document declaration + // XML 1.0, Section 2.9 + if (fStandalone && fValidate && isAttTokenizedExternal) + { + // Can't have a standalone document declaration of "yes" if attribute + // values are subject to normalisation + fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName); + } + nextCh = chSpace; + } + } + } + else + { + if (curState == InWhitespace) + { + if ((escaped && nextCh != chSpace) || !fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) + { + if (firstNonWS) + toFill.append(chSpace); + curState = InContent; + firstNonWS = true; + } + else + { + continue; + } + } + else if (curState == InContent) + { + if ((nextCh == chSpace) || + (fReaderMgr.getCurrentReader()->isWhitespace(nextCh) && !escaped)) + { + curState = InWhitespace; + + // Check Validity Constraint for Standalone document declaration + // XML 1.0, Section 2.9 + if (fStandalone && fValidate && isAttTokenizedExternal) + { + if (!firstNonWS || (nextCh != chSpace && fReaderMgr.lookingAtSpace())) + { + // Can't have a standalone document declaration of "yes" if attribute + // values are subject to normalisation + fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName); + } + } + continue; + } + firstNonWS = true; + } + } + + // Else add it to the buffer + toFill.append(nextCh); + + if (secondCh) + { + toFill.append(secondCh); + secondCh=0; + } + } + } + catch(const EndOfEntityException&) + { + // Just eat it and continue. + gotLeadingSurrogate = false; + escaped = false; + } + } + return true; +} + + +// This method scans a CDATA section. It collects the character into one +// of the temp buffers and calls the document handler, if any, with the +// characters. 
It assumes that the fThisElement->getCharDataOpts(); + + while (true) + { + const XMLCh nextCh = fReaderMgr.getNextChar(); + + // Watch for unexpected end of file + if (!nextCh) + { + emitError(XMLErrs::UnterminatedCDATASection); + ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); + } + + if (fValidate && fStandalone && (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))) + { + // This document is standalone; this ignorable CDATA whitespace is forbidden. + // XML 1.0, Section 2.9 + // And see if the current element is a 'Children' style content model + if (topElem->fThisElement->isExternal()) { + + if (charOpts == XMLElementDecl::SpacesOk) // Element Content + { + // Error - standalone should have a value of "no" as whitespace detected in an + // element type with element content whose element declaration was external + fValidator->emitError(XMLValid::NoWSForStandalone); + } + } + } + + // If this is a close square bracket it could be our closing + // sequence. + if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose)) + { + // make sure we were not expecting a trailing surrogate. + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); + + if (fValidate) { + + if (charOpts != XMLElementDecl::AllCharData) + { + // They definitely cannot handle any type of char data + fValidator->emitError(XMLValid::NoCharDataInCM); + } + } + + // If we have a doc handler, call it + if (fDocHandler) + { + fDocHandler->docCharacters + ( + bbCData.getRawBuffer() + , bbCData.getLen() + , true + ); + } + + // And we are done + break; + } + + // Make sure its a valid character. But if we've emitted an error + // already, don't bother with the overhead since we've already told + // them about it. + if (!emittedError) + { + // Deal with surrogate pairs + if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + { + // Its a leading surrogate. 
If we already got one, then + // issue an error, else set leading flag to make sure that + // we look for a trailing next time. + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); + else + gotLeadingSurrogate = true; + } + else + { + // If its a trailing surrogate, make sure that we are + // prepared for that. Else, its just a regular char so make + // sure that we were not expected a trailing surrogate. + if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) + { + // Its trailing, so make sure we were expecting it + if (!gotLeadingSurrogate) + emitError(XMLErrs::Unexpected2ndSurrogateChar); + } + else + { + // Its just a char, so make sure we were not expecting a + // trailing surrogate. + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); + + // Its got to at least be a valid XML character + else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) + { + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + , fMemoryManager + ); + emitError(XMLErrs::InvalidCharacter, tmpBuf); + emittedError = true; + } + } + gotLeadingSurrogate = false; + } + } + + // Add it to the buffer + bbCData.append(nextCh); + } +} + + +void DGXMLScanner::scanCharData(XMLBuffer& toUse) +{ + // We have to watch for the stupid ]]> sequence, which is illegal in + // character data. So this is a little state machine that handles that. + enum States + { + State_Waiting + , State_GotOne + , State_GotTwo + }; + + // Reset the buffer before we start + toUse.reset(); + + // Turn on the 'throw at end' flag of the reader manager + ThrowEOEJanitor jan(&fReaderMgr, true); + + // In order to be more efficient we have to use kind of a deeply nested + // set of blocks here. The outer block puts on a try and catches end of + // entity exceptions. The inner loop is the per-character loop. 
If we + // put the try inside the inner loop, it would work but would require + // the exception handling code setup/teardown code to be invoked for + // each character. + XMLCh nextCh; + XMLCh secondCh = 0; + States curState = State_Waiting; + bool escaped = false; + bool gotLeadingSurrogate = false; + bool notDone = true; + while (notDone) + { + try + { + while (true) + { + // Eat through as many plain content characters as possible without + // needing special handling. Moving most content characters here, + // in this one call, rather than running the overall loop once + // per content character, is a speed optimization. + if (curState == State_Waiting && !gotLeadingSurrogate) + { + fReaderMgr.movePlainContentChars(toUse); + } + + // Try to get another char from the source + // The code from here on down covers all contengencies, + if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) + { + // If we were waiting for a trailing surrogate, its an error + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); + + notDone = false; + break; + } + + // Watch for a reference. Note that the escapement mechanism + // is ignored in this content. + escaped = false; + if (nextCh == chAmpersand) + { + sendCharData(toUse); + + // Turn off the throwing at the end of entity during this + ThrowEOEJanitor jan(&fReaderMgr, false); + + if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned) + { + gotLeadingSurrogate = false; + continue; + } + else + { + if (escaped && !fElemStack.isEmpty()) + fElemStack.setReferenceEscaped(); + } + } + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + { + // Deal with surrogate pairs + // Its a leading surrogate. If we already got one, then + // issue an error, else set leading flag to make sure that + // we look for a trailing next time. 
+ if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); + else + gotLeadingSurrogate = true; + } + else + { + // If its a trailing surrogate, make sure that we are + // prepared for that. Else, its just a regular char so make + // sure that we were not expected a trailing surrogate. + if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) + { + // Its trailing, so make sure we were expecting it + if (!gotLeadingSurrogate) + emitError(XMLErrs::Unexpected2ndSurrogateChar); + } + else + { + // Its just a char, so make sure we were not expecting a + // trailing surrogate. + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); + + // Make sure the returned char is a valid XML char + if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) + { + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + , fMemoryManager + ); + emitError(XMLErrs::InvalidCharacter, tmpBuf); + } + } + gotLeadingSurrogate = false; + } + + // Keep the state machine up to date + if (!escaped) + { + if (nextCh == chCloseSquare) + { + if (curState == State_Waiting) + curState = State_GotOne; + else if (curState == State_GotOne) + curState = State_GotTwo; + } + else if (nextCh == chCloseAngle) + { + if (curState == State_GotTwo) + emitError(XMLErrs::BadSequenceInCharData); + curState = State_Waiting; + } + else + { + curState = State_Waiting; + } + } + else + { + curState = State_Waiting; + } + + // Add this char to the buffer + toUse.append(nextCh); + + if (secondCh) + { + toUse.append(secondCh); + secondCh=0; + } + } + } + catch(const EndOfEntityException& toCatch) + { + // Some entity ended, so we have to send any accumulated + // chars and send an end of entity event. 
+ sendCharData(toUse); + gotLeadingSurrogate = false; + + if (fDocHandler) + fDocHandler->endEntityReference(toCatch.getEntity()); + } + } + + // Check the validity constraints as per XML 1.0 Section 2.9 + if (fValidate && fStandalone) + { + // See if the text contains whitespace + // Get the raw data we need for the callback + const XMLCh* rawBuf = toUse.getRawBuffer(); + const XMLSize_t len = toUse.getLen(); + const bool isSpaces = fReaderMgr.getCurrentReader()->containsWhiteSpace(rawBuf, len); + + if (isSpaces) + { + // And see if the current element is a 'Children' style content model + const ElemStack::StackElem* topElem = fElemStack.topElement(); + + if (topElem->fThisElement->isExternal()) { + + // Get the character data opts for the current element + XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts(); + + if (charOpts == XMLElementDecl::SpacesOk) // => Element Content + { + // Error - standalone should have a value of "no" as whitespace detected in an + // element type with element content whose element declaration was external + // + fValidator->emitError(XMLValid::NoWSForStandalone); + } + } + } + } + // Send any char data that we accumulated into the buffer + sendCharData(toUse); +} + + +// This method will scan a general/character entity ref. It will either +// expand a char ref and return it directly, or push a reader for a general +// entity. +// +// The return value indicates whether the char parameters hold the value +// or whether the value was pushed as a reader, or that it failed. +// +// The escaped flag tells the caller whether the returned parameter resulted +// from a character reference, which escapes the character in some cases. It +// only makes any difference if the return value indicates the value was +// returned directly. 
+DGXMLScanner::EntityExpRes +DGXMLScanner::scanEntityRef( const bool inAttVal + , XMLCh& firstCh + , XMLCh& secondCh + , bool& escaped) +{ + // Assume no escape + secondCh = 0; + escaped = false; + + // We have to insure that its all in one entity + const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum(); + + // If the next char is a pound, then its a character reference and we + // need to expand it always. + if (fReaderMgr.skippedChar(chPound)) + { + // Its a character reference, so scan it and get back the numeric + // value it represents. + if (!scanCharRef(firstCh, secondCh)) + return EntityExp_Failed; + + escaped = true; + + if (curReader != fReaderMgr.getCurrentReaderNum()) + emitError(XMLErrs::PartialMarkupInEntity); + + return EntityExp_Returned; + } + + // Expand it since its a normal entity ref + XMLBufBid bbName(&fBufMgr); + + int colonPosition; + bool validName = fDoNamespaces ? fReaderMgr.getQName(bbName.getBuffer(), &colonPosition) : + fReaderMgr.getName(bbName.getBuffer()); + if (!validName) + { + if (bbName.isEmpty()) + emitError(XMLErrs::ExpectedEntityRefName); + else + emitError(XMLErrs::InvalidEntityRefName, bbName.getRawBuffer()); + return EntityExp_Failed; + } + + // Next char must be a semi-colon. But if its not, just emit + // an error and try to continue. 
+ if (!fReaderMgr.skippedChar(chSemiColon)) + emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer()); + + // Make sure we ended up on the same entity reader as the & char + if (curReader != fReaderMgr.getCurrentReaderNum()) + emitError(XMLErrs::PartialMarkupInEntity); + + // Look up the name in the general entity pool + XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer()); + + // If it does not exist, then obviously an error + if (!decl) + { + // XML 1.0 Section 4.1 + // Well-formedness Constraint for entity not found: + // In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references, + // or a document with "standalone='yes'", for an entity reference that does not occur within the external subset + // or a parameter entity + // + // Else it's Validity Constraint + if (fStandalone || fHasNoDTD) + emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer()); + else { + if (fValidate) + fValidator->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer()); + } + + return EntityExp_Failed; + } + + // XML 1.0 Section 4.1 + // If we are a standalone document, then it has to have been declared + // in the internal subset. 
+ if (fStandalone && !decl->getDeclaredInIntSubset()) + emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer()); + + if (decl->isExternal()) + { + // If its unparsed, then its not valid here + if (decl->isUnparsed()) + { + emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer()); + return EntityExp_Failed; + } + + // If we are in an attribute value, then not valid but keep going + if (inAttVal) + emitError(XMLErrs::NoExtRefsInAttValue); + + // And now create a reader to read this entity + InputSource* srcUsed; + XMLReader* reader = fReaderMgr.createReader + ( + decl->getBaseURI() + , decl->getSystemId() + , decl->getPublicId() + , false + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , XMLReader::Source_External + , srcUsed + , fCalculateSrcOfs + , fLowWaterMark + , fDisableDefaultEntityResolution + ); + + // Put a janitor on the source so it gets cleaned up on exit + Janitor janSrc(srcUsed); + + // If the creation failed, and its not because the source was empty, + // then emit an error and return. + if (!reader) + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager); + + // Push the reader. If its a recursive expansion, then emit an error + // and return an failure. + if (!fReaderMgr.pushReader(reader, decl)) + { + emitError(XMLErrs::RecursiveEntity, decl->getName()); + return EntityExp_Failed; + } + + // here's where we need to check if there's a SecurityManager, + // how many entity references we've had + if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) { + XMLCh expLimStr[32]; + XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager); + emitError + ( + XMLErrs::EntityExpansionLimitExceeded + , expLimStr + ); + // there seems nothing better to do than reset the entity expansion counter + fEntityExpansionCount = 0; + } + + // Do a start entity reference event. 
+ // + // For now, we supress them in att values. Later, when + // the stuff is in place to correctly allow DOM to handle them + // we'll turn this back on. + if (fDocHandler && !inAttVal) + fDocHandler->startEntityReference(*decl); + + // If it starts with the XML string, then parse a text decl + if (checkXMLDecl(true)) + scanXMLDecl(Decl_Text); + } + else + { + // If its one of the special char references, then we can return + // it as a character, and its considered escaped. + if (decl->getIsSpecialChar()) + { + firstCh = decl->getValue()[0]; + escaped = true; + return EntityExp_Returned; + } + + // Create a reader over a memory stream over the entity value + // We force it to assume UTF-16 by passing in an encoding + // string. This way it won't both trying to predecode the + // first line, looking for an XML/TextDecl. + XMLReader* valueReader = fReaderMgr.createIntEntReader + ( + decl->getName() + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , decl->getValue() + , decl->getValueLen() + , false + ); + + // Try to push the entity reader onto the reader manager stack, + // where it will become the subsequent input. If it fails, that + // means the entity is recursive, so issue an error. The reader + // will have just been discarded, but we just keep going. + if (!fReaderMgr.pushReader(valueReader, decl)) + emitError(XMLErrs::RecursiveEntity, decl->getName()); + + // here's where we need to check if there's a SecurityManager, + // how many entity references we've had + if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) { + XMLCh expLimStr[32]; + XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager); + emitError + ( + XMLErrs::EntityExpansionLimitExceeded + , expLimStr + ); + } + + // Do a start entity reference event. + // + // For now, we supress them in att values. Later, when + // the stuff is in place to correctly allow DOM to handle them + // we'll turn this back on. 
+ if (fDocHandler && !inAttVal) + fDocHandler->startEntityReference(*decl); + + // If it starts with the XML string, then it's an error + if (checkXMLDecl(true)) { + emitError(XMLErrs::TextDeclNotLegalHere); + fReaderMgr.skipPastChar(chCloseAngle); + } + } + return EntityExp_Pushed; +} + + +} diff --git a/IGXMLScanner.cpp b/IGXMLScanner.cpp new file mode 100644 index 000000000..5fe3da6f1 --- /dev/null +++ b/IGXMLScanner.cpp @@ -0,0 +1,3275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * $Id$ + */ + +// SPDX-FileCopyrightText: Portions Copyright 2021 Siemens +// Modified on 15-Jul-2021 by Siemens and/or its affiliates to fix CVE-2018-1311: Apache Xerces-C use-after-free vulnerability scanning external DTD. Copyright 2021 Siemens. 
+ +// --------------------------------------------------------------------------- +// Includes +// --------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace XERCES_CPP_NAMESPACE { + + +typedef JanitorMemFunCall CleanupType; +typedef JanitorMemFunCall ReaderMgrResetType; + + +// --------------------------------------------------------------------------- +// IGXMLScanner: Constructors and Destructor +// --------------------------------------------------------------------------- +// +// Both constructors initialise the pointer members listed below to 0 and +// then create the owned helper objects in commonInit(). The second overload +// also passes the document, doctype, entity and error handlers on to the +// XMLScanner base class. +// +// commonInit() runs under a CleanupType janitor bound to cleanUp(). The +// janitor is released once commonInit() returns, and also before an +// OutOfMemoryException is rethrown, so it does not fire in either case. +// NOTE(review): presumably the janitor calls cleanUp() if any other +// exception escapes commonInit() - confirm against JanitorMemFunCall. +IGXMLScanner::IGXMLScanner( XMLValidator* const valToAdopt + , GrammarResolver* const grammarResolver + , MemoryManager* const manager) : + + XMLScanner(valToAdopt, grammarResolver, manager) + , fSeeXsi(false) + , fGrammarType(Grammar::UnKnown) + , fElemStateSize(16) + , fElemState(0) + , fElemLoopState(0) + , fContent(1023, manager) + , fRawAttrList(0) + , fRawAttrColonListSize(32) + , fRawAttrColonList(0) + , fDTDValidator(0) + , fSchemaValidator(0) + , fDTDGrammar(0) + , fICHandler(0) + , fLocationPairs(0) + , fDTDElemNonDeclPool(0) + , fSchemaElemNonDeclPool(0) + , fElemCount(0) + , fAttDefRegistry(0) + , fUndeclaredAttrRegistry(0) + , fPSVIAttrList(0) + , fModel(0) + , fPSVIElement(0) + , fErrorStack(0) + , fSchemaInfoList(0) + , fCachedSchemaInfoList (0) +{ + CleanupType cleanup(this, &IGXMLScanner::cleanUp); + + try + { + commonInit(); + } + catch(const OutOfMemoryException&) + { + // Don't clean up when out of memory, since executing the + // cleanup code can cause problems. 
+ cleanup.release(); + + throw; + } + + cleanup.release(); +} + +IGXMLScanner::IGXMLScanner( XMLDocumentHandler* const docHandler + , DocTypeHandler* const docTypeHandler + , XMLEntityHandler* const entityHandler + , XMLErrorReporter* const errHandler + , XMLValidator* const valToAdopt + , GrammarResolver* const grammarResolver + , MemoryManager* const manager) : + + XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager) + , fSeeXsi(false) + , fGrammarType(Grammar::UnKnown) + , fElemStateSize(16) + , fElemState(0) + , fElemLoopState(0) + , fContent(1023, manager) + , fRawAttrList(0) + , fRawAttrColonListSize(32) + , fRawAttrColonList(0) + , fDTDValidator(0) + , fSchemaValidator(0) + , fDTDGrammar(0) + , fICHandler(0) + , fLocationPairs(0) + , fDTDElemNonDeclPool(0) + , fSchemaElemNonDeclPool(0) + , fElemCount(0) + , fAttDefRegistry(0) + , fUndeclaredAttrRegistry(0) + , fPSVIAttrList(0) + , fModel(0) + , fPSVIElement(0) + , fErrorStack(0) + , fSchemaInfoList(0) + , fCachedSchemaInfoList (0) +{ + CleanupType cleanup(this, &IGXMLScanner::cleanUp); + + try + { + commonInit(); + } + catch(const OutOfMemoryException&) + { + // Don't clean up when out of memory, since executing the + // cleanup code can cause problems. 
+ cleanup.release(); + + throw; + } + + cleanup.release(); +} + +IGXMLScanner::~IGXMLScanner() +{ + cleanUp(); +} + +// --------------------------------------------------------------------------- +// XMLScanner: Getter methods +// --------------------------------------------------------------------------- +// Both overloads return the entity declaration pool of fDTDGrammar, or 0 +// when no DTD grammar is set. +NameIdPool* IGXMLScanner::getEntityDeclPool() +{ + if(!fDTDGrammar) + return 0; + return fDTDGrammar->getEntityDeclPool(); +} + +const NameIdPool* IGXMLScanner::getEntityDeclPool() const +{ + if(!fDTDGrammar) + return 0; + return fDTDGrammar->getEntityDeclPool(); +} + +// --------------------------------------------------------------------------- +// IGXMLScanner: Main entry point to scan a document +// --------------------------------------------------------------------------- +// +// Parses the whole of src in one call: resets the scanner, reports +// startDocument, scans the prolog, then the content and any trailing +// miscellaneous markup, and finally reports endDocument. +// +// XMLErrs::Codes and XMLValid::Codes exceptions end the scan without being +// rethrown. An XMLException is reported through emitError() as a warning, +// error or fatal error according to its error type. OutOfMemoryException +// is rethrown after resetReaderMgr has been released. +// +// resetReaderMgr is a janitor bound to ReaderMgr::reset; this method only +// releases it on the out-of-memory paths. +void IGXMLScanner::scanDocument(const InputSource& src) +{ + // Bump up the sequence id for this parser instance. This will invalidate + // any previous progressive scan tokens. + fSequenceId++; + + ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); + + try + { + // Reset the scanner and its plugged in stuff for a new run. This + // resets all the data structures, creates the initial reader and + // pushes it on the stack, and sets up the base document path. + scanReset(src); + + // If we have a document handler, then call the start document + if (fDocHandler) + fDocHandler->startDocument(); + + // Scan the prolog part, which is everything before the root element + // including the DTD subsets. + scanProlog(); + + // If we got to the end of input, then it's not a valid XML file. + // Else, go on to scan the content. + if (fReaderMgr.atEOF()) + { + emitError(XMLErrs::EmptyMainEntity); + } + else + { + // Scan content, and tell it it's not an external entity + if (scanContent()) + { + // Do post-parse validation if required + if (fValidate) + { + // We handle ID reference semantics at this level since + // it's required by XML 1.0. 
+ checkIDRefs(); + + // Then allow the validator to do any extra stuff it wants +// fValidator->postParseValidation(); + } + + // That went ok, so scan for any miscellaneous stuff + if (!fReaderMgr.atEOF()) + scanMiscellaneous(); + } + } + + // If we have a document handler, then call the end document + if (fDocHandler) + fDocHandler->endDocument(); + + //cargill debug: + //fGrammarResolver->getXSModel(); + } + // NOTE: + // + // In all of the error processing below, the emitError() call MUST come + // before the flush of the reader mgr, or it will fail because it tries + // to find out the position in the XML source of the error. + catch(const XMLErrs::Codes) + { + // This is a 'first failure' exception, so fall through + } + catch(const XMLValid::Codes) + { + // This is a 'first fatal error' type exit, so fall through + } + catch(const XMLException& excToCatch) + { + // Emit the error and catch any user exception thrown from here. Make + // sure in all cases we flush the reader manager. + fInException = true; + try + { + if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) + emitError + ( + XMLErrs::XMLException_Warning + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) + emitError + ( + XMLErrs::XMLException_Fatal + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else + emitError + ( + XMLErrs::XMLException_Error + , excToCatch.getCode() + , excToCatch.getMessage() + ); + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. + resetReaderMgr.release(); + + throw; + } + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. 
+ resetReaderMgr.release(); + + throw; + } +} + + +/* + * Performs one step of a progressive scan. + * + * Throws RuntimeException (Scan_BadPScanToken) when isLegalToken(token) + * fails. Otherwise the next token is sensed and scanned: character data, + * one piece of markup, or end of input. An EndOfEntityException raised + * while sensing is absorbed after endEntityReference has been reported to + * the document handler. When a tag scan clears gotData, the post-content + * work (ID reference check, miscellaneous markup, endDocument events) is + * done here as well. + * + * Returns false at end of input and whenever an XMLErrs::Codes, + * XMLValid::Codes or XMLException is caught; returns true otherwise. + * OutOfMemoryException is rethrown. resetReaderMgr is released on the + * true return and on the out-of-memory paths. + */ +bool IGXMLScanner::scanNext(XMLPScanToken& token) +{ + // Make sure this token is still legal + if (!isLegalToken(token)) + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager); + + // Find the next token and remember the reader id + XMLSize_t orgReader; + XMLTokens curToken; + + ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); + + bool retVal = true; + + try + { + while (true) + { + // We have to handle any end of entity exceptions that happen here. + // We could be at the end of X nested entities, each of which will + // generate an end of entity exception as we try to move forward. + try + { + curToken = senseNextToken(orgReader); + break; + } + catch(const EndOfEntityException& toCatch) + { + // Send an end of entity reference event + if (fDocHandler) + fDocHandler->endEntityReference(toCatch.getEntity()); + } + } + + if (curToken == Token_CharData) + { + scanCharData(fCDataBuf); + } + else if (curToken == Token_EOF) + { + if (!fElemStack.isEmpty()) + { + const ElemStack::StackElem* topElem = fElemStack.popTop(); + emitError + ( + XMLErrs::EndedWithTagsOnStack + , topElem->fThisElement->getFullName() + ); + } + + retVal = false; + } + else + { + // It's some sort of markup + bool gotData = true; + switch(curToken) + { + case Token_CData : + // Make sure we are within content + if (fElemStack.isEmpty()) + emitError(XMLErrs::CDATAOutsideOfContent); + scanCDSection(); + break; + + case Token_Comment : + scanComment(); + break; + + case Token_EndTag : + scanEndTag(gotData); + break; + + case Token_PI : + scanPI(); + break; + + case Token_StartTag : + if (fDoNamespaces) + scanStartTagNS(gotData); + else + scanStartTag(gotData); + break; + + default : + fReaderMgr.skipToChar(chOpenAngle); + break; + } + + if (orgReader != fReaderMgr.getCurrentReaderNum()) + emitError(XMLErrs::PartialMarkupInEntity); + + // If we hit the end, then do the miscellaneous part + if 
(!gotData) + { + // Do post-parse validation if required + if (fValidate) + { + // We handle ID reference semantics at this level since + // it's required by XML 1.0. + checkIDRefs(); + + // Then allow the validator to do any extra stuff it wants +// fValidator->postParseValidation(); + } + + // That went ok, so scan for any miscellaneous stuff + scanMiscellaneous(); + + if (toCheckIdentityConstraint()) + fICHandler->endDocument(); + + if (fDocHandler) + fDocHandler->endDocument(); + } + } + } + // NOTE: + // + // In all of the error processing below, the emitError() call MUST come + // before the flush of the reader mgr, or it will fail because it tries + // to find out the position in the XML source of the error. + catch(const XMLErrs::Codes) + { + // This is a 'first failure' exception so return failure + retVal = false; + } + catch(const XMLValid::Codes) + { + // This is a 'first fatal error' type exit, so return failure + retVal = false; + } + catch(const XMLException& excToCatch) + { + // Emit the error and catch any user exception thrown from here. Make + // sure in all cases we flush the reader manager. + fInException = true; + try + { + if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) + emitError + ( + XMLErrs::XMLException_Warning + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) + emitError + ( + XMLErrs::XMLException_Fatal + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else + emitError + ( + XMLErrs::XMLException_Error + , excToCatch.getCode() + , excToCatch.getMessage() + ); + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. 
+ resetReaderMgr.release(); + + throw; + } + + retVal = false; + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. + resetReaderMgr.release(); + + throw; + } + + // If we are not at the end, release the object that will + // reset the ReaderMgr. + if (retVal) + resetReaderMgr.release(); + + return retVal; +} + + + +// --------------------------------------------------------------------------- +// IGXMLScanner: Private helper methods. Most of these are implemented in +// IGXMLScanner2.Cpp. +// --------------------------------------------------------------------------- + +// This method handles the common initialization, to avoid having to do +// it redundantly in multiple constructors. +// +// It allocates the element state arrays, the raw attribute list and its +// colon-position array, both validators, the identity constraint handler, +// the schema location list, the undeclared-element pools, the attribute +// registries, the PSVI attribute list and the two schema info tables. +// fValidator is pointed at fDTDValidator when it is still null. +// Every object created here is released again in cleanUp(). +void IGXMLScanner::commonInit() +{ + + // Create the element state array + fElemState = (unsigned int*) fMemoryManager->allocate + ( + fElemStateSize * sizeof(unsigned int) + ); //new unsigned int[fElemStateSize]; + fElemLoopState = (unsigned int*) fMemoryManager->allocate + ( + fElemStateSize * sizeof(unsigned int) + ); //new unsigned int[fElemStateSize]; + + // And we need one for the raw attribute scan. This just stores key/ + // value string pairs (prior to any processing.) 
+ fRawAttrList = new (fMemoryManager) RefVectorOf(32, true, fMemoryManager); + fRawAttrColonList = (int*) fMemoryManager->allocate + ( + fRawAttrColonListSize * sizeof(int) + ); + + // Create the validators and initialise them + fDTDValidator = new (fMemoryManager) DTDValidator(); + initValidator(fDTDValidator); + fSchemaValidator = new (fMemoryManager) SchemaValidator(0, fMemoryManager); + initValidator(fSchemaValidator); + + // Create IdentityConstraint info + fICHandler = new (fMemoryManager) IdentityConstraintHandler(this, fMemoryManager); + + // Create schemaLocation pair info + fLocationPairs = new (fMemoryManager) ValueVectorOf(8, fMemoryManager); + // create pools for undeclared elements + fDTDElemNonDeclPool = new (fMemoryManager) NameIdPool(29, 128, fMemoryManager); + fSchemaElemNonDeclPool = new (fMemoryManager) RefHash3KeysIdPool(29, true, 128, fMemoryManager); + fAttDefRegistry = new (fMemoryManager) RefHashTableOf + ( + 131, false, fMemoryManager + ); + fUndeclaredAttrRegistry = new (fMemoryManager) Hash2KeysSetOf(7, fMemoryManager); + fPSVIAttrList = new (fMemoryManager) PSVIAttributeList(fMemoryManager); + + fSchemaInfoList = new (fMemoryManager) RefHash2KeysTableOf(29, fMemoryManager); + fCachedSchemaInfoList = new (fMemoryManager) RefHash2KeysTableOf(29, fMemoryManager); + + // use fDTDValidator as the default validator + if (!fValidator) + fValidator = fDTDValidator; +} + +// Releases everything allocated by commonInit(), together with fPSVIElement +// and fErrorStack. The destructor calls it, and the constructors register +// it with their cleanup janitor. The pointers are not reset afterwards. +void IGXMLScanner::cleanUp() +{ + fMemoryManager->deallocate(fElemState); //delete [] fElemState; + fMemoryManager->deallocate(fElemLoopState); //delete [] fElemLoopState; + delete fRawAttrList; + fMemoryManager->deallocate(fRawAttrColonList); + delete fDTDValidator; + delete fSchemaValidator; + delete fICHandler; + delete fLocationPairs; + delete fDTDElemNonDeclPool; + delete fSchemaElemNonDeclPool; + delete fAttDefRegistry; + delete fUndeclaredAttrRegistry; + delete fPSVIAttrList; + delete fPSVIElement; + delete fErrorStack; + delete fSchemaInfoList; + delete fCachedSchemaInfoList; 
+} + +// --------------------------------------------------------------------------- +// IGXMLScanner: Private scanning methods +// --------------------------------------------------------------------------- + +// This method is called from scanStartTag() to handle the very raw initial +// scan of the attributes. It just fills in the passed collection with +// key/value pairs for each attribute. No processing is done on them at all. +// +// elemName is only used when reporting an unterminated start tag. Entries +// already present in toFill are reused before new ones are appended. +// isEmpty is set to true when a forward slash is found at the end of the +// attribute list, and to false otherwise. +// +// Returns the number of attributes stored. The colon position of each +// attribute name is written to fRawAttrColonList at the same index. +// Throws UnexpectedEOFException when the input ends inside the tag. +XMLSize_t +IGXMLScanner::rawAttrScan(const XMLCh* const elemName + , RefVectorOf& toFill + , bool& isEmpty) +{ + // Keep up with how many attributes we've seen so far, and how many + // elements are available in the vector. This way we can reuse old + // elements until we run out and then expand it. + XMLSize_t attCount = 0; + XMLSize_t curVecSize = toFill.size(); + + // Assume it is not empty + isEmpty = false; + + // We loop until we either see a /> or >, handling key/value pairs until + // we get there. We place them in the passed vector, which we will expand + // as required to hold them. + while (true) + { + // Get the next character, which should be non-space + XMLCh nextCh = fReaderMgr.peekNextChar(); + + // If the next character is not a slash or closed angle bracket, + // then it must be whitespace, since whitespace is required + // between the end of the last attribute and the name of the next + // one. + // + if (attCount) + { + if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) + { + bool bFoundSpace; + fReaderMgr.skipPastSpaces(bFoundSpace); + if (!bFoundSpace) + { + // Emit the error but keep on going + emitError(XMLErrs::ExpectedWhitespace); + } + // Ok, peek another char + nextCh = fReaderMgr.peekNextChar(); + } + } + + // Ok, here we first check for any of the special case characters. + // If it's not one, then we do the normal case processing, which + // assumes that we've hit an attribute value. Otherwise, we do all + // the special case checks. 
+ if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) + { + // Assume it's going to be an attribute, so get a name from + // the input. + int colonPosition; + if (!fReaderMgr.getQName(fAttNameBuf, &colonPosition)) + { + if (fAttNameBuf.isEmpty()) + emitError(XMLErrs::ExpectedAttrName); + else + emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer()); + fReaderMgr.skipPastChar(chCloseAngle); + return attCount; + } + + const XMLCh* curAttNameBuf = fAttNameBuf.getRawBuffer(); + + // And next must be an equal sign + if (!scanEq()) + { + static const XMLCh tmpList[] = + { + chSingleQuote, chDoubleQuote, chCloseAngle + , chOpenAngle, chForwardSlash, chNull + }; + + emitError(XMLErrs::ExpectedEqSign); + + // Try to sync back up by skipping forward until we either + // hit something meaningful. + const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); + + if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) + { + // Jump back to top for normal processing of these + continue; + } + else if ((chFound == chSingleQuote) + || (chFound == chDoubleQuote) + || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) + { + // Just fall through assuming that the value is to follow + } + else if (chFound == chOpenAngle) + { + // Assume a malformed tag and that new one is starting + emitError(XMLErrs::UnterminatedStartTag, elemName); + return attCount; + } + else + { + // Something went really wrong + return attCount; + } + } + + // Next should be the quoted attribute value. We just do a simple + // and stupid scan of this value. The only thing we do here + // is to expand entity references. + if (!basicAttrValueScan(curAttNameBuf, fAttValueBuf)) + { + static const XMLCh tmpList[] = + { + chCloseAngle, chOpenAngle, chForwardSlash, chNull + }; + + emitError(XMLErrs::ExpectedAttrValue); + + // It failed, so lets try to get synced back up. We skip + // forward until we find some whitespace or one of the + // chars in our list. 
+ const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); + + if ((chFound == chCloseAngle) + || (chFound == chForwardSlash) + || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) + { + // Just fall through and process this attribute, though + // the value will be "". + } + else if (chFound == chOpenAngle) + { + // Assume a malformed tag and that new one is starting + emitError(XMLErrs::UnterminatedStartTag, elemName); + return attCount; + } + else + { + // Something went really wrong + return attCount; + } + } + + // And now lets add it to the passed collection. If we have not + // filled it up yet, then we use the next element. Else we add + // a new one. + KVStringPair* curPair = 0; + if (attCount >= curVecSize) + { + curPair = new (fMemoryManager) KVStringPair + ( + curAttNameBuf + , fAttNameBuf.getLen() + , fAttValueBuf.getRawBuffer() + , fAttValueBuf.getLen() + , fMemoryManager + ); + toFill.addElement(curPair); + } + else + { + curPair = toFill.elementAt(attCount); + curPair->set + ( + curAttNameBuf, + fAttNameBuf.getLen(), + fAttValueBuf.getRawBuffer(), + fAttValueBuf.getLen() + ); + } + + if (attCount >= fRawAttrColonListSize) { + resizeRawAttrColonList(); + } + // Set the position of the colon and bump the count of attributes we've gotten + fRawAttrColonList[attCount++] = colonPosition; + + // And go to the top again for another attribute + continue; + } + + // It was some special case character so do all of the checks and + // deal with it. 
+ if (!nextCh) + ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); + + if (nextCh == chForwardSlash) + { + fReaderMgr.getNextChar(); + isEmpty = true; + if (!fReaderMgr.skippedChar(chCloseAngle)) + emitError(XMLErrs::UnterminatedStartTag, elemName); + break; + } + else if (nextCh == chCloseAngle) + { + fReaderMgr.getNextChar(); + break; + } + else if (nextCh == chOpenAngle) + { + // Check for this one specially, since it's going to be common + // and it is kind of auto-recovering since we've already hit the + // next open bracket, which is what we would have seeked to (and + // skipped this whole tag.) + emitError(XMLErrs::UnterminatedStartTag, elemName); + break; + } + else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) + { + // Check for this one specially, which is probably a missing + // attribute name, e.g. ="value". Just issue expected name + // error and eat the quoted string, then jump back to the + // top again. + emitError(XMLErrs::ExpectedAttrName); + fReaderMgr.getNextChar(); + fReaderMgr.skipQuotedString(nextCh); + fReaderMgr.skipPastSpaces(); + continue; + } + } + + return attCount; +} + + +// This method will kick off the scanning of the primary content of the +// document, i.e. the elements. +// +// Tokens are sensed and dispatched until the root element has ended or the +// end of input is reached. EndOfEntityException is handled inside the +// loop: it is reported as PartialMarkupInEntity if it arrived in the middle +// of markup, endEntityReference is sent to the document handler, and +// scanning resumes. The only return statement returns true. +bool IGXMLScanner::scanContent() +{ + // Go into a loop until we hit the end of the root element, or we fall + // out because there is no root element. + // + // We have to do kind of a deeply nested double loop here in order to + // avoid doing the setup/teardown of the exception handler on each + // round. Doing it this way we only do it when an exception actually + // occurs. + bool gotData = true; + bool inMarkup = false; + while (gotData) + { + try + { + while (gotData) + { + // Sense what the next top level token is. According to what + // this tells us, we will call something to handle that kind + // of thing. 
+ XMLSize_t orgReader; + const XMLTokens curToken = senseNextToken(orgReader); + + // Handle character data and end of file specially. Char data + // is not markup so we don't want to handle it in the loop + // below. + if (curToken == Token_CharData) + { + // Scan the character data and call appropriate events. Let + // him use our local character data buffer for efficiency. + scanCharData(fCDataBuf); + continue; + } + else if (curToken == Token_EOF) + { + // The element stack better be empty at this point or we + // ended prematurely before all elements were closed. + if (!fElemStack.isEmpty()) + { + const ElemStack::StackElem* topElem = fElemStack.popTop(); + emitError + ( + XMLErrs::EndedWithTagsOnStack + , topElem->fThisElement->getFullName() + ); + } + + // It's the end of file, so clear the got data flag + gotData = false; + continue; + } + + // We are in some sort of markup now + inMarkup = true; + + // According to the token we got, call the appropriate + // scanning method. + switch(curToken) + { + case Token_CData : + // Make sure we are within content + if (fElemStack.isEmpty()) + emitError(XMLErrs::CDATAOutsideOfContent); + scanCDSection(); + break; + + case Token_Comment : + scanComment(); + break; + + case Token_EndTag : + scanEndTag(gotData); + break; + + case Token_PI : + scanPI(); + break; + + case Token_StartTag : + if (fDoNamespaces) + scanStartTagNS(gotData); + else + scanStartTag(gotData); + break; + + default : + fReaderMgr.skipToChar(chOpenAngle); + break; + } + + if (orgReader != fReaderMgr.getCurrentReaderNum()) + emitError(XMLErrs::PartialMarkupInEntity); + + // And we are back out of markup again + inMarkup = false; + } + } + catch(const EndOfEntityException& toCatch) + { + // If we were in some markup when this happened, then it's a + // partial markup error. 
+ if (inMarkup) + emitError(XMLErrs::PartialMarkupInEntity); + + // Send an end of entity reference event + if (fDocHandler) + fDocHandler->endEntityReference(toCatch.getEntity()); + + inMarkup = false; + } + } + + // It went ok, so return success + return true; +} + + +// Scans an end tag and closes the element on top of the element stack. +// +// The tag name must match the name recorded for the top element; on a +// mismatch an error is emitted, the element is popped and the method +// returns early. Otherwise, when fValidate is set, the element content is +// checked by the validator, PSVI and identity constraint handling is done +// for schema grammars, and endElement is reported to the document handler. +// +// gotData is set to false only when the closed element was the root. For +// any other element the grammar (when namespaces are enabled) and the +// validation flag are restored from the element stack. +// +// Throws RuntimeException (Scan_UnbalancedStartEnd) when the element stack +// is empty on entry, and RuntimeException (Gen_NoSchemaValidator or +// Gen_NoDTDValidator) when the restored grammar is not handled by a +// validator that was supplied by the user. +void IGXMLScanner::scanEndTag(bool& gotData) +{ + // Assume we will still have data until proven otherwise. It will only + // ever be false if this is the end of the root element. + gotData = true; + + // Check if the element stack is empty. If so, then this is an unbalanced + // element (i.e. more ends than starts, perhaps because of bad text + // causing one to be skipped.) + if (fElemStack.isEmpty()) + { + emitError(XMLErrs::MoreEndThanStartTags); + fReaderMgr.skipPastChar(chCloseAngle); + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager); + } + + // Pop the stack of the element we are supposed to be ending. Remember + // that we don't own this. The stack just keeps them and reuses them. + unsigned int uriId = (fDoNamespaces) + ? 
fElemStack.getCurrentURI() : fEmptyNamespaceId; + + // these get initialized below + const ElemStack::StackElem* topElem = 0; + const XMLCh *elemName = 0; + + // Make sure that it's the end of the element that we expect + // special case for schema validation, whose element decls, + // obviously don't contain prefix information + if(fGrammarType == Grammar::SchemaGrammarType) + { + elemName = fElemStack.getCurrentSchemaElemName(); + topElem = fElemStack.topElement(); + } + else + { + topElem = fElemStack.topElement(); + elemName = topElem->fThisElement->getFullName(); + } + if (!fReaderMgr.skippedStringLong(elemName)) + { + emitError + ( + XMLErrs::ExpectedEndOfTagX + , elemName + ); + fReaderMgr.skipPastChar(chCloseAngle); + fElemStack.popTop(); + return; + } + + // Make sure we are back on the same reader as where we started + if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum()) + emitError(XMLErrs::PartialTagMarkupError); + + // Skip optional whitespace + fReaderMgr.skipPastSpaces(); + + // Make sure we find the closing bracket + if (!fReaderMgr.skippedChar(chCloseAngle)) + { + emitError + ( + XMLErrs::UnterminatedEndTag + , topElem->fThisElement->getFullName() + ); + } + + if (fGrammarType == Grammar::SchemaGrammarType) + { + // reset error occurred + fPSVIElemContext.fErrorOccurred = fErrorStack->pop(); + if (fValidate && topElem->fThisElement->isDeclared()) + { + fPSVIElemContext.fCurrentTypeInfo = ((SchemaValidator*) fValidator)->getCurrentTypeInfo(); + if(!fPSVIElemContext.fCurrentTypeInfo) + fPSVIElemContext.fCurrentDV = ((SchemaValidator*) fValidator)->getCurrentDatatypeValidator(); + else + fPSVIElemContext.fCurrentDV = 0; + if(fPSVIHandler) + { + fPSVIElemContext.fNormalizedValue = ((SchemaValidator*) fValidator)->getNormalizedValue(); + + if (XMLString::equals(fPSVIElemContext.fNormalizedValue, XMLUni::fgZeroLenString)) + fPSVIElemContext.fNormalizedValue = 0; + } + } + else + { + fPSVIElemContext.fCurrentDV = 0; + 
fPSVIElemContext.fCurrentTypeInfo = 0; + fPSVIElemContext.fNormalizedValue = 0; + } + } + + // If validation is enabled, then lets pass him the list of children and + // this element and let him validate it. + DatatypeValidator* psviMemberType = 0; + if (fValidate) + { + + // + // XML1.0-3rd + // Validity Constraint: + // The declaration matches EMPTY and the element has no content (not even + // entity references, comments, PIs or white space). + // + if ( (fGrammarType == Grammar::DTDGrammarType) && + (topElem->fCommentOrPISeen) && + (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Empty)) + { + fValidator->emitError + ( + XMLValid::EmptyElemHasContent + , topElem->fThisElement->getFullName() + ); + } + + // + // XML1.0-3rd + // Validity Constraint: + // + // The declaration matches children and the sequence of child elements + // belongs to the language generated by the regular expression in the + // content model, with optional white space, comments and PIs + // (i.e. markup matching production [27] Misc) between the start-tag and + // the first child element, between child elements, or between the last + // child element and the end-tag. + // + // Note that + // a CDATA section containing only white space or + // a reference to an entity whose replacement text is character references + // expanding to white space do not match the nonterminal S, and hence + // cannot appear in these positions; however, + // a reference to an internal entity with a literal value consisting + // of character references expanding to white space does match S, + // since its replacement text is the white space resulting from expansion + // of the character references. 
+ // + if ( (fGrammarType == Grammar::DTDGrammarType) && + (topElem->fReferenceEscaped) && + (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Children)) + { + fValidator->emitError + ( + XMLValid::ElemChildrenHasInvalidWS + , topElem->fThisElement->getFullName() + ); + } + XMLSize_t failure; + bool res = fValidator->checkContent + ( + topElem->fThisElement + , topElem->fChildren + , topElem->fChildCount + , &failure + ); + + if (!res) + { + // One of the elements is not valid for the content. NOTE that + // if no children were provided but the content model requires + // them, it comes back with a zero value. But we cannot use that + // to index the child array in this case, and have to put out a + // special message. + if (!topElem->fChildCount) + { + fValidator->emitError + ( + XMLValid::EmptyNotValidForContent + , topElem->fThisElement->getFormattedContentModel() + ); + } + else if (failure >= topElem->fChildCount) + { + fValidator->emitError + ( + XMLValid::NotEnoughElemsForCM + , topElem->fThisElement->getFormattedContentModel() + ); + } + else + { + fValidator->emitError + ( + XMLValid::ElementNotValidForContent + , topElem->fChildren[failure]->getRawName() + , topElem->fThisElement->getFormattedContentModel() + ); + } + } + + + if (fGrammarType == Grammar::SchemaGrammarType) { + if (((SchemaValidator*) fValidator)->getErrorOccurred()) + fPSVIElemContext.fErrorOccurred = true; + else if (fPSVIElemContext.fCurrentDV && fPSVIElemContext.fCurrentDV->getType() == DatatypeValidator::Union) + psviMemberType = fValidationContext->getValidatingMemberType(); + + if (fPSVIHandler) + { + fPSVIElemContext.fIsSpecified = ((SchemaValidator*) fValidator)->getIsElemSpecified(); + if(fPSVIElemContext.fIsSpecified) + fPSVIElemContext.fNormalizedValue = ((SchemaElementDecl *)topElem->fThisElement)->getDefaultValue(); + } + + // call matchers and de-activate context + if (toCheckIdentityConstraint()) + { + fICHandler->deactivateContext + ( + 
(SchemaElementDecl *) topElem->fThisElement + , fContent.getRawBuffer() + , fValidationContext + , fPSVIElemContext.fCurrentDV + ); + } + + } + } + + // QName dv needed topElem to resolve URIs on the checkContent + fElemStack.popTop(); + + // See if it was the root element, to avoid multiple calls below + const bool isRoot = fElemStack.isEmpty(); + + if (fGrammarType == Grammar::SchemaGrammarType) + { + if (fPSVIHandler) + { + endElementPSVI( + (SchemaElementDecl*)topElem->fThisElement, psviMemberType); + } + // now we can reset the datatype buffer, since the + // application has had a chance to copy the characters somewhere else + ((SchemaValidator *)fValidator)->clearDatatypeBuffer(); + } + + // If we have a doc handler, tell it about the end tag + if (fDocHandler) + { + if (fGrammarType == Grammar::SchemaGrammarType) { + if (topElem->fPrefixColonPos != -1) + fPrefixBuf.set(elemName, topElem->fPrefixColonPos); + else + fPrefixBuf.reset(); + } + else { + fPrefixBuf.set(topElem->fThisElement->getElementName()->getPrefix()); + } + fDocHandler->endElement + ( + *topElem->fThisElement + , uriId + , isRoot + , fPrefixBuf.getRawBuffer() + ); + } + + if (fGrammarType == Grammar::SchemaGrammarType) { + if (!isRoot) + { + // update error information + fErrorStack->push((fErrorStack->size() && fErrorStack->pop()) || fPSVIElemContext.fErrorOccurred); + + + } + } + + // If this was the root, then done with content + gotData = !isRoot; + + if (gotData) { + if (fDoNamespaces) { + // Restore the grammar + fGrammar = fElemStack.getCurrentGrammar(); + fGrammarType = fGrammar->getGrammarType(); + if (fGrammarType == Grammar::SchemaGrammarType && !fValidator->handlesSchema()) { + if (fValidatorFromUser) + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoSchemaValidator, fMemoryManager); + else { + fValidator = fSchemaValidator; + } + } + else if (fGrammarType == Grammar::DTDGrammarType && !fValidator->handlesDTD()) { + if (fValidatorFromUser) + 
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); + else { + fValidator = fDTDValidator; + } + } + + fValidator->setGrammar(fGrammar); + } + + // Restore the validation flag + fValidate = fElemStack.getValidationFlag(); + } +} + + +// This method handles the high level logic of scanning the DOCType +// declaration. This calls the DTDScanner and kicks off both the scanning of +// the internal subset and the scanning of the external subset, if any. +// +// When we get here the 'resetDocType(); + + // There must be some space after DOCTYPE + bool skippedSomething; + fReaderMgr.skipPastSpaces(skippedSomething); + if (!skippedSomething) + { + emitError(XMLErrs::ExpectedWhitespace); + + // Just skip the Doctype declaration and return + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Get a buffer for the root element + XMLBufBid bbRootName(&fBufMgr); + + // Get a name from the input, which should be the name of the root + // element of the upcoming content. + int colonPosition; + bool validName = fDoNamespaces ? fReaderMgr.getQName(bbRootName.getBuffer(), &colonPosition) : + fReaderMgr.getName(bbRootName.getBuffer()); + if (!validName) + { + if (bbRootName.isEmpty()) + emitError(XMLErrs::NoRootElemInDOCTYPE); + else + emitError(XMLErrs::InvalidRootElemInDOCTYPE, bbRootName.getRawBuffer()); + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Store the root element name for later check + setRootElemName(bbRootName.getRawBuffer()); + + // This element obviously is not going to exist in the element decl + // pool yet, but we need to call docTypeDecl. So force it into + // the element decl pool, marked as being there because it was in + // the DOCTYPE. Later, when its declared, the status will be updated. + // + // Only do this if we are not reusing the validator! If we are reusing, + // then look it up instead. It has to exist! + MemoryManager* const rootDeclMgr = + fUseCachedGrammar ? 
fMemoryManager : fGrammarPoolMemoryManager; + + DTDElementDecl* rootDecl = new (rootDeclMgr) DTDElementDecl + ( + bbRootName.getRawBuffer() + , fEmptyNamespaceId + , DTDElementDecl::Any + , rootDeclMgr + ); + + Janitor rootDeclJanitor(rootDecl); + rootDecl->setCreateReason(DTDElementDecl::AsRootElem); + rootDecl->setExternalElemDeclaration(true); + if(!fUseCachedGrammar) + { + fGrammar->putElemDecl(rootDecl); + rootDeclJanitor.release(); + } else + { + // attach this to the undeclared element pool so that it gets deleted + XMLElementDecl* elemDecl = fDTDElemNonDeclPool->getByKey(bbRootName.getRawBuffer()); + if (elemDecl) + { + rootDecl->setId(elemDecl->getId()); + } + else + { + rootDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)rootDecl)); + rootDeclJanitor.release(); + } + } + + // Skip any spaces after the name + fReaderMgr.skipPastSpaces(); + + // And now if we are looking at a >, then we are done. It is not + // required to have an internal or external subset, though why you + // would not escapes me. + if (fReaderMgr.skippedChar(chCloseAngle)) { + + // If we have a doc type handler and advanced callbacks are enabled, + // call the doctype event. + if (fDocTypeHandler) + fDocTypeHandler->doctypeDecl(*rootDecl, 0, 0, false); + return; + } + + // either internal/external subset + if (fValScheme == Val_Auto && !fValidate) + fValidate = true; + + bool hasIntSubset = false; + bool hasExtSubset = false; + XMLCh* sysId = 0; + XMLCh* pubId = 0; + + DTDScanner dtdScanner + ( + (DTDGrammar*) fGrammar + , fDocTypeHandler + , fGrammarPoolMemoryManager + , fMemoryManager + ); + dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); + + // If the next character is '[' then we have no external subset cause + // there is no system id, just the opening character of the internal + // subset. Else, has to be an id. + // + // Just look at the next char, don't eat it. 
+ if (fReaderMgr.peekNextChar() == chOpenSquare) + { + hasIntSubset = true; + } + else + { + // Indicate we have an external subset + hasExtSubset = true; + fHasNoDTD = false; + + // Get buffers for the ids + XMLBufBid bbPubId(&fBufMgr); + XMLBufBid bbSysId(&fBufMgr); + + // Get the external subset id + if (!dtdScanner.scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), DTDScanner::IDType_External)) + { + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Get copies of the ids we got + pubId = XMLString::replicate(bbPubId.getRawBuffer(), fMemoryManager); + sysId = XMLString::replicate(bbSysId.getRawBuffer(), fMemoryManager); + } + + // Insure that the ids get cleaned up, if they got allocated + ArrayJanitor janSysId(sysId, fMemoryManager); + ArrayJanitor janPubId(pubId, fMemoryManager); + + if (hasExtSubset) + { + // Skip spaces and check again for the opening of an internal subset + fReaderMgr.skipPastSpaces(); + + // Just look at the next char, don't eat it. + if (fReaderMgr.peekNextChar() == chOpenSquare) { + hasIntSubset = true; + } + } + + // If we have a doc type handler and advanced callbacks are enabled, + // call the doctype event. + if (fDocTypeHandler) + fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset, hasExtSubset); + + // Ok, if we had an internal subset, we are just past the [ character + // and need to parse that first. + if (hasIntSubset) + { + // Eat the opening square bracket + fReaderMgr.getNextChar(); + + checkInternalDTD(hasExtSubset, sysId, pubId); + + // And try to scan the internal subset. If we fail, try to recover + // by skipping forward tot he close angle and returning. + if (!dtdScanner.scanInternalSubset()) + { + fReaderMgr.skipPastChar(chCloseAngle); + return; + } + + // Do a sanity check that some expanded PE did not propogate out of + // the doctype. This could happen if it was terminated early by bad + // syntax. 
+ if (fReaderMgr.getReaderDepth() > 1) + { + emitError(XMLErrs::PEPropogated); + + // Ask the reader manager to pop back down to the main level + fReaderMgr.cleanStackBackTo(1); + } + + fReaderMgr.skipPastSpaces(); + } + + // And that should leave us at the closing > of the DOCTYPE line + if (!fReaderMgr.skippedChar(chCloseAngle)) + { + // Do a special check for the common scenario of an extra ] char at + // the end. This is easy to recover from. + if (fReaderMgr.skippedChar(chCloseSquare) + && fReaderMgr.skippedChar(chCloseAngle)) + { + emitError(XMLErrs::ExtraCloseSquare); + } + else + { + emitError(XMLErrs::UnterminatedDOCTYPE); + fReaderMgr.skipPastChar(chCloseAngle); + } + } + + // If we had an external subset, then we need to deal with that one + // next. If we are reusing the validator, then don't scan it. + if (hasExtSubset) { + + InputSource* srcUsed=0; + Janitor janSrc(srcUsed); + // If we had an internal subset and we're using the cached grammar, it + // means that the ignoreCachedDTD is set, so we ignore the cached + // grammar + if (fUseCachedGrammar && !hasIntSubset) + { + srcUsed = resolveSystemId(sysId, pubId); + if (srcUsed) { + janSrc.reset(srcUsed); + Grammar* grammar = fGrammarResolver->getGrammar(srcUsed->getSystemId()); + + if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) { + + fDTDGrammar = (DTDGrammar*) grammar; + fGrammar = fDTDGrammar; + fValidator->setGrammar(fGrammar); + // If we don't report at least the external subset boundaries, + // an advanced document handler cannot know when the DTD end, + // since we've already sent a doctype decl that indicates there's + // there's an external subset. 
+ if (fDocTypeHandler) + { + fDocTypeHandler->startExtSubset(); + fDocTypeHandler->endExtSubset(); + } + + return; + } + } + } + + if (fLoadExternalDTD || fValidate) + { + // And now create a reader to read this entity + XMLReader* reader; + if (srcUsed) { + reader = fReaderMgr.createReader + ( + *srcUsed + , false + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , XMLReader::Source_External + , fCalculateSrcOfs + , fLowWaterMark + ); + } + else { + reader = fReaderMgr.createReader + ( + sysId + , pubId + , false + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , XMLReader::Source_External + , srcUsed + , fCalculateSrcOfs + , fLowWaterMark + , fDisableDefaultEntityResolution + ); + janSrc.reset(srcUsed); + } + // If it failed then throw an exception + if (!reader) + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenDTD, srcUsed ? srcUsed->getSystemId() : sysId, fMemoryManager); + + if (fToCacheGrammar) { + + unsigned int stringId = fGrammarResolver->getStringPool()->addOrFind(srcUsed->getSystemId()); + const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(stringId); + + fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); + ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); + fGrammarResolver->putGrammar(fGrammar); + } + + // In order to make the processing work consistently, we have to + // make this look like an external entity. So create an entity + // decl and fill it in and push it with the reader, as happens + // with an external entity. Put a janitor on it to insure it gets + // cleaned up. The reader manager does not adopt them. 
+ const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; + // NOTE(review): no janitor owns declDTD in this function; it is only + // passed to pushReader() below -- confirm who releases it. + DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); + declDTD->setSystemId(sysId); + declDTD->setIsExternal(true); + + // Mark this one as a throw at end + reader->setThrowAtEnd(true); + + // And push it onto the stack, with its pseudo name + fReaderMgr.pushReader(reader, declDTD); + + // Tell it it's not in an include section + dtdScanner.scanExtSubsetDecl(false, true); + } + } +} + +// Scans a start tag when namespaces are not being processed: the element +// name is read with getName() and used as one raw name. The element decl is +// looked up in fGrammar, then in fDTDElemNonDeclPool, and created in that +// pool if still unknown. Attributes are scanned into fAttrList, defaulted and +// fixed attributes that were not given are appended, and fDocHandler (if +// set) receives startElement(). +// +// gotData - set to true on entry; set to false only when an empty root +// element was scanned. +// Returns false when the element name or an attribute could not be scanned +// and recovery was abandoned; true otherwise. +bool IGXMLScanner::scanStartTag(bool& gotData) +{ + // Assume we will still have data until proven otherwise. It will only + // ever be false if this is the root and its empty. + gotData = true; + + // Get the QName. In this case, we are not doing namespaces, so we just + // use it as is and don't have to break it into parts. + if (!fReaderMgr.getName(fQNameBuf)) + { + emitError(XMLErrs::ExpectedElementName); + fReaderMgr.skipToChar(chOpenAngle); + return false; + } + + // Assume it won't be an empty tag + bool isEmpty = false; + + // Lets try to look up the element in the validator's element decl pool + // We can pass bogus values for the URI id and the base name. We know that + // this can only be called if we are doing a DTD style validator and that + // he will only look at the QName. + // + // We tell him to fault in a decl if he does not find one. + // Actually, we *don't* tell him to fault in a decl if he does not find one- NG + bool wasAdded = false; + const XMLCh *rawQName = fQNameBuf.getRawBuffer(); + XMLElementDecl* elemDecl = fGrammar->getElemDecl + ( + fEmptyNamespaceId + , 0 + , rawQName + , Grammar::TOP_LEVEL_SCOPE + ); + // look for it in the undeclared pool: + if(!elemDecl) + { + elemDecl = fDTDElemNonDeclPool->getByKey(rawQName); + } + if(!elemDecl) + { + // we're assuming this must be a DTD element. DTD's can be + // used with or without namespaces, but schemas cannot be used without + // namespaces. 
+ wasAdded = true; + // Not found in the grammar or the undeclared pool: create a decl and + // store it in the undeclared pool, whose put() supplies its id. + elemDecl = new (fMemoryManager) DTDElementDecl + ( + rawQName + , fEmptyNamespaceId + , DTDElementDecl::Any + , fMemoryManager + ); + elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl)); + } + + // We do something different here according to whether we found the + // element or not. + if (wasAdded) + { + // If validating then emit an error + if (fValidate) + { + // This is to tell the reuse Validator that this element was + // faulted-in, was not an element in the validator pool originally + elemDecl->setCreateReason(XMLElementDecl::JustFaultIn); + + fValidator->emitError + ( + XMLValid::ElementNotDefined + , elemDecl->getFullName() + ); + } + } + else + { + // If its not marked declared and validating, then emit an error + if (fValidate && !elemDecl->isDeclared()) + { + fValidator->emitError + ( + XMLValid::ElementNotDefined + , elemDecl->getFullName() + ); + } + } + + // See if its the root element (sampled before addLevel() below pushes + // this element onto the stack) + const bool isRoot = fElemStack.isEmpty(); + + // Expand the element stack and add the new element + fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum()); + fElemStack.setValidationFlag(fValidate); + + // Validate the element + if (fValidate) + fValidator->validateElement(elemDecl); + + // If this is the first element and we are validating, check the root + // element. + if (isRoot) + { + fRootGrammar = fGrammar; + + if (fValidate) + { + // If a DocType exists, then check if it matches the root name there. + if (fRootElemName && !XMLString::equals(fQNameBuf.getRawBuffer(), fRootElemName)) + fValidator->emitError(XMLValid::RootElemNotLikeDocType); + } + } + else + { + // If the element stack is not empty, then add this element as a + // child of the previous top element. If its empty, this is the root + // elem and is not the child of anything. 
+ fElemStack.addChild(elemDecl->getElementName(), true); + } + + // Skip any whitespace after the name + fReaderMgr.skipPastSpaces(); + + // We loop until we either see a /> or >, handling attribute/value + // pairs until we get there. + // + // attCount counts the attributes stored so far; curAttListSize is the + // number of XMLAttr objects already in fAttrList, which are reused + // before new ones are allocated. + XMLSize_t attCount = 0; + XMLSize_t curAttListSize = fAttrList->size(); + wasAdded = false; + + // fElemCount is bumped once per start tag. The value is written into + // the fAttDefRegistry entries below, so an entry equal to fElemCount + // means that attribute def was already seen on this element. + fElemCount++; + + while (true) + { + // And get the next non-space character + XMLCh nextCh = fReaderMgr.peekNextChar(); + + // If the next character is not a slash or closed angle bracket, + // then it must be whitespace, since whitespace is required + // between the end of the last attribute and the name of the next + // one. + if (attCount) + { + if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) + { + bool bFoundSpace; + fReaderMgr.skipPastSpaces(bFoundSpace); + if (!bFoundSpace) + { + // Emit the error but keep on going + emitError(XMLErrs::ExpectedWhitespace); + } + // Ok, peek another char + nextCh = fReaderMgr.peekNextChar(); + } + } + + // Ok, here we first check for any of the special case characters. + // If its not one, then we do the normal case processing, which + // assumes that we've hit an attribute. Otherwise, we do all + // the special case checks. + if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) + { + // Assume its going to be an attribute, so get a name from + // the input. + if (!fReaderMgr.getName(fAttNameBuf)) + { + emitError(XMLErrs::ExpectedAttrName); + fReaderMgr.skipPastChar(chCloseAngle); + return false; + } + + // And next must be an equal sign + if (!scanEq()) + { + static const XMLCh tmpList[] = + { + chSingleQuote, chDoubleQuote, chCloseAngle + , chOpenAngle, chForwardSlash, chNull + }; + + emitError(XMLErrs::ExpectedEqSign); + + // Try to sync back up by skipping forward until we either + // hit something meaningful. 
+ const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); + + if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) + { + // Jump back to top for normal processing of these + continue; + } + else if ((chFound == chSingleQuote) + || (chFound == chDoubleQuote) + || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) + { + // Just fall through assuming that the value is to follow + } + else if (chFound == chOpenAngle) + { + // Assume a malformed tag and that new one is starting + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + return false; + } + else + { + // Something went really wrong + return false; + } + } + // See if this attribute is declared for this element. Nothing is + // faulted in here: an undeclared attribute leaves attDef null and + // is tracked through fUndeclaredAttrRegistry further down. + XMLCh * namePtr = fAttNameBuf.getRawBuffer(); + XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr); + + // Add this attribute to the attribute list that we use to + // pass them to the handler. We reuse its existing elements + // but expand it as required. + // Note that we want to do this first since this will + // make a copy of the namePtr; we can then make use of + // that copy in the hashtable lookup that checks + // for duplicates. This will mean we may have to update + // the type of the XMLAttr later. 
+ XMLAttr* curAtt; + if (attCount >= curAttListSize) + { + // no spare XMLAttr left in fAttrList: allocate and append one + curAtt = new (fMemoryManager) XMLAttr + ( + 0 + , namePtr + , XMLUni::fgZeroLenString + , XMLUni::fgZeroLenString + , (attDef)?attDef->getType():XMLAttDef::CData + , true + , fMemoryManager + ); + fAttrList->addElement(curAtt); + } + else + { + // reuse the XMLAttr already sitting at this index + curAtt = fAttrList->elementAt(attCount); + curAtt->set + ( + 0 + , namePtr + , XMLUni::fgZeroLenString + , XMLUni::fgZeroLenString + , (attDef)?attDef->getType():XMLAttDef::CData + ); + curAtt->setSpecified(true); + } + // reset namePtr so it refers to newly-allocated memory + namePtr = (XMLCh *)curAtt->getName(); + + if (!attDef) + { + // Undeclared attribute: when fValidate is set, report it + // through the validator. + if (fValidate) + { + fValidator->emitError + ( + XMLValid::AttNotDefinedForElement + , fAttNameBuf.getRawBuffer() + , elemDecl->getFullName() + ); + } + // A false return from putIfNotPresent() is treated as "this + // name was already used in this start tag". + if(!fUndeclaredAttrRegistry->putIfNotPresent(namePtr, 0)) + { + emitError + ( + XMLErrs::AttrAlreadyUsedInSTag + , namePtr + , elemDecl->getFullName() + ); + } + } + else + { + // prepare for duplicate detection: the registry maps the + // att def to the fElemCount value of the start tag that last + // used it. No entry, or an older count, means first use on + // this element; an entry that is not older means a duplicate. + unsigned int *curCountPtr = fAttDefRegistry->get(attDef); + if(!curCountPtr) + { + curCountPtr = getNewUIntPtr(); + *curCountPtr = fElemCount; + fAttDefRegistry->put(attDef, curCountPtr); + } + else if(*curCountPtr < fElemCount) + *curCountPtr = fElemCount; + else + { + emitError + ( + XMLErrs::AttrAlreadyUsedInSTag + , attDef->getFullName() + , elemDecl->getFullName() + ); + } + } + + // Skip any whitespace before the value and then scan the att + // value. This will come back normalized with entity refs and + // char refs expanded. + fReaderMgr.skipPastSpaces(); + if (!scanAttValue(attDef, namePtr, fAttValueBuf)) + { + static const XMLCh tmpList[] = + { + chCloseAngle, chOpenAngle, chForwardSlash, chNull + }; + + emitError(XMLErrs::ExpectedAttrValue); + + // It failed, so lets try to get synced back up.
We skip + // forward until we find some whitespace or one of the + // chars in our list. + const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); + + if ((chFound == chCloseAngle) + || (chFound == chForwardSlash) + || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) + { + // Just fall through and process this attribute, though + // the value will be "". + } + else if (chFound == chOpenAngle) + { + // Assume a malformed tag and that new one is starting + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + return false; + } + else + { + // Something went really wrong + return false; + } + } + // must set the newly-minted value on the XMLAttr: + curAtt->setValue(fAttValueBuf.getRawBuffer()); + + // Now that its all stretched out, lets look at its type and + // determine if it has a valid value. It will output any needed + // errors, but we just keep going. We only need to do this if + // we are validating. + if (attDef) + { + // Let the validator pass judgement on the attribute value + if (fValidate) + { + fValidator->validateAttrValue + ( + attDef + , fAttValueBuf.getRawBuffer() + , false + , elemDecl + ); + } + } + + attCount++; + // And jump back to the top of the loop + continue; + } + + // It was some special case character so do all of the checks and + // deal with it. A zero character is reported as unexpected end of + // input. + if (!nextCh) + ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); + + if (nextCh == chForwardSlash) + { + fReaderMgr.getNextChar(); + isEmpty = true; + if (!fReaderMgr.skippedChar(chCloseAngle)) + emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + break; + } + else if (nextCh == chCloseAngle) + { + fReaderMgr.getNextChar(); + break; + } + else if (nextCh == chOpenAngle) + { + // Check for this one specially, since its going to be common + // and it is kind of auto-recovering since we've already hit the + // next open bracket, which is what we would have seeked to (and + // skipped this whole tag.)
+ emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); + break; + } + else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) + { + // Check for this one specially, which is probably a missing + // attribute name, e.g. ="value". Just issue expected name + // error and eat the quoted string, then jump back to the + // top again. + emitError(XMLErrs::ExpectedAttrName); + fReaderMgr.getNextChar(); + fReaderMgr.skipQuotedString(nextCh); + fReaderMgr.skipPastSpaces(); + continue; + } + } + + if(attCount) + { + // clean up after ourselves: + // clear the map used to detect duplicate attributes + fUndeclaredAttrRegistry->removeAll(); + } + + // Ok, so lets get an enumerator for the attributes of this element + // and run through them for well formedness and validity checks. But + // make sure that we had any attributes before we do it, since the list + // would have gotten faulted in anyway. + if (elemDecl->hasAttDefs()) + { + // N.B.: this assumes DTD validation. + // NOTE(review): the loop header below is truncated in this copy; the + // code after it uses curDef, defType and attCountPtr, none of which + // is declared in the text that remains -- restore the loop header and + // those declarations from the upstream file. + XMLAttDefList& attDefList = elemDecl->getAttDefList(); + for(XMLSize_t i=0; iget(&curDef); + if (!attCountPtr || *attCountPtr < fElemCount) + { // did not occur + if (fValidate) + { + // If we are validating and its required, then an error + if (defType == XMLAttDef::Required) + { + fValidator->emitError + ( + XMLValid::RequiredAttrNotProvided + , curDef.getFullName() + ); + } + else if ((defType == XMLAttDef::Default) || + (defType == XMLAttDef::Fixed) ) + { + if (fStandalone && curDef.isExternal()) + { + // XML 1.0 Section 2.9 + // Document is standalone, so attributes must not be defaulted. 
+ fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName()); + + } + } + } + + // Fault in the value if needed, and bump the att count. The + // defaulted attribute is stored with its specified flag false. + if ((defType == XMLAttDef::Default) + || (defType == XMLAttDef::Fixed)) + { + // Let the validator pass judgement on the attribute value + if (fValidate) + { + fValidator->validateAttrValue + ( + &curDef + , curDef.getValue() + , false + , elemDecl + ); + } + + XMLAttr* curAtt; + if (attCount >= curAttListSize) + { + curAtt = new (fMemoryManager) XMLAttr + ( + 0 + , curDef.getFullName() + , XMLUni::fgZeroLenString + , curDef.getValue() + , curDef.getType() + , false + , fMemoryManager + ); + fAttrList->addElement(curAtt); + curAttListSize++; + } + else + { + curAtt = fAttrList->elementAt(attCount); + curAtt->set + ( + 0 + , curDef.getFullName() + , XMLUni::fgZeroLenString + , curDef.getValue() + , curDef.getType() + ); + curAtt->setSpecified(false); + } + attCount++; + } + } + } + } + + // If empty, validate content right now if we are validating and then + // pop the element stack top. Nothing further is done here for a + // non-empty element. + if (isEmpty) + { + // If validating, then ensure that it's legal to have no content + if (fValidate) + { + XMLSize_t failure; + bool res = fValidator->checkContent(elemDecl, 0, 0, &failure); + if (!res) + { + fValidator->emitError + ( + XMLValid::ElementNotValidForContent + , elemDecl->getFullName() + , elemDecl->getFormattedContentModel() + ); + } + } + + // Pop the element stack back off since it'll never be used now + fElemStack.popTop(); + + // If the elem stack is empty, then it was an empty root + if (isRoot) + gotData = false; + else { + // Restore the validation flag + fValidate = fElemStack.getValidationFlag(); + } + } + + // If we have a document handler, then tell it about this start tag. We + // don't have any URI id to send along, so send fEmptyNamespaceId.
We also do not send + // any prefix since its just one big name if we are not doing namespaces. + if (fDocHandler) + { + fDocHandler->startElement + ( + *elemDecl + , fEmptyNamespaceId + , 0 + , *fAttrList + , attCount + , isEmpty + , isRoot + ); + } + + return true; +} + + +// This method is called to scan a start tag when we are processing +// namespaces. There are two different versions of this method, one for +// namespace aware processing and one for non-namespace aware processing. +// +// This method is called after we've scanned the < of a start tag. So we +// have to get the element name, then scan the attributes, after which +// we are either going to see >, />, or attributes followed by one of those +// sequences. +bool IGXMLScanner::scanStartTagNS(bool& gotData) +{ + // Assume we will still have data until proven otherwise. It will only + // ever be false if this is the root and its empty. + gotData = true; + + // Reset element content buffer + fContent.reset(); + + // The current position is after the open bracket, so we need to read in + // in the element name. + int prefixColonPos; + if (!fReaderMgr.getQName(fQNameBuf, &prefixColonPos)) + { + if (fQNameBuf.isEmpty()) + emitError(XMLErrs::ExpectedElementName); + else + emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer()); + fReaderMgr.skipToChar(chOpenAngle); + return false; + } + + // See if its the root element + const bool isRoot = fElemStack.isEmpty(); + + // Skip any whitespace after the name + fReaderMgr.skipPastSpaces(); + + // First we have to do the rawest attribute scan. We don't do any + // normalization of them at all, since we don't know yet what type they + // might be (since we need the element decl in order to do that.)
+ bool isEmpty; + XMLSize_t attCount = rawAttrScan + ( + fQNameBuf.getRawBuffer() + , *fRawAttrList + , isEmpty + ); + + // save the contentleafname and currentscope before addlevel, for later use + ContentLeafNameTypeVector* cv = 0; + XMLContentModel* cm = 0; + unsigned int currentScope = Grammar::TOP_LEVEL_SCOPE; + bool laxThisOne = false; + + if (!isRoot && fGrammarType == Grammar::SchemaGrammarType) + { + // schema validator will have correct type if validating + SchemaElementDecl* tempElement = (SchemaElementDecl*) + fElemStack.topElement()->fThisElement; + SchemaElementDecl::ModelTypes modelType = tempElement->getModelType(); + ComplexTypeInfo *currType = 0; + + if (fValidate) + { + currType = ((SchemaValidator*)fValidator)->getCurrentTypeInfo(); + if (currType) + modelType = (SchemaElementDecl::ModelTypes)currType->getContentType(); + else // something must have gone wrong + modelType = SchemaElementDecl::Any; + } + else + { + currType = tempElement->getComplexTypeInfo(); + } + + if ((modelType == SchemaElementDecl::Mixed_Simple) + || (modelType == SchemaElementDecl::Mixed_Complex) + || (modelType == SchemaElementDecl::Children)) + { + cm = currType->getContentModel(); + cv = cm->getContentLeafNameTypeVector(); + currentScope = fElemStack.getCurrentScope(); + } + else if (modelType == SchemaElementDecl::Any) { + laxThisOne = true; + } + } + + // Now, since we might have to update the namespace map for this element, + // but we don't have the element decl yet, we just tell the element stack + // to expand up to get ready. 
+ XMLSize_t elemDepth = fElemStack.addLevel(); + fElemStack.setValidationFlag(fValidate); + fElemStack.setPrefixColonPos(prefixColonPos); + + // Check if there is any external schema location specified, and if we are at root, + // go through them first before scanning those specified in the instance document + if (isRoot && fDoSchema + && (fExternalSchemaLocation || fExternalNoNamespaceSchemaLocation)) { + + if (fExternalSchemaLocation) + parseSchemaLocation(fExternalSchemaLocation, true); + if (fExternalNoNamespaceSchemaLocation) + resolveSchemaGrammar(fExternalNoNamespaceSchemaLocation, XMLUni::fgZeroLenString, true); + } + + // Make an initial pass through the list and find any xmlns attributes or + // schema attributes. + if (attCount) { + scanRawAttrListforNameSpaces(attCount); + } + + // Also find any default or fixed xmlns attributes in DTD defined for + // this element. + XMLElementDecl* elemDecl = 0; + const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer(); + + if (fGrammarType == Grammar::DTDGrammarType) { + + if (!fSkipDTDValidation) { + elemDecl = fGrammar->getElemDecl( + fEmptyNamespaceId, 0, qnameRawBuf, Grammar::TOP_LEVEL_SCOPE + ); + + if (elemDecl) { + if (elemDecl->hasAttDefs()) { + XMLAttDefList& attDefList = elemDecl->getAttDefList(); + for(XMLSize_t i=0; igetByKey(qnameRawBuf); + } + } + + // Resolve the qualified name to a URI and name so that we can look up + // the element decl for this element. We have now update the prefix to + // namespace map so we should get the correct element now. + unsigned int uriId = resolveQNameWithColon( + qnameRawBuf, fPrefixBuf, ElemStack::Mode_Element, prefixColonPos + ); + + //if schema, check if we should lax or skip the validation of this element + bool parentValidation = fValidate; + if (cv) { + QName element(fPrefixBuf.getRawBuffer(), &qnameRawBuf[prefixColonPos + 1], uriId, fMemoryManager); + // elementDepth will be > 0, as cv is only constructed if element is not + // root. 
+ laxThisOne = laxElementValidation(&element, cv, cm, elemDepth - 1); + } + + // Look up the element now in the grammar. This will get us back a + // generic element decl object. We tell him to fault one in if he does + // not find it. + bool wasAdded = false; + const XMLCh* nameRawBuf = &qnameRawBuf[prefixColonPos + 1]; + + if (fDoSchema) { + + if (fGrammarType == Grammar::DTDGrammarType) { + if (!switchGrammar(getURIText(uriId))) { + fValidator->emitError( + XMLValid::GrammarNotFound, getURIText(uriId) + ); + } + } + + if (fGrammarType == Grammar::SchemaGrammarType) { + elemDecl = fGrammar->getElemDecl( + uriId, nameRawBuf, qnameRawBuf, currentScope + ); + + // if not found, then it may be a reference, try TOP_LEVEL_SCOPE + if (!elemDecl) { + bool checkTopLevel = (currentScope != Grammar::TOP_LEVEL_SCOPE); + const XMLCh* original_uriStr = fGrammar->getTargetNamespace(); + unsigned int orgGrammarUri = fURIStringPool->getId(original_uriStr); + + if (orgGrammarUri != uriId) { + if (switchGrammar(getURIText(uriId))) { + checkTopLevel = true; + } + else { + // the laxElementValidation routine (called above) will + // set fValidate to false for a "skipped" element + if (!laxThisOne && fValidate) { + fValidator->emitError( + XMLValid::GrammarNotFound, getURIText(uriId) + ); + } + checkTopLevel = false; + } + } + + if (checkTopLevel) { + elemDecl = fGrammar->getElemDecl( + uriId, nameRawBuf, qnameRawBuf, Grammar::TOP_LEVEL_SCOPE + ); + } + + if (!elemDecl && currentScope != Grammar::TOP_LEVEL_SCOPE) { + + if (orgGrammarUri == uriId) { + // still not found in specified uri + // try emptyNamespace see if element should be + // un-qualified. 
+ // Use a temp variable until we decide this is the case + if (uriId != fEmptyNamespaceId) { + XMLElementDecl* tempElemDecl = fGrammar->getElemDecl( + fEmptyNamespaceId, nameRawBuf, qnameRawBuf, currentScope + ); + + if (tempElemDecl && tempElemDecl->getCreateReason() != XMLElementDecl::JustFaultIn && fValidate) { + fValidator->emitError( + XMLValid::ElementNotUnQualified, qnameRawBuf + ); + elemDecl = tempElemDecl; + } + } + } + // still Not found in specified uri + // go to original Grammar again to see if element needs + // to be fully qualified. + // Use a temp variable until we decide this is the case + else if (uriId == fEmptyNamespaceId) { + + if (switchGrammar(original_uriStr)) { + XMLElementDecl* tempElemDecl = fGrammar->getElemDecl( + orgGrammarUri, nameRawBuf, qnameRawBuf, currentScope + ); + if (tempElemDecl && tempElemDecl->getCreateReason() != XMLElementDecl::JustFaultIn && fValidate) { + fValidator->emitError( + XMLValid::ElementNotQualified, qnameRawBuf + ); + elemDecl = tempElemDecl; + } + } + else if (!laxThisOne && fValidate) { + fValidator->emitError( + XMLValid::GrammarNotFound,original_uriStr + ); + } + } + } + + if (!elemDecl) { + // still not found + // switch back to original grammar first if necessary + if (orgGrammarUri != uriId) { + switchGrammar(original_uriStr); + } + + // look in the list of undeclared elements, as would have been + // done before we made grammars stateless: + elemDecl = fSchemaElemNonDeclPool->getByKey( + nameRawBuf, uriId, (int)Grammar::TOP_LEVEL_SCOPE + ); + } + } + } + } + + if (!elemDecl) { + + if (fGrammarType == Grammar::DTDGrammarType) { + elemDecl = new (fMemoryManager) DTDElementDecl( + qnameRawBuf, uriId, DTDElementDecl::Any, fMemoryManager + ); + elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl)); + } + else if (fGrammarType == Grammar::SchemaGrammarType) { + elemDecl = new (fMemoryManager) SchemaElementDecl( + fPrefixBuf.getRawBuffer(), nameRawBuf, uriId + , SchemaElementDecl::Any, 
Grammar::TOP_LEVEL_SCOPE + , fMemoryManager + ); + elemDecl->setId( + fSchemaElemNonDeclPool->put((void*)elemDecl->getBaseName() + , uriId, (int)Grammar::TOP_LEVEL_SCOPE, (SchemaElementDecl*)elemDecl) + ); + } else { + fValidator->emitError( + XMLValid::GrammarNotFound, getURIText(uriId) + ); + } + wasAdded = true; + } + + // this info needed for DOMTypeInfo + fPSVIElemContext.fErrorOccurred = false; + + // We do something different here according to whether we found the + // element or not. + bool bXsiTypeSet= (fValidator && fGrammarType == Grammar::SchemaGrammarType)?((SchemaValidator*)fValidator)->getIsXsiTypeSet():false; + if (wasAdded) + { + if (laxThisOne && !bXsiTypeSet) { + fValidate = false; + fElemStack.setValidationFlag(fValidate); + } + else if (fValidate) + { + // If validating then emit an error + + // This is to tell the reuse Validator that this element was + // faulted-in, was not an element in the grammar pool originally + elemDecl->setCreateReason(XMLElementDecl::JustFaultIn); + + // xsi:type was specified, don't complain about missing definition + if(!bXsiTypeSet) + { + fValidator->emitError + ( + XMLValid::ElementNotDefined + , elemDecl->getFullName() + ); + + if(fGrammarType == Grammar::SchemaGrammarType) + { + fPSVIElemContext.fErrorOccurred = true; + } + } + } + } + else + { + // If its not marked declared and validating, then emit an error + if (!elemDecl->isDeclared()) { + if(elemDecl->getCreateReason() == XMLElementDecl::NoReason) { + if(!bXsiTypeSet && fGrammarType == Grammar::SchemaGrammarType) { + fPSVIElemContext.fErrorOccurred = true; + } + } + + if (laxThisOne) { + fValidate = false; + fElemStack.setValidationFlag(fValidate); + } + else if (fValidate && !bXsiTypeSet) + { + fValidator->emitError + ( + XMLValid::ElementNotDefined + , elemDecl->getFullName() + ); + } + } + } + + // Now we can update the element stack to set the current element + // decl. 
We expanded the stack above, but couldn't store the element + // decl because we didn't know it yet. + fElemStack.setElement(elemDecl, fReaderMgr.getCurrentReaderNum()); + fElemStack.setCurrentURI(uriId); + + if (isRoot) + { + fRootGrammar = fGrammar; + if (fGrammarType == Grammar::SchemaGrammarType && !fRootElemName) + fRootElemName = XMLString::replicate(qnameRawBuf, fMemoryManager); + } + + if (fGrammarType == Grammar::SchemaGrammarType && fPSVIHandler) + { + + fPSVIElemContext.fElemDepth++; + if (elemDecl->isDeclared()) + { + fPSVIElemContext.fNoneValidationDepth = fPSVIElemContext.fElemDepth; + } + else + { + fPSVIElemContext.fFullValidationDepth = fPSVIElemContext.fElemDepth; + + /****** + * While we report an error for historical reasons, this should + * actually result in lax assessment - NG. + if (isRoot && fValidate) + fPSVIElemContext.fErrorOccurred = true; + *****/ + } + } + + // Validate the element + if (fValidate) + { + fValidator->validateElement(elemDecl); + if (fValidator->handlesSchema()) + { + if (((SchemaValidator*) fValidator)->getErrorOccurred()) + fPSVIElemContext.fErrorOccurred = true; + } + } + + if (fGrammarType == Grammar::SchemaGrammarType) { + + // squirrel away the element's QName, so that we can do an efficient + // end-tag match + fElemStack.setCurrentSchemaElemName(fQNameBuf.getRawBuffer()); + + ComplexTypeInfo* typeinfo = (fValidate) + ? 
((SchemaValidator*)fValidator)->getCurrentTypeInfo() + : ((SchemaElementDecl*) elemDecl)->getComplexTypeInfo(); + + if (typeinfo) { + currentScope = typeinfo->getScopeDefined(); + + // switch grammar if the typeinfo has a different grammar (happens when there is xsi:type) + XMLCh* typeName = typeinfo->getTypeName(); + const int comma = XMLString::indexOf(typeName, chComma); + if (comma > 0) { + XMLBuffer prefixBuf(comma+1, fMemoryManager); + prefixBuf.append(typeName, comma); + const XMLCh* uriStr = prefixBuf.getRawBuffer(); + + bool errorCondition = !switchGrammar(uriStr) && fValidate; + if (errorCondition && !laxThisOne) + { + fValidator->emitError + ( + XMLValid::GrammarNotFound + , prefixBuf.getRawBuffer() + ); + } + } + else if (comma == 0) { + bool errorCondition = !switchGrammar(XMLUni::fgZeroLenString) && fValidate; + if (errorCondition && !laxThisOne) + { + fValidator->emitError + ( + XMLValid::GrammarNotFound + , XMLUni::fgZeroLenString + ); + } + } + } + fElemStack.setCurrentScope(currentScope); + + // Set element next state + if (elemDepth >= fElemStateSize) { + resizeElemState(); + } + + fElemState[elemDepth] = 0; + fElemLoopState[elemDepth] = 0; + } + + fElemStack.setCurrentGrammar(fGrammar); + + // If this is the first element and we are validating, check the root + // element. + if (isRoot) + { + if (fValidate) + { + // If a DocType exists, then check if it matches the root name there. + if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName)) + fValidator->emitError(XMLValid::RootElemNotLikeDocType); + } + } + else if (parentValidation) + { + // If the element stack is not empty, then add this element as a + // child of the previous top element. If its empty, this is the root + // elem and is not the child of anything. + fElemStack.addChild(elemDecl->getElementName(), true); + } + + // PSVI handling: even if it turns out there are + // no attributes, we need to reset this list... 
+ if(getPSVIHandler() && fGrammarType == Grammar::SchemaGrammarType ) + fPSVIAttrList->reset(); + + // Now lets get the fAttrList filled in. This involves faulting in any + // defaulted and fixed attributes and normalizing the values of any that + // we got explicitly. + // + // We update the attCount value with the total number of attributes, but + // it goes in with the number of values we got during the raw scan of + // explictly provided attrs above. + attCount = buildAttList(*fRawAttrList, attCount, elemDecl, *fAttrList); + if(attCount) + { + // clean up after ourselves: + // clear the map used to detect duplicate attributes + fUndeclaredAttrRegistry->removeAll(); + } + + // activate identity constraints + if (fGrammar && + fGrammarType == Grammar::SchemaGrammarType && + toCheckIdentityConstraint()) + { + fICHandler->activateIdentityConstraint + ( + (SchemaElementDecl*) elemDecl + , (int) elemDepth + , uriId + , fPrefixBuf.getRawBuffer() + , *fAttrList + , attCount + , fValidationContext + ); + } + + // Since the element may have default values, call start tag now regardless if it is empty or not + // If we have a document handler, then tell it about this start tag + if (fDocHandler) + { + fDocHandler->startElement + ( + *elemDecl + , uriId + , fPrefixBuf.getRawBuffer() + , *fAttrList + , attCount + , false + , isRoot + ); + } + + // if we have a PSVIHandler, now's the time to call + // its handleAttributesPSVI method: + if(fPSVIHandler && fGrammarType == Grammar::SchemaGrammarType) + { + QName *eName = elemDecl->getElementName(); + fPSVIHandler->handleAttributesPSVI + ( + eName->getLocalPart() + , fURIStringPool->getValueForId(eName->getURI()) + , fPSVIAttrList + ); + } + + // If empty, validate content right now if we are validating and then + // pop the element stack top. Else, we have to update the current stack + // top's namespace mapping elements. 
+ if (isEmpty) + { + // Pop the element stack back off since it'll never be used now + fElemStack.popTop(); + + // reset current type info + DatatypeValidator* psviMemberType = 0; + if (fGrammarType == Grammar::SchemaGrammarType) + { + if (fValidate && elemDecl->isDeclared()) + { + fPSVIElemContext.fCurrentTypeInfo = ((SchemaValidator*) fValidator)->getCurrentTypeInfo(); + if(!fPSVIElemContext.fCurrentTypeInfo) + fPSVIElemContext.fCurrentDV = ((SchemaValidator*) fValidator)->getCurrentDatatypeValidator(); + else + fPSVIElemContext.fCurrentDV = 0; + if(fPSVIHandler) + { + fPSVIElemContext.fNormalizedValue = ((SchemaValidator*) fValidator)->getNormalizedValue(); + + if (XMLString::equals(fPSVIElemContext.fNormalizedValue, XMLUni::fgZeroLenString)) + fPSVIElemContext.fNormalizedValue = 0; + } + } + else + { + fPSVIElemContext.fCurrentDV = 0; + fPSVIElemContext.fCurrentTypeInfo = 0; + fPSVIElemContext.fNormalizedValue = 0; + } + } + + // If validating, then insure that its legal to have no content + if (fValidate) + { + XMLSize_t failure; + bool res = fValidator->checkContent(elemDecl, 0, 0, &failure); + if (!res) + { + fValidator->emitError + ( + XMLValid::ElementNotValidForContent + , elemDecl->getFullName() + , elemDecl->getFormattedContentModel() + ); + } + + if (fGrammarType == Grammar::SchemaGrammarType) { + + if (((SchemaValidator*) fValidator)->getErrorOccurred()) + { + fPSVIElemContext.fErrorOccurred = true; + } + else + { + if (fPSVIHandler) + { + fPSVIElemContext.fIsSpecified = ((SchemaValidator*) fValidator)->getIsElemSpecified(); + if(fPSVIElemContext.fIsSpecified) + fPSVIElemContext.fNormalizedValue = ((SchemaElementDecl *)elemDecl)->getDefaultValue(); + } + // note that if we're empty, won't be a current DV + if (fPSVIElemContext.fCurrentDV && fPSVIElemContext.fCurrentDV->getType() == DatatypeValidator::Union) + psviMemberType = fValidationContext->getValidatingMemberType(); + } + + // call matchers and de-activate context + if 
(toCheckIdentityConstraint()) + { + fICHandler->deactivateContext + ( + (SchemaElementDecl *) elemDecl + , fContent.getRawBuffer() + , fValidationContext + , fPSVIElemContext.fCurrentDV + ); + } + + } + } + else if (fGrammarType == Grammar::SchemaGrammarType) { + ((SchemaValidator*)fValidator)->resetNillable(); + } + + if (fGrammarType == Grammar::SchemaGrammarType) + { + if (fPSVIHandler) + { + endElementPSVI((SchemaElementDecl*)elemDecl, psviMemberType); + } + } + + // If we have a doc handler, tell it about the end tag + if (fDocHandler) + { + fDocHandler->endElement + ( + *elemDecl + , uriId + , isRoot + , fPrefixBuf.getRawBuffer() + ); + } + + // If the elem stack is empty, then it was an empty root + if (isRoot) + gotData = false; + else + { + // Restore the grammar + fGrammar = fElemStack.getCurrentGrammar(); + fGrammarType = fGrammar->getGrammarType(); + if (fGrammarType == Grammar::SchemaGrammarType && !fValidator->handlesSchema()) { + if (fValidatorFromUser) + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoSchemaValidator, fMemoryManager); + else { + fValidator = fSchemaValidator; + } + } + else if (fGrammarType == Grammar::DTDGrammarType && !fValidator->handlesDTD()) { + if (fValidatorFromUser) + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); + else { + fValidator = fDTDValidator; + } + } + + fValidator->setGrammar(fGrammar); + + // Restore the validation flag + fValidate = fElemStack.getValidationFlag(); + } + } + else if (fGrammarType == Grammar::SchemaGrammarType) + { + // send a partial element psvi + if (fPSVIHandler) + { + + ComplexTypeInfo* curTypeInfo = 0; + DatatypeValidator* curDV = 0; + XSTypeDefinition* typeDef = 0; + + if (fValidate && elemDecl->isDeclared()) + { + curTypeInfo = ((SchemaValidator*) fValidator)->getCurrentTypeInfo(); + + if (curTypeInfo) + { + typeDef = (XSTypeDefinition*) fModel->getXSObject(curTypeInfo); + } + else + { + curDV = ((SchemaValidator*) 
fValidator)->getCurrentDatatypeValidator(); + + if (curDV) + { + typeDef = (XSTypeDefinition*) fModel->getXSObject(curDV); + } + } + } + + fPSVIElement->reset + ( + PSVIElement::VALIDITY_NOTKNOWN + , PSVIElement::VALIDATION_NONE + , fRootElemName + , ((SchemaValidator*) fValidator)->getIsElemSpecified() + , (elemDecl->isDeclared()) ? (XSElementDeclaration*) fModel->getXSObject(elemDecl) : 0 + , typeDef + , 0 //memberType + , fModel + , ((SchemaElementDecl*)elemDecl)->getDefaultValue() + , 0 + , 0 + , 0 + ); + + + fPSVIHandler->handlePartialElementPSVI + ( + elemDecl->getBaseName() + , fURIStringPool->getValueForId(elemDecl->getURI()) + , fPSVIElement + ); + + } + + // not empty + fErrorStack->push(fPSVIElemContext.fErrorOccurred); + } + + return true; +} + + +// --------------------------------------------------------------------------- +// IGXMLScanner: Helper methos +// --------------------------------------------------------------------------- +void IGXMLScanner::resizeElemState() { + + unsigned int newSize = fElemStateSize * 2; + unsigned int* newElemState = (unsigned int*) fMemoryManager->allocate + ( + newSize * sizeof(unsigned int) + ); //new unsigned int[newSize]; + unsigned int* newElemLoopState = (unsigned int*) fMemoryManager->allocate + ( + newSize * sizeof(unsigned int) + ); //new unsigned int[newSize]; + + // Copy the existing values + unsigned int index = 0; + for (; index < fElemStateSize; index++) + { + newElemState[index] = fElemState[index]; + newElemLoopState[index] = fElemLoopState[index]; + } + + for (; index < newSize; index++) + newElemLoopState[index] = newElemState[index] = 0; + + // Delete the old array and udpate our members + fMemoryManager->deallocate(fElemState); //delete [] fElemState; + fMemoryManager->deallocate(fElemLoopState); //delete [] fElemState; + fElemState = newElemState; + fElemLoopState = newElemLoopState; + fElemStateSize = newSize; +} + +void IGXMLScanner::resizeRawAttrColonList() { + + unsigned int newSize = 
fRawAttrColonListSize * 2; + int* newRawAttrColonList = (int*) fMemoryManager->allocate + ( + newSize * sizeof(int) + ); //new int[newSize]; + + // Copy the existing values + unsigned int index = 0; + for (; index < fRawAttrColonListSize; index++) + newRawAttrColonList[index] = fRawAttrColonList[index]; + + // Delete the old array and udpate our members + fMemoryManager->deallocate(fRawAttrColonList); //delete [] fRawAttrColonList; + fRawAttrColonList = newRawAttrColonList; + fRawAttrColonListSize = newSize; +} + +// --------------------------------------------------------------------------- +// IGXMLScanner: Grammar preparsing +// --------------------------------------------------------------------------- +Grammar* IGXMLScanner::loadGrammar(const InputSource& src + , const short grammarType + , const bool toCache) +{ + Grammar* loadedGrammar = 0; + + ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); + + try + { + fGrammarResolver->cacheGrammarFromParse(false); + // if the new grammar has to be cached, better use the already cached + // grammars, or the an exception will be thrown when caching an already + // cached grammar + fGrammarResolver->useCachedGrammarInParse(toCache); + fRootGrammar = 0; + + if (fValScheme == Val_Auto) { + fValidate = true; + } + + // Reset some status flags + fInException = false; + fStandalone = false; + fErrorCount = 0; + fHasNoDTD = true; + fSeeXsi = false; + + if (grammarType == Grammar::SchemaGrammarType) { + loadedGrammar = loadXMLSchemaGrammar(src, toCache); + } + else if (grammarType == Grammar::DTDGrammarType) { + loadedGrammar = loadDTDGrammar(src, toCache); + } + } + // NOTE: + // + // In all of the error processing below, the emitError() call MUST come + // before the flush of the reader mgr, or it will fail because it tries + // to find out the position in the XML source of the error. 
+ catch(const XMLErrs::Codes) + { + // This is a 'first fatal error' type exit, so fall through + } + catch(const XMLValid::Codes) + { + // This is a 'first fatal error' type exit, so fall through + } + catch(const XMLException& excToCatch) + { + // Emit the error and catch any user exception thrown from here. Make + // sure in all cases we flush the reader manager. + fInException = true; + try + { + if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) + emitError + ( + XMLErrs::XMLException_Warning + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) + emitError + ( + XMLErrs::XMLException_Fatal + , excToCatch.getCode() + , excToCatch.getMessage() + ); + else + emitError + ( + XMLErrs::XMLException_Error + , excToCatch.getCode() + , excToCatch.getMessage() + ); + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. + resetReaderMgr.release(); + + throw; + } + } + catch(const OutOfMemoryException&) + { + // This is a special case for out-of-memory + // conditions, because resetting the ReaderMgr + // can be problematic. 
+ resetReaderMgr.release(); + + throw; + } + + return loadedGrammar; +} + +void IGXMLScanner::resetCachedGrammar () +{ + fCachedSchemaInfoList->removeAll (); +} + +Grammar* IGXMLScanner::loadDTDGrammar(const InputSource& src, + const bool toCache) +{ + // Reset the validators + fDTDValidator->reset(); + if (fValidatorFromUser) + fValidator->reset(); + + if (!fValidator->handlesDTD()) { + if (fValidatorFromUser && fValidate) + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); + else { + fValidator = fDTDValidator; + } + } + + fDTDGrammar = (DTDGrammar*) fGrammarResolver->getGrammar(XMLUni::fgDTDEntityString); + + if (fDTDGrammar) { + fDTDGrammar->reset(); + } + else { + fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager); + fGrammarResolver->putGrammar(fDTDGrammar); + } + + fGrammar = fDTDGrammar; + fGrammarType = fGrammar->getGrammarType(); + fValidator->setGrammar(fGrammar); + + // And for all installed handlers, send reset events. This gives them + // a chance to flush any cached data. + if (fDocHandler) + fDocHandler->resetDocument(); + if (fEntityHandler) + fEntityHandler->resetEntities(); + if (fErrorReporter) + fErrorReporter->resetErrors(); + + // Clear out the id reference list + resetValidationContext(); + // and clear out the darned undeclared DTD element pool... + fDTDElemNonDeclPool->removeAll(); + + if (toCache) { + + unsigned int sysId = fGrammarResolver->getStringPool()->addOrFind(src.getSystemId()); + const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(sysId); + + fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); + ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); + fGrammarResolver->putGrammar(fGrammar); + } + + // Handle the creation of the XML reader object for this input source. + // This will provide us with transcoding and basic lexing services. 
+ XMLReader* newReader = fReaderMgr.createReader + ( + src + , false + , XMLReader::RefFrom_NonLiteral + , XMLReader::Type_General + , XMLReader::Source_External + , fCalculateSrcOfs + , fLowWaterMark + ); + if (!newReader) { + if (src.getIssueFatalErrorIfNotFound()) + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); + else + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); + } + + // In order to make the processing work consistently, we have to + // make this look like an external entity. So create an entity + // decl and fill it in and push it with the reader, as happens + // with an external entity. Put a janitor on it to insure it gets + // cleaned up. The reader manager does not adopt them. + const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; + DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); + declDTD->setSystemId(src.getSystemId()); + declDTD->setIsExternal(true); + + // Mark this one as a throw at end + newReader->setThrowAtEnd(true); + + // And push it onto the stack, with its pseudo name + fReaderMgr.pushReader(newReader, declDTD); + + // If we have a doc type handler and advanced callbacks are enabled, + // call the doctype event. 
+ if (fDocTypeHandler) { + + // Create a dummy root + DTDElementDecl* rootDecl = new (fGrammarPoolMemoryManager) DTDElementDecl + ( + gDTDStr + , fEmptyNamespaceId + , DTDElementDecl::Any + , fGrammarPoolMemoryManager + ); + rootDecl->setCreateReason(DTDElementDecl::AsRootElem); + rootDecl->setExternalElemDeclaration(true); + Janitor janSrc(rootDecl); + + fDocTypeHandler->doctypeDecl(*rootDecl, src.getPublicId(), src.getSystemId(), false, true); + } + + // Create DTDScanner + DTDScanner dtdScanner + ( + (DTDGrammar*) fGrammar + , fDocTypeHandler + , fGrammarPoolMemoryManager + , fMemoryManager + ); + dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); + + // Tell it its not in an include section + dtdScanner.scanExtSubsetDecl(false, true); + + if (fValidate) { + // validate the DTD scan so far + fValidator->preContentValidation(false, true); + } + + if (toCache) + fGrammarResolver->cacheGrammars(); + + return fDTDGrammar; +} + +// --------------------------------------------------------------------------- +// IGXMLScanner: Helper methods +// --------------------------------------------------------------------------- +void IGXMLScanner::processSchemaLocation(XMLCh* const schemaLoc) +{ + XMLCh* locStr = schemaLoc; + XMLReader* curReader = fReaderMgr.getCurrentReader(); + + fLocationPairs->removeAllElements(); + while (*locStr) + { + do { + // Do we have an escaped character ? + if (*locStr == 0xFFFF) + continue; + + if (!curReader->isWhitespace(*locStr)) + break; + + *locStr = chNull; + } while (*++locStr); + + if (*locStr) { + + fLocationPairs->addElement(locStr); + + while (*++locStr) { + // Do we have an escaped character ? 
+ if (*locStr == 0xFFFF) + continue; + if (curReader->isWhitespace(*locStr)) + break; + } + } + } +} + +void IGXMLScanner::endElementPSVI(SchemaElementDecl* const elemDecl, + DatatypeValidator* const memberDV) +{ + PSVIElement::ASSESSMENT_TYPE validationAttempted; + PSVIElement::VALIDITY_STATE validity = PSVIElement::VALIDITY_NOTKNOWN; + + if (fPSVIElemContext.fElemDepth > fPSVIElemContext.fFullValidationDepth) + validationAttempted = PSVIElement::VALIDATION_FULL; + else if (fPSVIElemContext.fElemDepth > fPSVIElemContext.fNoneValidationDepth) + validationAttempted = PSVIElement::VALIDATION_NONE; + else + { + validationAttempted = PSVIElement::VALIDATION_PARTIAL; + fPSVIElemContext.fFullValidationDepth = + fPSVIElemContext.fNoneValidationDepth = fPSVIElemContext.fElemDepth - 1; + } + + if (fValidate && elemDecl->isDeclared()) + { + validity = (fPSVIElemContext.fErrorOccurred) + ? PSVIElement::VALIDITY_INVALID : PSVIElement::VALIDITY_VALID; + } + + XSTypeDefinition* typeDef = 0; + bool isMixed = false; + if (fPSVIElemContext.fCurrentTypeInfo) + { + typeDef = (XSTypeDefinition*) fModel->getXSObject(fPSVIElemContext.fCurrentTypeInfo); + SchemaElementDecl::ModelTypes modelType = (SchemaElementDecl::ModelTypes)fPSVIElemContext.fCurrentTypeInfo->getContentType(); + isMixed = (modelType == SchemaElementDecl::Mixed_Simple + || modelType == SchemaElementDecl::Mixed_Complex); + } + else if (fPSVIElemContext.fCurrentDV) + typeDef = (XSTypeDefinition*) fModel->getXSObject(fPSVIElemContext.fCurrentDV); + + XMLCh* canonicalValue = 0; + if (fPSVIElemContext.fNormalizedValue && !isMixed && + validity == PSVIElement::VALIDITY_VALID) + { + if (memberDV) + canonicalValue = (XMLCh*) memberDV->getCanonicalRepresentation(fPSVIElemContext.fNormalizedValue, fMemoryManager); + else if (fPSVIElemContext.fCurrentDV) + canonicalValue = (XMLCh*) fPSVIElemContext.fCurrentDV->getCanonicalRepresentation(fPSVIElemContext.fNormalizedValue, fMemoryManager); + } + + fPSVIElement->reset + ( + validity 
+ , validationAttempted + , fRootElemName + , fPSVIElemContext.fIsSpecified + , (elemDecl->isDeclared()) + ? (XSElementDeclaration*) fModel->getXSObject(elemDecl) : 0 + , typeDef + , (memberDV) ? (XSSimpleTypeDefinition*) fModel->getXSObject(memberDV) : 0 + , fModel + , elemDecl->getDefaultValue() + , fPSVIElemContext.fNormalizedValue + , canonicalValue + ); + + fPSVIHandler->handleElementPSVI + ( + elemDecl->getBaseName() + , fURIStringPool->getValueForId(elemDecl->getURI()) + , fPSVIElement + ); + + // decrease element depth + fPSVIElemContext.fElemDepth--; + +} + +void IGXMLScanner::resetPSVIElemContext() +{ + fPSVIElemContext.fIsSpecified = false; + fPSVIElemContext.fErrorOccurred = false; + fPSVIElemContext.fElemDepth = -1; + fPSVIElemContext.fFullValidationDepth = -1; + fPSVIElemContext.fNoneValidationDepth = -1; + fPSVIElemContext.fCurrentDV = 0; + fPSVIElemContext.fCurrentTypeInfo = 0; + fPSVIElemContext.fNormalizedValue = 0; +} + +} From 0fb528c9c3a60a664cb8b36986999b60521a6008 Mon Sep 17 00:00:00 2001 From: johnjamesmccann <98098904+johnjamesmccann@users.noreply.github.com> Date: Fri, 21 Jan 2022 13:46:42 +0000 Subject: [PATCH 2/5] Delete DGXMLScanner.cpp --- DGXMLScanner.cpp | 3578 ---------------------------------------------- 1 file changed, 3578 deletions(-) delete mode 100644 DGXMLScanner.cpp diff --git a/DGXMLScanner.cpp b/DGXMLScanner.cpp deleted file mode 100644 index ae8076d5b..000000000 --- a/DGXMLScanner.cpp +++ /dev/null @@ -1,3578 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * $Id$ - */ - -// SPDX-FileCopyrightText: Portions Copyright 2021 Siemens -// Modified on 15-Jul-2021 by Siemens and/or its affiliates to fix CVE-2018-1311: Apache Xerces-C use-after-free vulnerability scanning external DTD. Copyright 2021 Siemens. - -// --------------------------------------------------------------------------- -// Includes -// --------------------------------------------------------------------------- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace XERCES_CPP_NAMESPACE { - - -typedef JanitorMemFunCall CleanupType; -typedef JanitorMemFunCall ReaderMgrResetType; - - -// --------------------------------------------------------------------------- -// DGXMLScanner: Constructors and Destructor -// --------------------------------------------------------------------------- -DGXMLScanner::DGXMLScanner(XMLValidator* const valToAdopt - , GrammarResolver* const grammarResolver - , MemoryManager* const manager) : - - XMLScanner(valToAdopt, grammarResolver, manager) - , fAttrNSList(0) - , fDTDValidator(0) - , fDTDGrammar(0) - , fDTDElemNonDeclPool(0) - , fElemCount(0) - , fAttDefRegistry(0) - , fUndeclaredAttrRegistry(0) -{ - CleanupType cleanup(this, &DGXMLScanner::cleanUp); - - try - { - commonInit(); - } - catch(const OutOfMemoryException&) - { - // Don't cleanup when out of memory, since executing the - // code can cause problems. 
- cleanup.release(); - - throw; - } - - cleanup.release(); -} - -DGXMLScanner::DGXMLScanner( XMLDocumentHandler* const docHandler - , DocTypeHandler* const docTypeHandler - , XMLEntityHandler* const entityHandler - , XMLErrorReporter* const errHandler - , XMLValidator* const valToAdopt - , GrammarResolver* const grammarResolver - , MemoryManager* const manager) : - - XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager) - , fAttrNSList(0) - , fDTDValidator(0) - , fDTDGrammar(0) - , fDTDElemNonDeclPool(0) - , fElemCount(0) - , fAttDefRegistry(0) - , fUndeclaredAttrRegistry(0) -{ - CleanupType cleanup(this, &DGXMLScanner::cleanUp); - - try - { - commonInit(); - } - catch(const OutOfMemoryException&) - { - // Don't cleanup when out of memory, since executing the - // code can cause problems. - cleanup.release(); - - throw; - } - - cleanup.release(); -} - -DGXMLScanner::~DGXMLScanner() -{ - cleanUp(); -} - -// --------------------------------------------------------------------------- -// XMLScanner: Getter methods -// --------------------------------------------------------------------------- -NameIdPool* DGXMLScanner::getEntityDeclPool() -{ - if(!fGrammar) - return 0; - return ((DTDGrammar*)fGrammar)->getEntityDeclPool(); -} - -const NameIdPool* DGXMLScanner::getEntityDeclPool() const -{ - if(!fGrammar) - return 0; - return ((DTDGrammar*)fGrammar)->getEntityDeclPool(); -} - -// --------------------------------------------------------------------------- -// DGXMLScanner: Main entry point to scan a document -// --------------------------------------------------------------------------- -void DGXMLScanner::scanDocument(const InputSource& src) -{ - // Bump up the sequence id for this parser instance. This will invalidate - // any previous progressive scan tokens. 
- fSequenceId++; - - ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); - - try - { - // Reset the scanner and its plugged in stuff for a new run. This - // resets all the data structures, creates the initial reader and - // pushes it on the stack, and sets up the base document path. - scanReset(src); - - // If we have a document handler, then call the start document - if (fDocHandler) - fDocHandler->startDocument(); - - // Scan the prolog part, which is everything before the root element - // including the DTD subsets. - scanProlog(); - - // If we got to the end of input, then its not a valid XML file. - // Else, go on to scan the content. - if (fReaderMgr.atEOF()) - { - emitError(XMLErrs::EmptyMainEntity); - } - else - { - // Scan content, and tell it its not an external entity - if (scanContent()) - { - // Do post-parse validation if required - if (fValidate) - { - // We handle ID reference semantics at this level since - // its required by XML 1.0. - checkIDRefs(); - - // Then allow the validator to do any extra stuff it wants -// fValidator->postParseValidation(); - } - - // That went ok, so scan for any miscellaneous stuff - if (!fReaderMgr.atEOF()) - scanMiscellaneous(); - } - } - - // If we have a document handler, then call the end document - if (fDocHandler) - fDocHandler->endDocument(); - } - // NOTE: - // - // In all of the error processing below, the emitError() call MUST come - // before the flush of the reader mgr, or it will fail because it tries - // to find out the position in the XML source of the error. - catch(const XMLErrs::Codes) - { - // This is a 'first failure' exception, so fall through - } - catch(const XMLValid::Codes) - { - // This is a 'first fatal error' type exit, so fall through - } - catch(const XMLException& excToCatch) - { - // Emit the error and catch any user exception thrown from here. Make - // sure in all cases we flush the reader manager. 
- fInException = true; - try - { - if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) - emitError - ( - XMLErrs::XMLException_Warning - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) - emitError - ( - XMLErrs::XMLException_Fatal - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else - emitError - ( - XMLErrs::XMLException_Error - , excToCatch.getCode() - , excToCatch.getMessage() - ); - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. - resetReaderMgr.release(); - - throw; - } - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. - resetReaderMgr.release(); - - throw; - } -} - - -bool DGXMLScanner::scanNext(XMLPScanToken& token) -{ - // Make sure this token is still legal - if (!isLegalToken(token)) - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager); - - // Find the next token and remember the reader id - XMLSize_t orgReader; - XMLTokens curToken; - - ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); - - bool retVal = true; - - try - { - while (true) - { - // We have to handle any end of entity exceptions that happen here. - // We could be at the end of X nested entities, each of which will - // generate an end of entity exception as we try to move forward. 
- try - { - curToken = senseNextToken(orgReader); - break; - } - catch(const EndOfEntityException& toCatch) - { - // Send an end of entity reference event - if (fDocHandler) - fDocHandler->endEntityReference(toCatch.getEntity()); - } - } - - if (curToken == Token_CharData) - { - scanCharData(fCDataBuf); - } - else if (curToken == Token_EOF) - { - if (!fElemStack.isEmpty()) - { - const ElemStack::StackElem* topElem = fElemStack.popTop(); - emitError - ( - XMLErrs::EndedWithTagsOnStack - , topElem->fThisElement->getFullName() - ); - } - - retVal = false; - } - else - { - // Its some sort of markup - bool gotData = true; - switch(curToken) - { - case Token_CData : - // Make sure we are within content - if (fElemStack.isEmpty()) - emitError(XMLErrs::CDATAOutsideOfContent); - scanCDSection(); - break; - - case Token_Comment : - scanComment(); - break; - - case Token_EndTag : - scanEndTag(gotData); - break; - - case Token_PI : - scanPI(); - break; - - case Token_StartTag : - if (fDoNamespaces) - scanStartTagNS(gotData); - else - scanStartTag(gotData); - break; - - default : - fReaderMgr.skipToChar(chOpenAngle); - break; - } - - if (orgReader != fReaderMgr.getCurrentReaderNum()) - emitError(XMLErrs::PartialMarkupInEntity); - - // If we hit the end, then do the miscellaneous part - if (!gotData) - { - // Do post-parse validation if required - if (fValidate) - { - // We handle ID reference semantics at this level since - // its required by XML 1.0. - checkIDRefs(); - - // Then allow the validator to do any extra stuff it wants -// fValidator->postParseValidation(); - } - - // That went ok, so scan for any miscellaneous stuff - scanMiscellaneous(); - - if (fDocHandler) - fDocHandler->endDocument(); - } - } - } - // NOTE: - // - // In all of the error processing below, the emitError() call MUST come - // before the flush of the reader mgr, or it will fail because it tries - // to find out the position in the XML source of the error. 
- catch(const XMLErrs::Codes) - { - // This is a 'first failure' exception, so return failure - retVal = false; - } - catch(const XMLValid::Codes) - { - // This is a 'first fatal error' type exit, so return failure - retVal = false; - } - catch(const XMLException& excToCatch) - { - // Emit the error and catch any user exception thrown from here. Make - // sure in all cases we flush the reader manager. - fInException = true; - try - { - if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) - emitError - ( - XMLErrs::XMLException_Warning - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) - emitError - ( - XMLErrs::XMLException_Fatal - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else - emitError - ( - XMLErrs::XMLException_Error - , excToCatch.getCode() - , excToCatch.getMessage() - ); - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. - resetReaderMgr.release(); - - throw; - } - - retVal = false; - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. - resetReaderMgr.release(); - - throw; - } - - // If we are not at the end, release the object that will - // reset the ReaderMgr. - if (retVal) - resetReaderMgr.release(); - - return retVal; -} - - -// --------------------------------------------------------------------------- -// DGXMLScanner: Private scanning methods -// --------------------------------------------------------------------------- - -// This method will kick off the scanning of the primary content of the -// document, i.e. the elements. -bool DGXMLScanner::scanContent() -{ - // Go into a loop until we hit the end of the root element, or we fall - // out because there is no root element. 
- // - // We have to do kind of a deeply nested double loop here in order to - // avoid doing the setup/teardown of the exception handler on each - // round. Doing it this way we only do it when an exception actually - // occurs. - bool gotData = true; - bool inMarkup = false; - while (gotData) - { - try - { - while (gotData) - { - // Sense what the next top level token is. According to what - // this tells us, we will call something to handle that kind - // of thing. - XMLSize_t orgReader; - const XMLTokens curToken = senseNextToken(orgReader); - - // Handle character data and end of file specially. Char data - // is not markup so we don't want to handle it in the loop - // below. - if (curToken == Token_CharData) - { - // Scan the character data and call appropriate events. Let - // him use our local character data buffer for efficiency. - scanCharData(fCDataBuf); - continue; - } - else if (curToken == Token_EOF) - { - // The element stack better be empty at this point or we - // ended prematurely before all elements were closed. - if (!fElemStack.isEmpty()) - { - const ElemStack::StackElem* topElem = fElemStack.popTop(); - emitError - ( - XMLErrs::EndedWithTagsOnStack - , topElem->fThisElement->getFullName() - ); - } - - // Its the end of file, so clear the got data flag - gotData = false; - continue; - } - - // We are in some sort of markup now - inMarkup = true; - - // According to the token we got, call the appropriate - // scanning method. 
- switch(curToken) - { - case Token_CData : - // Make sure we are within content - if (fElemStack.isEmpty()) - emitError(XMLErrs::CDATAOutsideOfContent); - scanCDSection(); - break; - - case Token_Comment : - scanComment(); - break; - - case Token_EndTag : - scanEndTag(gotData); - break; - - case Token_PI : - scanPI(); - break; - - case Token_StartTag : - if (fDoNamespaces) - scanStartTagNS(gotData); - else - scanStartTag(gotData); - break; - - default : - fReaderMgr.skipToChar(chOpenAngle); - break; - } - - if (orgReader != fReaderMgr.getCurrentReaderNum()) - emitError(XMLErrs::PartialMarkupInEntity); - - // And we are back out of markup again - inMarkup = false; - } - } - catch(const EndOfEntityException& toCatch) - { - // If we were in some markup when this happened, then its a - // partial markup error. - if (inMarkup) - emitError(XMLErrs::PartialMarkupInEntity); - - // Send an end of entity reference event - if (fDocHandler) - fDocHandler->endEntityReference(toCatch.getEntity()); - - inMarkup = false; - } - } - - // It went ok, so return success - return true; -} - - -void DGXMLScanner::scanEndTag(bool& gotData) -{ - // Assume we will still have data until proven otherwise. It will only - // ever be false if this is the end of the root element. - gotData = true; - - // Check if the element stack is empty. If so, then this is an unbalanced - // element (i.e. more ends than starts, perhaps because of bad text - // causing one to be skipped.) - if (fElemStack.isEmpty()) - { - emitError(XMLErrs::MoreEndThanStartTags); - fReaderMgr.skipPastChar(chCloseAngle); - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager); - } - - // Pop the stack of the element we are supposed to be ending. Remember - // that we don't own this. The stack just keeps them and reuses them. - unsigned int uriId = (fDoNamespaces) - ? fElemStack.getCurrentURI() : fEmptyNamespaceId; - - // Pop the stack of the element we are supposed to be ending. 
Remember - // that we don't own this. The stack just keeps them and reuses them. - const ElemStack::StackElem* topElem = fElemStack.popTop(); - XMLElementDecl *tempElement = topElem->fThisElement; - - // See if it was the root element, to avoid multiple calls below - const bool isRoot = fElemStack.isEmpty(); - - // Make sure that its the end of the element that we expect - if (!fReaderMgr.skippedStringLong(tempElement->getFullName())) - { - emitError - ( - XMLErrs::ExpectedEndOfTagX - , tempElement->getFullName() - ); - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Make sure we are back on the same reader as where we started - if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum()) - emitError(XMLErrs::PartialTagMarkupError); - - // Skip optional whitespace - fReaderMgr.skipPastSpaces(); - - // Make sure we find the closing bracket - if (!fReaderMgr.skippedChar(chCloseAngle)) - { - emitError - ( - XMLErrs::UnterminatedEndTag - , topElem->fThisElement->getFullName() - ); - } - - // If validation is enabled, then lets pass him the list of children and - // this element and let him validate it. - if (fValidate) - { - - // - // XML1.0-3rd - // Validity Constraint: - // The declaration matches EMPTY and the element has no content (not even - // entity references, comments, PIs or white space). - // - if ( (topElem->fCommentOrPISeen) && - (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Empty)) - { - fValidator->emitError - ( - XMLValid::EmptyElemHasContent - , topElem->fThisElement->getFullName() - ); - } - - // - // XML1.0-3rd - // Validity Constraint: - // - // The declaration matches children and the sequence of child elements - // belongs to the language generated by the regular expression in the - // content model, with optional white space, comments and PIs - // (i.e. 
markup matching production [27] Misc) between the start-tag and - // the first child element, between child elements, or between the last - // child element and the end-tag. - // - // Note that - // a CDATA section containing only white space or - // a reference to an entity whose replacement text is character references - // expanding to white space do not match the nonterminal S, and hence - // cannot appear in these positions; however, - // a reference to an internal entity with a literal value consisting - // of character references expanding to white space does match S, - // since its replacement text is the white space resulting from expansion - // of the character references. - // - if ( (topElem->fReferenceEscaped) && - (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Children)) - { - fValidator->emitError - ( - XMLValid::ElemChildrenHasInvalidWS - , topElem->fThisElement->getFullName() - ); - } - - XMLSize_t failure; - bool res = fValidator->checkContent - ( - topElem->fThisElement - , topElem->fChildren - , topElem->fChildCount - , &failure - ); - - if (!res) - { - // One of the elements is not valid for the content. NOTE that - // if no children were provided but the content model requires - // them, it comes back with a zero value. But we cannot use that - // to index the child array in this case, and have to put out a - // special message. 
- if (!topElem->fChildCount) - { - fValidator->emitError - ( - XMLValid::EmptyNotValidForContent - , topElem->fThisElement->getFormattedContentModel() - ); - } - else if (failure >= topElem->fChildCount) - { - fValidator->emitError - ( - XMLValid::NotEnoughElemsForCM - , topElem->fThisElement->getFormattedContentModel() - ); - } - else - { - fValidator->emitError - ( - XMLValid::ElementNotValidForContent - , topElem->fChildren[failure]->getRawName() - , topElem->fThisElement->getFormattedContentModel() - ); - } - } - } - - // If we have a doc handler, tell it about the end tag - if (fDocHandler) - { - fDocHandler->endElement - ( - *topElem->fThisElement - , uriId - , isRoot - , (fDoNamespaces) - ? topElem->fThisElement->getElementName()->getPrefix() - : XMLUni::fgZeroLenString - ); - } - - // If this was the root, then done with content - gotData = !isRoot; -} - - -// This method handles the high level logic of scanning the DOCType -// declaration. This calls the DTDScanner and kicks off both the scanning of -// the internal subset and the scanning of the external subset, if any. -// -// When we get here the 'resetDocType(); - - // There must be some space after DOCTYPE - bool skippedSomething; - fReaderMgr.skipPastSpaces(skippedSomething); - if (!skippedSomething) - { - emitError(XMLErrs::ExpectedWhitespace); - - // Just skip the Doctype declaration and return - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Get a buffer for the root element - XMLBufBid bbRootName(&fBufMgr); - - // Get a name from the input, which should be the name of the root - // element of the upcoming content. - int colonPosition; - bool validName = fDoNamespaces ? 
fReaderMgr.getQName(bbRootName.getBuffer(), &colonPosition) : - fReaderMgr.getName(bbRootName.getBuffer()); - if (!validName) - { - if (bbRootName.isEmpty()) - emitError(XMLErrs::NoRootElemInDOCTYPE); - else - emitError(XMLErrs::InvalidRootElemInDOCTYPE, bbRootName.getRawBuffer()); - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Store the root element name for later check - setRootElemName(bbRootName.getRawBuffer()); - - // This element obviously is not going to exist in the element decl - // pool yet, but we need to call docTypeDecl. So force it into - // the element decl pool, marked as being there because it was in - // the DOCTYPE. Later, when its declared, the status will be updated. - // - // Only do this if we are not reusing the validator! If we are reusing, - // then look it up instead. It has to exist! - MemoryManager* const rootDeclMgr = - fUseCachedGrammar ? fMemoryManager : fGrammarPoolMemoryManager; - - DTDElementDecl* rootDecl = new (rootDeclMgr) DTDElementDecl - ( - bbRootName.getRawBuffer() - , fEmptyNamespaceId - , DTDElementDecl::Any - , rootDeclMgr - ); - - Janitor rootDeclJanitor(rootDecl); - rootDecl->setCreateReason(DTDElementDecl::AsRootElem); - rootDecl->setExternalElemDeclaration(true); - if(!fUseCachedGrammar) - { - fGrammar->putElemDecl(rootDecl); - rootDeclJanitor.release(); - } else - { - // put this in the undeclared pool so it gets deleted... - XMLElementDecl* elemDecl = fDTDElemNonDeclPool->getByKey(bbRootName.getRawBuffer()); - if (elemDecl) - { - rootDecl->setId(elemDecl->getId()); - } - else - { - rootDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)rootDecl)); - rootDeclJanitor.release(); - } - } - - // Skip any spaces after the name - fReaderMgr.skipPastSpaces(); - - // And now if we are looking at a >, then we are done. It is not - // required to have an internal or external subset, though why you - // would not escapes me. 
- if (fReaderMgr.skippedChar(chCloseAngle)) { - - // If we have a doc type handler and advanced callbacks are enabled, - // call the doctype event. - if (fDocTypeHandler) - fDocTypeHandler->doctypeDecl(*rootDecl, 0, 0, false); - return; - } - - // either internal/external subset - if (fValScheme == Val_Auto && !fValidate) - fValidate = true; - - bool hasIntSubset = false; - bool hasExtSubset = false; - XMLCh* sysId = 0; - XMLCh* pubId = 0; - - DTDScanner dtdScanner - ( - (DTDGrammar*) fGrammar - , fDocTypeHandler - , fGrammarPoolMemoryManager - , fMemoryManager - ); - dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); - - // If the next character is '[' then we have no external subset cause - // there is no system id, just the opening character of the internal - // subset. Else, has to be an id. - // - // Just look at the next char, don't eat it. - if (fReaderMgr.peekNextChar() == chOpenSquare) - { - hasIntSubset = true; - } - else - { - // Indicate we have an external subset - hasExtSubset = true; - fHasNoDTD = false; - - // Get buffers for the ids - XMLBufBid bbPubId(&fBufMgr); - XMLBufBid bbSysId(&fBufMgr); - - // Get the external subset id - if (!dtdScanner.scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), DTDScanner::IDType_External)) - { - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Get copies of the ids we got - pubId = XMLString::replicate(bbPubId.getRawBuffer(), fMemoryManager); - sysId = XMLString::replicate(bbSysId.getRawBuffer(), fMemoryManager); - - // Skip spaces and check again for the opening of an internal subset - fReaderMgr.skipPastSpaces(); - - // Just look at the next char, don't eat it. 
- if (fReaderMgr.peekNextChar() == chOpenSquare) { - hasIntSubset = true; - } - } - - // Insure that the ids get cleaned up, if they got allocated - ArrayJanitor janSysId(sysId, fMemoryManager); - ArrayJanitor janPubId(pubId, fMemoryManager); - - // If we have a doc type handler and advanced callbacks are enabled, - // call the doctype event. - if (fDocTypeHandler) - fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset, hasExtSubset); - - // Ok, if we had an internal subset, we are just past the [ character - // and need to parse that first. - if (hasIntSubset) - { - // Eat the opening square bracket - fReaderMgr.getNextChar(); - - checkInternalDTD(hasExtSubset, sysId, pubId); - - // And try to scan the internal subset. If we fail, try to recover - // by skipping forward tot he close angle and returning. - if (!dtdScanner.scanInternalSubset()) - { - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Do a sanity check that some expanded PE did not propogate out of - // the doctype. This could happen if it was terminated early by bad - // syntax. - if (fReaderMgr.getReaderDepth() > 1) - { - emitError(XMLErrs::PEPropogated); - - // Ask the reader manager to pop back down to the main level - fReaderMgr.cleanStackBackTo(1); - } - - fReaderMgr.skipPastSpaces(); - } - - // And that should leave us at the closing > of the DOCTYPE line - if (!fReaderMgr.skippedChar(chCloseAngle)) - { - // Do a special check for the common scenario of an extra ] char at - // the end. This is easy to recover from. - if (fReaderMgr.skippedChar(chCloseSquare) - && fReaderMgr.skippedChar(chCloseAngle)) - { - emitError(XMLErrs::ExtraCloseSquare); - } - else - { - emitError(XMLErrs::UnterminatedDOCTYPE); - fReaderMgr.skipPastChar(chCloseAngle); - } - } - - // If we had an external subset, then we need to deal with that one - // next. If we are reusing the validator, then don't scan it. 
- if (hasExtSubset) { - - InputSource* srcUsed=0; - Janitor janSrc(srcUsed); - // If we had an internal subset and we're using the cached grammar, it - // means that the ignoreCachedDTD is set, so we ignore the cached - // grammar - if (fUseCachedGrammar && !hasIntSubset) - { - srcUsed = resolveSystemId(sysId, pubId); - if (srcUsed) { - janSrc.reset(srcUsed); - Grammar* grammar = fGrammarResolver->getGrammar(srcUsed->getSystemId()); - - if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) { - - fDTDGrammar = (DTDGrammar*) grammar; - fGrammar = fDTDGrammar; - fValidator->setGrammar(fGrammar); - // If we don't report at least the external subset boundaries, - // an advanced document handler cannot know when the DTD end, - // since we've already sent a doctype decl that indicates there's - // there's an external subset. - if (fDocTypeHandler) - { - fDocTypeHandler->startExtSubset(); - fDocTypeHandler->endExtSubset(); - } - - return; - } - } - } - - if (fLoadExternalDTD || fValidate) - { - // And now create a reader to read this entity - XMLReader* reader; - if(srcUsed) { - reader = fReaderMgr.createReader - ( - *srcUsed - , false - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , XMLReader::Source_External - , fCalculateSrcOfs - , fLowWaterMark - ); - } - else { - reader = fReaderMgr.createReader - ( - sysId - , pubId - , false - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , XMLReader::Source_External - , srcUsed - , fCalculateSrcOfs - , fLowWaterMark - , fDisableDefaultEntityResolution - ); - janSrc.reset(srcUsed); - } - // If it failed then throw an exception - if (!reader) - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenDTD, srcUsed ? 
srcUsed->getSystemId() : sysId, fMemoryManager); - - if (fToCacheGrammar) { - - unsigned int stringId = fGrammarResolver->getStringPool()->addOrFind(srcUsed->getSystemId()); - const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(stringId); - - fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); - ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); - fGrammarResolver->putGrammar(fGrammar); - } - - // In order to make the processing work consistently, we have to - // make this look like an external entity. So create an entity - // decl and fill it in and push it with the reader, as happens - // with an external entity. Put a janitor on it to insure it gets - // cleaned up. The reader manager does not adopt them. - const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; - DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); - declDTD->setSystemId(sysId); - declDTD->setIsExternal(true); - - // Mark this one as a throw at end - reader->setThrowAtEnd(true); - - // And push it onto the stack, with its pseudo name - fReaderMgr.pushReader(reader, declDTD); - - // Tell it its not in an include section - dtdScanner.scanExtSubsetDecl(false, true); - } - } -} - -bool DGXMLScanner::scanStartTag(bool& gotData) -{ - // Assume we will still have data until proven otherwise. It will only - // ever be false if this is the root and its empty. - gotData = true; - - // Get the QName. In this case, we are not doing namespaces, so we just - // use it as is and don't have to break it into parts. 
- - bool validName = fReaderMgr.getName(fQNameBuf); - if (!validName) - { - if (fQNameBuf.isEmpty()) - emitError(XMLErrs::ExpectedElementName); - else - emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer()); - fReaderMgr.skipToChar(chOpenAngle); - return false; - } - - // Assume it won't be an empty tag - bool isEmpty = false; - - // See if its the root element - const bool isRoot = fElemStack.isEmpty(); - - // Lets try to look up the element in the validator's element decl pool - // We can pass bogus values for the URI id and the base name. We know that - // this can only be called if we are doing a DTD style validator and that - // he will only look at the QName. - // - // We *do not* tell him to fault in a decl if he does not find one - NG. - bool wasAdded = false; - const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer(); - - XMLElementDecl* elemDecl = fGrammar->getElemDecl - ( - fEmptyNamespaceId - , 0 - , qnameRawBuf - , Grammar::TOP_LEVEL_SCOPE - ); - // look in the undeclared pool: - if(!elemDecl) - { - elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf); - } - if(!elemDecl) - { - wasAdded = true; - elemDecl = new (fMemoryManager) DTDElementDecl - ( - qnameRawBuf - , fEmptyNamespaceId - , DTDElementDecl::Any - , fMemoryManager - ); - elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl)); - } - - if (fValidate) { - - if (wasAdded) - { - // This is to tell the reuse Validator that this element was - // faulted-in, was not an element in the validator pool originally - elemDecl->setCreateReason(XMLElementDecl::JustFaultIn); - - fValidator->emitError - ( - XMLValid::ElementNotDefined - , qnameRawBuf - ); - } - // If its not marked declared, then emit an error - else if (!elemDecl->isDeclared()) - { - fValidator->emitError - ( - XMLValid::ElementNotDefined - , qnameRawBuf - ); - } - - - fValidator->validateElement(elemDecl); - } - - // Expand the element stack and add the new element - fElemStack.addLevel(elemDecl, 
fReaderMgr.getCurrentReaderNum()); - - // If this is the first element and we are validating, check the root - // element. - if (isRoot) - { - fRootGrammar = fGrammar; - - if (fValidate) - { - // If a DocType exists, then check if it matches the root name there. - if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName)) - fValidator->emitError(XMLValid::RootElemNotLikeDocType); - } - } - else if (fValidate) - { - // If the element stack is not empty, then add this element as a - // child of the previous top element. If its empty, this is the root - // elem and is not the child of anything. - fElemStack.addChild(elemDecl->getElementName(), true); - } - - // Skip any whitespace after the name - fReaderMgr.skipPastSpaces(); - - // We loop until we either see a /> or >, handling attribute/value - // pairs until we get there. - XMLSize_t attCount = 0; - XMLSize_t curAttListSize = fAttrList->size(); - wasAdded = false; - - fElemCount++; - - while (true) - { - // And get the next non-space character - XMLCh nextCh = fReaderMgr.peekNextChar(); - - // If the next character is not a slash or closed angle bracket, - // then it must be whitespace, since whitespace is required - // between the end of the last attribute and the name of the next - // one. - if (attCount) - { - if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) - { - if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) - { - // Ok, skip by them and peek another char - fReaderMgr.skipPastSpaces(); - nextCh = fReaderMgr.peekNextChar(); - } - else - { - // Emit the error but keep on going - emitError(XMLErrs::ExpectedWhitespace); - } - } - } - - // Ok, here we first check for any of the special case characters. - // If its not one, then we do the normal case processing, which - // assumes that we've hit an attribute value, Otherwise, we do all - // the special case checks. 
- if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) - { - // Assume its going to be an attribute, so get a name from - // the input. - - validName = fReaderMgr.getName(fAttNameBuf); - if (!validName) - { - if (fAttNameBuf.isEmpty()) - emitError(XMLErrs::ExpectedAttrName); - else - emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer()); - fReaderMgr.skipPastChar(chCloseAngle); - return false; - } - - // And next must be an equal sign - if (!scanEq()) - { - static const XMLCh tmpList[] = - { - chSingleQuote, chDoubleQuote, chCloseAngle - , chOpenAngle, chForwardSlash, chNull - }; - - emitError(XMLErrs::ExpectedEqSign); - - // Try to sync back up by skipping forward until we either - // hit something meaningful. - const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); - - if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) - { - // Jump back to top for normal processing of these - continue; - } - else if ((chFound == chSingleQuote) - || (chFound == chDoubleQuote) - || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) - { - // Just fall through assuming that the value is to follow - } - else if (chFound == chOpenAngle) - { - // Assume a malformed tag and that new one is starting - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - return false; - } - else - { - // Something went really wrong - return false; - } - } - - // See if this attribute is declared for this element. If we are - // not validating of course it will not be at first, but we will - // fault it into the pool (to avoid lots of redundant errors.) - XMLCh * namePtr = fAttNameBuf.getRawBuffer(); - XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr); - - // Skip any whitespace before the value and then scan the att - // value. This will come back normalized with entity refs and - // char refs expanded. 
- fReaderMgr.skipPastSpaces(); - if (!scanAttValue(attDef, namePtr, fAttValueBuf)) - { - static const XMLCh tmpList[] = - { - chCloseAngle, chOpenAngle, chForwardSlash, chNull - }; - - emitError(XMLErrs::ExpectedAttrValue); - - // It failed, so lets try to get synced back up. We skip - // forward until we find some whitespace or one of the - // chars in our list. - const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); - - if ((chFound == chCloseAngle) - || (chFound == chForwardSlash) - || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) - { - // Just fall through and process this attribute, though - // the value will be "". - } - else if (chFound == chOpenAngle) - { - // Assume a malformed tag and that new one is starting - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - return false; - } - else - { - // Something went really wrong - return false; - } - } - - // Add this attribute to the attribute list that we use to - // pass them to the handler. We reuse its existing elements - // but expand it as required. - // Note that we want to this first since this will - // make a copy of the namePtr; we can then make use of - // that copy in the hashtable lookup that checks - // for duplicates. This will mean we may have to update - // the type of the XMLAttr later. 
- XMLAttr* curAtt; - const XMLCh* attrValue = fAttValueBuf.getRawBuffer(); - - if (attCount >= curAttListSize) { - curAtt = new (fMemoryManager) XMLAttr(fMemoryManager); - fAttrList->addElement(curAtt); - } - else { - curAtt = fAttrList->elementAt(attCount); - } - - curAtt->setSpecified(true); - - // NO NAMESPACE CODE - { - curAtt->set( - 0, namePtr, XMLUni::fgZeroLenString, XMLUni::fgZeroLenString - , (attDef)?attDef->getType():XMLAttDef::CData - ); - - // now need to prepare for duplicate detection - if (attDef) { - unsigned int *curCountPtr = fAttDefRegistry->get(attDef); - if (!curCountPtr) { - curCountPtr = getNewUIntPtr(); - *curCountPtr = fElemCount; - fAttDefRegistry->put(attDef, curCountPtr); - } - else if (*curCountPtr < fElemCount) { - *curCountPtr = fElemCount; - } - else { - emitError( - XMLErrs::AttrAlreadyUsedInSTag - , attDef->getFullName(), elemDecl->getFullName() - ); - } - } - else - { - // reset namePtr so it refers to newly-allocated memory - namePtr = (XMLCh *)curAtt->getQName(); - if (!fUndeclaredAttrRegistry->putIfNotPresent(namePtr, 0)) - { - emitError( - XMLErrs::AttrAlreadyUsedInSTag - , namePtr, elemDecl->getFullName() - ); - } - } - } - - if (fValidate) - { - if (attDef) { - // Let the validator pass judgement on the attribute value - fValidator->validateAttrValue( - attDef, fAttValueBuf.getRawBuffer(), false, elemDecl - ); - } - else - { - fValidator->emitError - ( - XMLValid::AttNotDefinedForElement - , fAttNameBuf.getRawBuffer(), qnameRawBuf - ); - } - } - - // must set the newly-minted value on the XMLAttr: - curAtt->setValue(attrValue); - attCount++; - - // And jump back to the top of the loop - continue; - } - - // It was some special case character so do all of the checks and - // deal with it. 
- if (!nextCh) - ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); - - if (nextCh == chForwardSlash) - { - fReaderMgr.getNextChar(); - isEmpty = true; - if (!fReaderMgr.skippedChar(chCloseAngle)) - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - break; - } - else if (nextCh == chCloseAngle) - { - fReaderMgr.getNextChar(); - break; - } - else if (nextCh == chOpenAngle) - { - // Check for this one specially, since its going to be common - // and it is kind of auto-recovering since we've already hit the - // next open bracket, which is what we would have seeked to (and - // skipped this whole tag.) - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - break; - } - else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) - { - // Check for this one specially, which is probably a missing - // attribute name, e.g. ="value". Just issue expected name - // error and eat the quoted string, then jump back to the - // top again. - emitError(XMLErrs::ExpectedAttrName); - fReaderMgr.getNextChar(); - fReaderMgr.skipQuotedString(nextCh); - fReaderMgr.skipPastSpaces(); - continue; - } - } - - if(attCount) - { - // clean up after ourselves: - // clear the map used to detect duplicate attributes - fUndeclaredAttrRegistry->removeAll(); - } - - // Now lets get the fAttrList filled in. This involves faulting in any - // defaulted and fixed attributes and normalizing the values of any that - // we got explicitly. - // - // We update the attCount value with the total number of attributes, but - // it goes in with the number of values we got during the raw scan of - // explictly provided attrs above. - attCount = buildAttList(attCount, elemDecl, *fAttrList); - - // If we have a document handler, then tell it about this start tag. We - // don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send - // any prefix since its just one big name if we are not doing namespaces. 
- unsigned int uriId = fEmptyNamespaceId; - if (fDocHandler) - { - fDocHandler->startElement - ( - *elemDecl - , uriId - , 0 - , *fAttrList - , attCount - , isEmpty - , isRoot - ); - } - - // If empty, validate content right now if we are validating and then - // pop the element stack top. Else, we have to update the current stack - // top's namespace mapping elements. - if (isEmpty) - { - // If validating, then insure that its legal to have no content - if (fValidate) - { - XMLSize_t failure; - bool res = fValidator->checkContent(elemDecl, 0, 0, &failure); - if (!res) - { - fValidator->emitError - ( - XMLValid::ElementNotValidForContent - , qnameRawBuf - , elemDecl->getFormattedContentModel() - ); - } - } - - // Pop the element stack back off since it'll never be used now - fElemStack.popTop(); - - // If the elem stack is empty, then it was an empty root - if (isRoot) - gotData = false; - } - - return true; -} - - -bool DGXMLScanner::scanStartTagNS(bool& gotData) -{ - // Assume we will still have data until proven otherwise. It will only - // ever be false if this is the root and its empty. - gotData = true; - - // Get the QName. In this case, we are not doing namespaces, so we just - // use it as is and don't have to break it into parts. - - int colonPosition; - bool validName = fReaderMgr.getQName(fQNameBuf, &colonPosition); - if (!validName) - { - if (fQNameBuf.isEmpty()) - emitError(XMLErrs::ExpectedElementName); - else - emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer()); - fReaderMgr.skipToChar(chOpenAngle); - return false; - } - - // Assume it won't be an empty tag - bool isEmpty = false; - - // See if its the root element - const bool isRoot = fElemStack.isEmpty(); - - // Lets try to look up the element in the validator's element decl pool - // We can pass bogus values for the URI id and the base name. We know that - // this can only be called if we are doing a DTD style validator and that - // he will only look at the QName. 
- // - // We *do not* tell him to fault in a decl if he does not find one - NG. - bool wasAdded = false; - const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer(); - - XMLElementDecl* elemDecl = fGrammar->getElemDecl - ( - fEmptyNamespaceId - , 0 - , qnameRawBuf - , Grammar::TOP_LEVEL_SCOPE - ); - // look in the undeclared pool: - if(!elemDecl) - { - elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf); - } - if(!elemDecl) - { - wasAdded = true; - elemDecl = new (fMemoryManager) DTDElementDecl - ( - qnameRawBuf - , fEmptyNamespaceId - , DTDElementDecl::Any - , fMemoryManager - ); - elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl)); - } - - if (fValidate) { - - if (wasAdded) - { - // This is to tell the reuse Validator that this element was - // faulted-in, was not an element in the validator pool originally - elemDecl->setCreateReason(XMLElementDecl::JustFaultIn); - - fValidator->emitError - ( - XMLValid::ElementNotDefined - , qnameRawBuf - ); - } - // If its not marked declared, then emit an error - else if (!elemDecl->isDeclared()) - { - fValidator->emitError - ( - XMLValid::ElementNotDefined - , qnameRawBuf - ); - } - - - fValidator->validateElement(elemDecl); - } - - // Expand the element stack and add the new element - fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum()); - - // If this is the first element and we are validating, check the root - // element. - if (isRoot) - { - fRootGrammar = fGrammar; - - if (fValidate) - { - // If a DocType exists, then check if it matches the root name there. - if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName)) - fValidator->emitError(XMLValid::RootElemNotLikeDocType); - } - } - else if (fValidate) - { - // If the element stack is not empty, then add this element as a - // child of the previous top element. If its empty, this is the root - // elem and is not the child of anything. 
- fElemStack.addChild(elemDecl->getElementName(), true); - } - - // Skip any whitespace after the name - fReaderMgr.skipPastSpaces(); - - // We loop until we either see a /> or >, handling attribute/value - // pairs until we get there. - XMLSize_t attCount = 0; - XMLSize_t curAttListSize = fAttrList->size(); - wasAdded = false; - - fElemCount++; - - while (true) - { - // And get the next non-space character - XMLCh nextCh = fReaderMgr.peekNextChar(); - - // If the next character is not a slash or closed angle bracket, - // then it must be whitespace, since whitespace is required - // between the end of the last attribute and the name of the next - // one. - if (attCount) - { - if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) - { - if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) - { - // Ok, skip by them and peek another char - fReaderMgr.skipPastSpaces(); - nextCh = fReaderMgr.peekNextChar(); - } - else - { - // Emit the error but keep on going - emitError(XMLErrs::ExpectedWhitespace); - } - } - } - - // Ok, here we first check for any of the special case characters. - // If its not one, then we do the normal case processing, which - // assumes that we've hit an attribute value, Otherwise, we do all - // the special case checks. - if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) - { - // Assume its going to be an attribute, so get a name from - // the input. 
- - validName = fReaderMgr.getQName(fAttNameBuf, &colonPosition); - if (!validName) - { - if (fAttNameBuf.isEmpty()) - emitError(XMLErrs::ExpectedAttrName); - else - emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer()); - fReaderMgr.skipPastChar(chCloseAngle); - return false; - } - - // And next must be an equal sign - if (!scanEq()) - { - static const XMLCh tmpList[] = - { - chSingleQuote, chDoubleQuote, chCloseAngle - , chOpenAngle, chForwardSlash, chNull - }; - - emitError(XMLErrs::ExpectedEqSign); - - // Try to sync back up by skipping forward until we either - // hit something meaningful. - const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); - - if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) - { - // Jump back to top for normal processing of these - continue; - } - else if ((chFound == chSingleQuote) - || (chFound == chDoubleQuote) - || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) - { - // Just fall through assuming that the value is to follow - } - else if (chFound == chOpenAngle) - { - // Assume a malformed tag and that new one is starting - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - return false; - } - else - { - // Something went really wrong - return false; - } - } - - // See if this attribute is declared for this element. If we are - // not validating of course it will not be at first, but we will - // fault it into the pool (to avoid lots of redundant errors.) - XMLCh * namePtr = fAttNameBuf.getRawBuffer(); - XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr); - - // Skip any whitespace before the value and then scan the att - // value. This will come back normalized with entity refs and - // char refs expanded. 
- fReaderMgr.skipPastSpaces(); - if (!scanAttValue(attDef, namePtr, fAttValueBuf)) - { - static const XMLCh tmpList[] = - { - chCloseAngle, chOpenAngle, chForwardSlash, chNull - }; - - emitError(XMLErrs::ExpectedAttrValue); - - // It failed, so lets try to get synced back up. We skip - // forward until we find some whitespace or one of the - // chars in our list. - const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); - - if ((chFound == chCloseAngle) - || (chFound == chForwardSlash) - || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) - { - // Just fall through and process this attribute, though - // the value will be "". - } - else if (chFound == chOpenAngle) - { - // Assume a malformed tag and that new one is starting - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - return false; - } - else - { - // Something went really wrong - return false; - } - } - - // Add this attribute to the attribute list that we use to - // pass them to the handler. We reuse its existing elements - // but expand it as required. - // Note that we want to this first since this will - // make a copy of the namePtr; we can then make use of - // that copy in the hashtable lookup that checks - // for duplicates. This will mean we may have to update - // the type of the XMLAttr later. - XMLAttr* curAtt; - const XMLCh* attrValue = fAttValueBuf.getRawBuffer(); - - if (attCount >= curAttListSize) { - curAtt = new (fMemoryManager) XMLAttr(fMemoryManager); - fAttrList->addElement(curAtt); - } - else { - curAtt = fAttrList->elementAt(attCount); - } - - curAtt->setSpecified(true); - // DO NAMESPACES - { - curAtt->set( - fEmptyNamespaceId, namePtr, XMLUni::fgZeroLenString - , (attDef)? 
attDef->getType() : XMLAttDef::CData - ); - - // each attribute has the prefix:suffix="value" - const XMLCh* attPrefix = curAtt->getPrefix(); - const XMLCh* attLocalName = curAtt->getName(); - - if (attPrefix && *attPrefix) { - if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) { - curAtt->setURIId(fXMLNamespaceId); - } - else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) { - curAtt->setURIId(fXMLNSNamespaceId); - updateNSMap(attPrefix, attLocalName, attrValue); - } - else { - fAttrNSList->addElement(curAtt); - } - } - else if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName)) - { - updateNSMap(attPrefix, XMLUni::fgZeroLenString, attrValue); - } - - // NOTE: duplicate attribute check will be done, when we map - // namespaces to all attributes - if (attDef) { - unsigned int *curCountPtr = fAttDefRegistry->get(attDef); - if (!curCountPtr) { - curCountPtr = getNewUIntPtr(); - *curCountPtr = fElemCount; - fAttDefRegistry->put(attDef, curCountPtr); - } - else if (*curCountPtr < fElemCount) { - *curCountPtr = fElemCount; - } - } - } - - if (fValidate) - { - if (attDef) { - // Let the validator pass judgement on the attribute value - fValidator->validateAttrValue( - attDef, fAttValueBuf.getRawBuffer(), false, elemDecl - ); - } - else - { - fValidator->emitError - ( - XMLValid::AttNotDefinedForElement - , fAttNameBuf.getRawBuffer(), qnameRawBuf - ); - } - } - - // must set the newly-minted value on the XMLAttr: - curAtt->setValue(attrValue); - attCount++; - - // And jump back to the top of the loop - continue; - } - - // It was some special case character so do all of the checks and - // deal with it. 
- if (!nextCh) - ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); - - if (nextCh == chForwardSlash) - { - fReaderMgr.getNextChar(); - isEmpty = true; - if (!fReaderMgr.skippedChar(chCloseAngle)) - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - break; - } - else if (nextCh == chCloseAngle) - { - fReaderMgr.getNextChar(); - break; - } - else if (nextCh == chOpenAngle) - { - // Check for this one specially, since its going to be common - // and it is kind of auto-recovering since we've already hit the - // next open bracket, which is what we would have seeked to (and - // skipped this whole tag.) - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - break; - } - else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) - { - // Check for this one specially, which is probably a missing - // attribute name, e.g. ="value". Just issue expected name - // error and eat the quoted string, then jump back to the - // top again. - emitError(XMLErrs::ExpectedAttrName); - fReaderMgr.getNextChar(); - fReaderMgr.skipQuotedString(nextCh); - fReaderMgr.skipPastSpaces(); - continue; - } - } - - // Make an initial pass through the list and find any xmlns attributes. - if (attCount) - scanAttrListforNameSpaces(fAttrList, attCount, elemDecl); - - if(attCount) - { - // clean up after ourselves: - // clear the map used to detect duplicate attributes - fUndeclaredAttrRegistry->removeAll(); - } - - // Now lets get the fAttrList filled in. This involves faulting in any - // defaulted and fixed attributes and normalizing the values of any that - // we got explicitly. - // - // We update the attCount value with the total number of attributes, but - // it goes in with the number of values we got during the raw scan of - // explictly provided attrs above. - attCount = buildAttList(attCount, elemDecl, *fAttrList); - - // If we have a document handler, then tell it about this start tag. 
We - // don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send - // any prefix since its just one big name if we are not doing namespaces. - if (fDocHandler) - { - unsigned int uriId = resolvePrefix - ( - elemDecl->getElementName()->getPrefix() - , ElemStack::Mode_Element - ); - - fDocHandler->startElement - ( - *elemDecl - , uriId - , elemDecl->getElementName()->getPrefix() - , *fAttrList - , attCount - , isEmpty - , isRoot - ); - } - - // If empty, validate content right now if we are validating and then - // pop the element stack top. Else, we have to update the current stack - // top's namespace mapping elements. - if (isEmpty) - { - // If validating, then insure that its legal to have no content - if (fValidate) - { - XMLSize_t failure; - bool res = fValidator->checkContent(elemDecl, 0, 0, &failure); - if (!res) - { - fValidator->emitError - ( - XMLValid::ElementNotValidForContent - , qnameRawBuf - , elemDecl->getFormattedContentModel() - ); - } - } - - // Pop the element stack back off since it'll never be used now - fElemStack.popTop(); - - // If the elem stack is empty, then it was an empty root - if (isRoot) - gotData = false; - } - - return true; -} - -// --------------------------------------------------------------------------- -// DGXMLScanner: Grammar preparsing -// --------------------------------------------------------------------------- -Grammar* DGXMLScanner::loadGrammar(const InputSource& src - , const short grammarType - , const bool toCache) -{ - Grammar* loadedGrammar = 0; - - ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); - - try - { - fGrammarResolver->cacheGrammarFromParse(false); - fGrammarResolver->useCachedGrammarInParse(false); - fRootGrammar = 0; - - if (fValScheme == Val_Auto) { - fValidate = true; - } - - // Reset some status flags - fInException = false; - fStandalone = false; - fErrorCount = 0; - fHasNoDTD = true; - - if (grammarType == Grammar::DTDGrammarType) { - loadedGrammar = 
loadDTDGrammar(src, toCache); - } - } - // NOTE: - // - // In all of the error processing below, the emitError() call MUST come - // before the flush of the reader mgr, or it will fail because it tries - // to find out the position in the XML source of the error. - catch(const XMLErrs::Codes) - { - // This is a 'first failure' exception, so fall through - } - catch(const XMLValid::Codes) - { - // This is a 'first fatal error' type exit, so fall through - } - catch(const XMLException& excToCatch) - { - // Emit the error and catch any user exception thrown from here. Make - // sure in all cases we flush the reader manager. - fInException = true; - try - { - if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) - emitError - ( - XMLErrs::XMLException_Warning - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) - emitError - ( - XMLErrs::XMLException_Fatal - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else - emitError - ( - XMLErrs::XMLException_Error - , excToCatch.getCode() - , excToCatch.getMessage() - ); - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. - resetReaderMgr.release(); - - throw; - } - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. 
- resetReaderMgr.release(); - - throw; - } - - return loadedGrammar; -} - -Grammar* DGXMLScanner::loadDTDGrammar(const InputSource& src, - const bool toCache) -{ - // Reset the validators - fDTDValidator->reset(); - if (fValidatorFromUser) - fValidator->reset(); - - fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager); - fGrammarResolver->putGrammar(fDTDGrammar); - fGrammar = fDTDGrammar; - fValidator->setGrammar(fGrammar); - - // And for all installed handlers, send reset events. This gives them - // a chance to flush any cached data. - if (fDocHandler) - fDocHandler->resetDocument(); - if (fEntityHandler) - fEntityHandler->resetEntities(); - if (fErrorReporter) - fErrorReporter->resetErrors(); - - // Clear out the id reference list - resetValidationContext(); - - if (toCache) { - - unsigned int sysId = fGrammarResolver->getStringPool()->addOrFind(src.getSystemId()); - const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(sysId); - - fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); - ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); - fGrammarResolver->putGrammar(fGrammar); - } - - // Handle the creation of the XML reader object for this input source. - // This will provide us with transcoding and basic lexing services. - XMLReader* newReader = fReaderMgr.createReader - ( - src - , false - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , XMLReader::Source_External - , fCalculateSrcOfs - , fLowWaterMark - ); - if (!newReader) { - if (src.getIssueFatalErrorIfNotFound()) - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); - else - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); - } - - // In order to make the processing work consistently, we have to - // make this look like an external entity. 
So create an entity - // decl and fill it in and push it with the reader, as happens - // with an external entity. Put a janitor on it to insure it gets - // cleaned up. The reader manager does not adopt them. - const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; - DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); - declDTD->setSystemId(src.getSystemId()); - declDTD->setIsExternal(true); - - // Mark this one as a throw at end - newReader->setThrowAtEnd(true); - - // And push it onto the stack, with its pseudo name - fReaderMgr.pushReader(newReader, declDTD); - - // If we have a doc type handler and advanced callbacks are enabled, - // call the doctype event. - if (fDocTypeHandler) { - - // Create a dummy root - DTDElementDecl* rootDecl = new (fGrammarPoolMemoryManager) DTDElementDecl - ( - gDTDStr - , fEmptyNamespaceId - , DTDElementDecl::Any - , fGrammarPoolMemoryManager - ); - rootDecl->setCreateReason(DTDElementDecl::AsRootElem); - rootDecl->setExternalElemDeclaration(true); - Janitor janSrc(rootDecl); - - fDocTypeHandler->doctypeDecl(*rootDecl, src.getPublicId(), src.getSystemId(), false, true); - } - - // Create DTDScanner - DTDScanner dtdScanner - ( - (DTDGrammar*)fGrammar - , fDocTypeHandler - , fGrammarPoolMemoryManager - , fMemoryManager - ); - dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); - - // Tell it its not in an include section - dtdScanner.scanExtSubsetDecl(false, true); - - if (fValidate) { - // validate the DTD scan so far - fValidator->preContentValidation(false, true); - } - - if (toCache) - fGrammarResolver->cacheGrammars(); - - return fDTDGrammar; -} - - -// --------------------------------------------------------------------------- -// DGXMLScanner: Private helper methods -// --------------------------------------------------------------------------- -// This method handles the common initialization, to avoid having to do -// it redundantly in multiple constructors. 
-void DGXMLScanner::commonInit() -{ - // And we need one for the raw attribute scan. This just stores key/ - // value string pairs (prior to any processing.) - fAttrNSList = new (fMemoryManager) ValueVectorOf(8, fMemoryManager); - - // Create the Validator and init them - fDTDValidator = new (fMemoryManager) DTDValidator(); - initValidator(fDTDValidator); - fDTDElemNonDeclPool = new (fMemoryManager) NameIdPool(29, 128, fMemoryManager); - fAttDefRegistry = new (fMemoryManager) RefHashTableOf - ( - 131, false, fMemoryManager - ); - fUndeclaredAttrRegistry = new (fMemoryManager) Hash2KeysSetOf(7, fMemoryManager); - - if (fValidator) - { - if (!fValidator->handlesDTD()) - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); - } - else - { - fValidator = fDTDValidator; - } -} - -void DGXMLScanner::cleanUp() -{ - delete fAttrNSList; - delete fDTDValidator; - delete fDTDElemNonDeclPool; - delete fAttDefRegistry; - delete fUndeclaredAttrRegistry; -} - - -// This method is called from scanStartTagNS() to build up the list of -// XMLAttr objects that will be passed out in the start tag callout. We -// get the key/value pairs from the raw scan of explicitly provided attrs, -// which have not been normalized. And we get the element declaration from -// which we will get any defaulted or fixed attribute defs and add those -// in as well. -XMLSize_t -DGXMLScanner::buildAttList(const XMLSize_t attCount - , XMLElementDecl* elemDecl - , RefVectorOf& toFill) -{ - // Ask the element to clear the 'provided' flag on all of the att defs - // that it owns, and to return us a boolean indicating whether it has - // any defs. - const bool hasDefs = elemDecl->hasAttDefs(); - - // If there are no expliclitily provided attributes and there are no - // defined attributes for the element, the we don't have anything to do. - // So just return zero in this case. 
- if (!hasDefs && !attCount) - return 0; - - // Keep up with how many attrs we end up with total - XMLSize_t retCount = attCount; - - // And get the current size of the output vector. This lets us use - // existing elements until we fill it, then start adding new ones. - const XMLSize_t curAttListSize = toFill.size(); - - // Ok, so lets get an enumerator for the attributes of this element - // and run through them for well formedness and validity checks. But - // make sure that we had any attributes before we do it, since the list - // would have have gotten faulted in anyway. - if (hasDefs) - { - XMLAttDefList& attDefList = elemDecl->getAttDefList(); - for(XMLSize_t i=0; iget(&curDef); - if (!attCountPtr || *attCountPtr < fElemCount) - { // did not occur - const XMLAttDef::DefAttTypes defType = curDef.getDefaultType(); - - if (fValidate) - { - // If we are validating and its required, then an error - if (defType == XMLAttDef::Required) - { - fValidator->emitError - ( - XMLValid::RequiredAttrNotProvided - , curDef.getFullName() - ); - } - else if ((defType == XMLAttDef::Default) || - (defType == XMLAttDef::Fixed) ) - { - if (fStandalone && curDef.isExternal()) - { - // XML 1.0 Section 2.9 - // Document is standalone, so attributes must not be defaulted. 
- fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName()); - } - } - } - - // Fault in the value if needed, and bump the att count - if ((defType == XMLAttDef::Default) - || (defType == XMLAttDef::Fixed)) - { - // Let the validator pass judgement on the attribute value - if (fValidate) - { - fValidator->validateAttrValue - ( - &curDef - , curDef.getValue() - , false - , elemDecl - ); - } - - XMLAttr* curAtt; - if (retCount >= curAttListSize) - { - if (fDoNamespaces) - { - curAtt = new (fMemoryManager) XMLAttr - ( - fEmptyNamespaceId - , curDef.getFullName() - , curDef.getValue() - , curDef.getType() - , false - , fMemoryManager - ); - } - else - { - curAtt = new (fMemoryManager) XMLAttr - ( - 0 - , curDef.getFullName() - , XMLUni::fgZeroLenString - , curDef.getValue() - , curDef.getType() - , false - , fMemoryManager - ); - } - - fAttrList->addElement(curAtt); - } - else - { - curAtt = fAttrList->elementAt(retCount); - if (fDoNamespaces) - { - curAtt->set - ( - fEmptyNamespaceId - , curDef.getFullName() - , curDef.getValue() - , curDef.getType() - ); - } - else - { - curAtt->set - ( - 0 - , curDef.getFullName() - , XMLUni::fgZeroLenString - , curDef.getValue() - , curDef.getType() - ); - } - curAtt->setSpecified(false); - } - - if (fDoNamespaces) - { - // Map the new attribute's prefix to a URI id and store - // that in the attribute object. - const XMLCh* attPrefix = curAtt->getPrefix(); - if (attPrefix && *attPrefix) { - curAtt->setURIId - ( - resolvePrefix(attPrefix, ElemStack::Mode_Attribute) - ); - } - } - - retCount++; - } - } - } - } - - return retCount; -} - - -// This method will reset the scanner data structures, and related plugged -// in stuff, for a new scan session. We get the input source for the primary -// XML entity, create the reader for it, and push it on the stack so that -// upon successful return from here we are ready to go. 
-void DGXMLScanner::scanReset(const InputSource& src) -{ - - // This call implicitly tells us that we are going to reuse the scanner - // if it was previously used. So tell the validator to reset itself. - // - // But, if the fUseCacheGrammar flag is set, then don't reset it. - // - // NOTE: The ReaderMgr is flushed on the way out, because that is - // required to insure that files are closed. - fGrammarResolver->cacheGrammarFromParse(fToCacheGrammar); - fGrammarResolver->useCachedGrammarInParse(fUseCachedGrammar); - - fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager); - fGrammarResolver->putGrammar(fDTDGrammar); - fGrammar = fDTDGrammar; - fRootGrammar = 0; - fValidator->setGrammar(fGrammar); - - // Reset validation - fValidate = (fValScheme == Val_Always) ? true : false; - - // And for all installed handlers, send reset events. This gives them - // a chance to flush any cached data. - if (fDocHandler) - fDocHandler->resetDocument(); - if (fEntityHandler) - fEntityHandler->resetEntities(); - if (fErrorReporter) - fErrorReporter->resetErrors(); - - // Clear out the id reference list - resetValidationContext(); - - // Reset the Root Element Name - fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName; - fRootElemName = 0; - - // Reset the element stack, and give it the latest ids for the special - // URIs it has to know about. - fElemStack.reset - ( - fEmptyNamespaceId - , fUnknownNamespaceId - , fXMLNamespaceId - , fXMLNSNamespaceId - ); - - // Reset some status flags - fInException = false; - fStandalone = false; - fErrorCount = 0; - fHasNoDTD = true; - - // Reset the validators - fDTDValidator->reset(); - fDTDValidator->setErrorReporter(fErrorReporter); - if (fValidatorFromUser) - fValidator->reset(); - - // Handle the creation of the XML reader object for this input source. - // This will provide us with transcoding and basic lexing services. 
- XMLReader* newReader = fReaderMgr.createReader - ( - src - , true - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , XMLReader::Source_External - , fCalculateSrcOfs - , fLowWaterMark - ); - - if (!newReader) { - if (src.getIssueFatalErrorIfNotFound()) - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); - else - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); - } - - // Push this read onto the reader manager - fReaderMgr.pushReader(newReader, 0); - - // and reset security-related things if necessary: - if(fSecurityManager != 0) - { - fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit(); - fEntityExpansionCount = 0; - } - if(fUIntPoolRowTotal >= 32) - { // 8 KB tied up with validating attributes... - fAttDefRegistry->removeAll(); - recreateUIntPool(); - } - else - { - // note that this will implicitly reset the values of the hashtables, - // though their buckets will still be tied up - resetUIntPool(); - } - fUndeclaredAttrRegistry->removeAll(); - fAttrNSList->removeAllElements(); -} - - -// This method is called between markup in content. It scans for character -// data that is sent to the document handler. It watches for any markup -// characters that would indicate that the character data has ended. It also -// handles expansion of general and character entities. -// -// sendData() is a local static helper for this method which handles some -// code that must be done in three different places here. -void DGXMLScanner::sendCharData(XMLBuffer& toSend) -{ - // If no data in the buffer, then nothing to do - if (toSend.isEmpty()) - return; - - // We do different things according to whether we are validating or - // not. If not, its always just characters; else, it depends on the - // current element's content model. 
- if (fValidate) - { - // Get the raw data we need for the callback - const XMLCh* const rawBuf = toSend.getRawBuffer(); - const XMLSize_t len = toSend.getLen(); - - // And see if the current element is a 'Children' style content model - const ElemStack::StackElem* topElem = fElemStack.topElement(); - - // Get the character data opts for the current element - XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts(); - - if (charOpts == XMLElementDecl::NoCharData) - { - // They definitely cannot handle any type of char data - fValidator->emitError(XMLValid::NoCharDataInCM); - } - else if (fReaderMgr.getCurrentReader()->isAllSpaces(rawBuf, len)) - { - // Its all spaces. So, if they can take spaces, then send it - // as ignorable whitespace. If they can handle any char data - // send it as characters. - if (charOpts == XMLElementDecl::SpacesOk) { - if (fDocHandler) - fDocHandler->ignorableWhitespace(rawBuf, len, false); - } - else if (charOpts == XMLElementDecl::AllCharData) - { - if (fDocHandler) - fDocHandler->docCharacters(rawBuf, len, false); - } - } - else - { - // If they can take any char data, then send it. Otherwise, they - // can only handle whitespace and can't handle this stuff so - // issue an error. - if (charOpts == XMLElementDecl::AllCharData) - { - if (fDocHandler) - fDocHandler->docCharacters(rawBuf, len, false); - } - else - { - fValidator->emitError(XMLValid::NoCharDataInCM); - } - } - } - else - { - // Always assume its just char data if not validating - if (fDocHandler) - fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false); - } - - // Reset buffer - toSend.reset(); -} - - - -// This method is called with a key/value string pair that represents an -// xmlns="yyy" or xmlns:xxx="yyy" attribute. This method will update the -// current top of the element stack based on this data. We know that when -// we get here, that it is one of these forms, so we don't bother confirming -// it. 
-// -// But we have to ensure -// 1. xxx is not xmlns -// 2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa -// 3. yyy is not XMLUni::fgXMLNSURIName -// 4. if xxx is not null, then yyy cannot be an empty string. -void DGXMLScanner::updateNSMap(const XMLCh* const attrPrefix - , const XMLCh* const attrLocalName - , const XMLCh* const attrValue) -{ - // We either have the default prefix (""), or we point it into the attr - // name parameter. Note that the xmlns is not the prefix we care about - // here. To us, the 'prefix' is really the local part of the attrName - // parameter. - // - // Check 1. xxx is not xmlns - // 2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa - // 3. yyy is not XMLUni::fgXMLNSURIName - // 4. if xxx is not null, then yyy cannot be an empty string. - if (attrPrefix && *attrPrefix) { - - if (XMLString::equals(attrLocalName, XMLUni::fgXMLNSString)) - emitError(XMLErrs::NoUseOfxmlnsAsPrefix); - else if (XMLString::equals(attrLocalName, XMLUni::fgXMLString)) { - if (!XMLString::equals(attrValue, XMLUni::fgXMLURIName)) - emitError(XMLErrs::PrefixXMLNotMatchXMLURI); - } - - if (!attrValue) - emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName); - else if(!*attrValue && fXMLVersion == XMLReader::XMLV1_0) - emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName); - } - - if (XMLString::equals(attrValue, XMLUni::fgXMLNSURIName)) - emitError(XMLErrs::NoUseOfxmlnsURI); - else if (XMLString::equals(attrValue, XMLUni::fgXMLURIName)) { - if (!XMLString::equals(attrLocalName, XMLUni::fgXMLString)) - emitError(XMLErrs::XMLURINotMatchXMLPrefix); - } - - // Ok, we have to get the unique id for the attribute value, which is the - // URI that this value should be mapped to. The validator has the - // namespace string pool, so we ask him to find or add this new one. Then - // we ask the element stack to add this prefix to URI Id mapping. 
- fElemStack.addPrefix - ( - attrLocalName - , fURIStringPool->addOrFind(attrValue) - ); -} - -void DGXMLScanner::scanAttrListforNameSpaces(RefVectorOf* theAttrList, XMLSize_t attCount, - XMLElementDecl* elemDecl) -{ - // Map prefixes to uris - for (XMLSize_t i=0; i < fAttrNSList->size(); i++) { - XMLAttr* providedAttr = fAttrNSList->elementAt(i); - providedAttr->setURIId( - resolvePrefix(providedAttr->getPrefix(), ElemStack::Mode_Attribute) - ); - } - - fAttrNSList->removeAllElements(); - - // Decide if to use hash table to do duplicate checking - bool toUseHashTable = false; - - setAttrDupChkRegistry(attCount, toUseHashTable); - for (XMLSize_t index = 0; index < attCount; index++) - { - // check for duplicate namespace attributes: - // by checking for qualified names with the same local part and with prefixes - // which have been bound to namespace names that are identical. - XMLAttr* curAttr = theAttrList->elementAt(index); - if (!toUseHashTable) - { - XMLAttr* loopAttr; - for (XMLSize_t attrIndex=0; attrIndex < index; attrIndex++) { - loopAttr = theAttrList->elementAt(attrIndex); - if (loopAttr->getURIId() == curAttr->getURIId() && - XMLString::equals(loopAttr->getName(), curAttr->getName())) { - emitError( - XMLErrs::AttrAlreadyUsedInSTag, curAttr->getName() - , elemDecl->getFullName() - ); - } - } - } - else - { - if (fAttrDupChkRegistry->containsKey((void*)curAttr->getName(), curAttr->getURIId())) - { - emitError( - XMLErrs::AttrAlreadyUsedInSTag - , curAttr->getName(), elemDecl->getFullName() - ); - } - - fAttrDupChkRegistry->put((void*)curAttr->getName(), curAttr->getURIId(), curAttr); - } - } -} - -InputSource* DGXMLScanner::resolveSystemId(const XMLCh* const sysId - ,const XMLCh* const pubId) -{ - //Normalize sysId - XMLBufBid nnSys(&fBufMgr); - XMLBuffer& normalizedSysId = nnSys.getBuffer(); - XMLString::removeChar(sysId, 0xFFFF, normalizedSysId); - const XMLCh* normalizedURI = normalizedSysId.getRawBuffer(); - - // Create a buffer for expanding the 
normalized system id - XMLBufBid bbSys(&fBufMgr); - XMLBuffer& expSysId = bbSys.getBuffer(); - - // Allow the entity handler to expand the system id if they choose - // to do so. - InputSource* srcToFill = 0; - if (fEntityHandler) - { - if (!fEntityHandler->expandSystemId(normalizedURI, expSysId)) - expSysId.set(normalizedURI); - - ReaderMgr::LastExtEntityInfo lastInfo; - fReaderMgr.getLastExtEntityInfo(lastInfo); - XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity, - expSysId.getRawBuffer(), 0, pubId, lastInfo.systemId, - &fReaderMgr); - srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier); - } - else - { - expSysId.set(normalizedURI); - } - - // If they didn't create a source via the entity handler, then we - // have to create one on our own. - if (!srcToFill) - { - if (fDisableDefaultEntityResolution) - return srcToFill; - - ReaderMgr::LastExtEntityInfo lastInfo; - fReaderMgr.getLastExtEntityInfo(lastInfo); - - XMLURL urlTmp(fMemoryManager); - if ((!urlTmp.setURL(lastInfo.systemId, expSysId.getRawBuffer(), urlTmp)) || - (urlTmp.isRelative())) - { - if (!fStandardUriConformant) - { - XMLBufBid ddSys(&fBufMgr); - XMLBuffer& resolvedSysId = ddSys.getBuffer(); - XMLUri::normalizeURI(expSysId.getRawBuffer(), resolvedSysId); - - srcToFill = new (fMemoryManager) LocalFileInputSource - ( - lastInfo.systemId - , resolvedSysId.getRawBuffer() - , fMemoryManager - ); - } - else - ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager); - } - else - { - if (fStandardUriConformant && urlTmp.hasInvalidChar()) - ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager); - srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager); - } - } - - return srcToFill; -} - -// --------------------------------------------------------------------------- -// DGXMLScanner: Private parsing methods -// --------------------------------------------------------------------------- 
-bool DGXMLScanner::scanAttValue( const XMLAttDef* const attDef - , const XMLCh *const attrName - , XMLBuffer& toFill) -{ - enum States - { - InWhitespace - , InContent - }; - - // Get the type and name - const XMLAttDef::AttTypes type = (attDef) - ?attDef->getType() - :XMLAttDef::CData; - - // Reset the target buffer - toFill.reset(); - - // Get the next char which must be a single or double quote - XMLCh quoteCh; - if (!fReaderMgr.skipIfQuote(quoteCh)) - return false; - - // We have to get the current reader because we have to ignore closing - // quotes until we hit the same reader again. - const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum(); - - // check to see if it's a tokenized type that is declared externally - bool isAttTokenizedExternal = (attDef) - ?attDef->isExternal() && (type == XMLAttDef::ID || - type == XMLAttDef::IDRef || - type == XMLAttDef::IDRefs || - type == XMLAttDef::Entity || - type == XMLAttDef::Entities || - type == XMLAttDef::NmToken || - type == XMLAttDef::NmTokens) - :false; - - // Loop until we get the attribute value. Note that we use a double - // loop here to avoid the setup/teardown overhead of the exception - // handler on every round. - XMLCh nextCh; - XMLCh secondCh = 0; - States curState = InContent; - bool firstNonWS = false; - bool gotLeadingSurrogate = false; - bool escaped; - while (true) - { - try - { - while(true) - { - nextCh = fReaderMgr.getNextChar(); - - if (!nextCh) - ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); - - // Check for our ending quote in the same entity - if (nextCh == quoteCh) - { - if (curReader == fReaderMgr.getCurrentReaderNum()) - return true; - - // Watch for spillover into a previous entity - if (curReader > fReaderMgr.getCurrentReaderNum()) - { - emitError(XMLErrs::PartialMarkupInEntity); - return false; - } - } - - // Check for an entity ref now, before we let it affect our - // whitespace normalization logic below. 
We ignore the empty flag - // in this one. - escaped = false; - if (nextCh == chAmpersand) - { - if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned) - { - gotLeadingSurrogate = false; - continue; - } - } - else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) - { - // Deal with surrogate pairs - // Its a leading surrogate. If we already got one, then - // issue an error, else set leading flag to make sure that - // we look for a trailing next time. - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); - else - gotLeadingSurrogate = true; - } - else - { - // If its a trailing surrogate, make sure that we are - // prepared for that. Else, its just a regular char so make - // sure that we were not expected a trailing surrogate. - if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) - { - // Its trailing, so make sure we were expecting it - if (!gotLeadingSurrogate) - emitError(XMLErrs::Unexpected2ndSurrogateChar); - } - else - { - // Its just a char, so make sure we were not expecting a - // trailing surrogate. - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); - - // Its got to at least be a valid XML character - if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - , fMemoryManager - ); - emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); - } - } - gotLeadingSurrogate = false; - } - - // If its not escaped, then make sure its not a < character, which - // is not allowed in attribute values. - if (!escaped && (nextCh == chOpenAngle)) - emitError(XMLErrs::BracketInAttrValue, attrName); - - // If the attribute is a CDATA type we do simple replacement of - // tabs and new lines with spaces, if the character is not escaped - // by way of a char ref. - // - // Otherwise, we do the standard non-CDATA normalization of - // compressing whitespace to single spaces and getting rid of leading - // and trailing whitespace. 
- if (type == XMLAttDef::CData) - { - if (!escaped) - { - if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D)) - { - // Check Validity Constraint for Standalone document declaration - // XML 1.0, Section 2.9 - if (fStandalone && fValidate && isAttTokenizedExternal) - { - // Can't have a standalone document declaration of "yes" if attribute - // values are subject to normalisation - fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName); - } - nextCh = chSpace; - } - } - } - else - { - if (curState == InWhitespace) - { - if ((escaped && nextCh != chSpace) || !fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) - { - if (firstNonWS) - toFill.append(chSpace); - curState = InContent; - firstNonWS = true; - } - else - { - continue; - } - } - else if (curState == InContent) - { - if ((nextCh == chSpace) || - (fReaderMgr.getCurrentReader()->isWhitespace(nextCh) && !escaped)) - { - curState = InWhitespace; - - // Check Validity Constraint for Standalone document declaration - // XML 1.0, Section 2.9 - if (fStandalone && fValidate && isAttTokenizedExternal) - { - if (!firstNonWS || (nextCh != chSpace && fReaderMgr.lookingAtSpace())) - { - // Can't have a standalone document declaration of "yes" if attribute - // values are subject to normalisation - fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName); - } - } - continue; - } - firstNonWS = true; - } - } - - // Else add it to the buffer - toFill.append(nextCh); - - if (secondCh) - { - toFill.append(secondCh); - secondCh=0; - } - } - } - catch(const EndOfEntityException&) - { - // Just eat it and continue. - gotLeadingSurrogate = false; - escaped = false; - } - } - return true; -} - - -// This method scans a CDATA section. It collects the character into one -// of the temp buffers and calls the document handler, if any, with the -// characters. 
It assumes that the fThisElement->getCharDataOpts(); - - while (true) - { - const XMLCh nextCh = fReaderMgr.getNextChar(); - - // Watch for unexpected end of file - if (!nextCh) - { - emitError(XMLErrs::UnterminatedCDATASection); - ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); - } - - if (fValidate && fStandalone && (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))) - { - // This document is standalone; this ignorable CDATA whitespace is forbidden. - // XML 1.0, Section 2.9 - // And see if the current element is a 'Children' style content model - if (topElem->fThisElement->isExternal()) { - - if (charOpts == XMLElementDecl::SpacesOk) // Element Content - { - // Error - standalone should have a value of "no" as whitespace detected in an - // element type with element content whose element declaration was external - fValidator->emitError(XMLValid::NoWSForStandalone); - } - } - } - - // If this is a close square bracket it could be our closing - // sequence. - if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose)) - { - // make sure we were not expecting a trailing surrogate. - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); - - if (fValidate) { - - if (charOpts != XMLElementDecl::AllCharData) - { - // They definitely cannot handle any type of char data - fValidator->emitError(XMLValid::NoCharDataInCM); - } - } - - // If we have a doc handler, call it - if (fDocHandler) - { - fDocHandler->docCharacters - ( - bbCData.getRawBuffer() - , bbCData.getLen() - , true - ); - } - - // And we are done - break; - } - - // Make sure its a valid character. But if we've emitted an error - // already, don't bother with the overhead since we've already told - // them about it. - if (!emittedError) - { - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) - { - // Its a leading surrogate. 
If we already got one, then - // issue an error, else set leading flag to make sure that - // we look for a trailing next time. - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); - else - gotLeadingSurrogate = true; - } - else - { - // If its a trailing surrogate, make sure that we are - // prepared for that. Else, its just a regular char so make - // sure that we were not expected a trailing surrogate. - if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) - { - // Its trailing, so make sure we were expecting it - if (!gotLeadingSurrogate) - emitError(XMLErrs::Unexpected2ndSurrogateChar); - } - else - { - // Its just a char, so make sure we were not expecting a - // trailing surrogate. - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); - - // Its got to at least be a valid XML character - else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - , fMemoryManager - ); - emitError(XMLErrs::InvalidCharacter, tmpBuf); - emittedError = true; - } - } - gotLeadingSurrogate = false; - } - } - - // Add it to the buffer - bbCData.append(nextCh); - } -} - - -void DGXMLScanner::scanCharData(XMLBuffer& toUse) -{ - // We have to watch for the stupid ]]> sequence, which is illegal in - // character data. So this is a little state machine that handles that. - enum States - { - State_Waiting - , State_GotOne - , State_GotTwo - }; - - // Reset the buffer before we start - toUse.reset(); - - // Turn on the 'throw at end' flag of the reader manager - ThrowEOEJanitor jan(&fReaderMgr, true); - - // In order to be more efficient we have to use kind of a deeply nested - // set of blocks here. The outer block puts on a try and catches end of - // entity exceptions. The inner loop is the per-character loop. 
If we - // put the try inside the inner loop, it would work but would require - // the exception handling code setup/teardown code to be invoked for - // each character. - XMLCh nextCh; - XMLCh secondCh = 0; - States curState = State_Waiting; - bool escaped = false; - bool gotLeadingSurrogate = false; - bool notDone = true; - while (notDone) - { - try - { - while (true) - { - // Eat through as many plain content characters as possible without - // needing special handling. Moving most content characters here, - // in this one call, rather than running the overall loop once - // per content character, is a speed optimization. - if (curState == State_Waiting && !gotLeadingSurrogate) - { - fReaderMgr.movePlainContentChars(toUse); - } - - // Try to get another char from the source - // The code from here on down covers all contengencies, - if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) - { - // If we were waiting for a trailing surrogate, its an error - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); - - notDone = false; - break; - } - - // Watch for a reference. Note that the escapement mechanism - // is ignored in this content. - escaped = false; - if (nextCh == chAmpersand) - { - sendCharData(toUse); - - // Turn off the throwing at the end of entity during this - ThrowEOEJanitor jan(&fReaderMgr, false); - - if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned) - { - gotLeadingSurrogate = false; - continue; - } - else - { - if (escaped && !fElemStack.isEmpty()) - fElemStack.setReferenceEscaped(); - } - } - else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) - { - // Deal with surrogate pairs - // Its a leading surrogate. If we already got one, then - // issue an error, else set leading flag to make sure that - // we look for a trailing next time. 
- if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); - else - gotLeadingSurrogate = true; - } - else - { - // If its a trailing surrogate, make sure that we are - // prepared for that. Else, its just a regular char so make - // sure that we were not expected a trailing surrogate. - if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) - { - // Its trailing, so make sure we were expecting it - if (!gotLeadingSurrogate) - emitError(XMLErrs::Unexpected2ndSurrogateChar); - } - else - { - // Its just a char, so make sure we were not expecting a - // trailing surrogate. - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); - - // Make sure the returned char is a valid XML char - if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - , fMemoryManager - ); - emitError(XMLErrs::InvalidCharacter, tmpBuf); - } - } - gotLeadingSurrogate = false; - } - - // Keep the state machine up to date - if (!escaped) - { - if (nextCh == chCloseSquare) - { - if (curState == State_Waiting) - curState = State_GotOne; - else if (curState == State_GotOne) - curState = State_GotTwo; - } - else if (nextCh == chCloseAngle) - { - if (curState == State_GotTwo) - emitError(XMLErrs::BadSequenceInCharData); - curState = State_Waiting; - } - else - { - curState = State_Waiting; - } - } - else - { - curState = State_Waiting; - } - - // Add this char to the buffer - toUse.append(nextCh); - - if (secondCh) - { - toUse.append(secondCh); - secondCh=0; - } - } - } - catch(const EndOfEntityException& toCatch) - { - // Some entity ended, so we have to send any accumulated - // chars and send an end of entity event. 
- sendCharData(toUse); - gotLeadingSurrogate = false; - - if (fDocHandler) - fDocHandler->endEntityReference(toCatch.getEntity()); - } - } - - // Check the validity constraints as per XML 1.0 Section 2.9 - if (fValidate && fStandalone) - { - // See if the text contains whitespace - // Get the raw data we need for the callback - const XMLCh* rawBuf = toUse.getRawBuffer(); - const XMLSize_t len = toUse.getLen(); - const bool isSpaces = fReaderMgr.getCurrentReader()->containsWhiteSpace(rawBuf, len); - - if (isSpaces) - { - // And see if the current element is a 'Children' style content model - const ElemStack::StackElem* topElem = fElemStack.topElement(); - - if (topElem->fThisElement->isExternal()) { - - // Get the character data opts for the current element - XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts(); - - if (charOpts == XMLElementDecl::SpacesOk) // => Element Content - { - // Error - standalone should have a value of "no" as whitespace detected in an - // element type with element content whose element declaration was external - // - fValidator->emitError(XMLValid::NoWSForStandalone); - } - } - } - } - // Send any char data that we accumulated into the buffer - sendCharData(toUse); -} - - -// This method will scan a general/character entity ref. It will either -// expand a char ref and return it directly, or push a reader for a general -// entity. -// -// The return value indicates whether the char parameters hold the value -// or whether the value was pushed as a reader, or that it failed. -// -// The escaped flag tells the caller whether the returned parameter resulted -// from a character reference, which escapes the character in some cases. It -// only makes any difference if the return value indicates the value was -// returned directly. 
-DGXMLScanner::EntityExpRes -DGXMLScanner::scanEntityRef( const bool inAttVal - , XMLCh& firstCh - , XMLCh& secondCh - , bool& escaped) -{ - // Assume no escape - secondCh = 0; - escaped = false; - - // We have to insure that its all in one entity - const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum(); - - // If the next char is a pound, then its a character reference and we - // need to expand it always. - if (fReaderMgr.skippedChar(chPound)) - { - // Its a character reference, so scan it and get back the numeric - // value it represents. - if (!scanCharRef(firstCh, secondCh)) - return EntityExp_Failed; - - escaped = true; - - if (curReader != fReaderMgr.getCurrentReaderNum()) - emitError(XMLErrs::PartialMarkupInEntity); - - return EntityExp_Returned; - } - - // Expand it since its a normal entity ref - XMLBufBid bbName(&fBufMgr); - - int colonPosition; - bool validName = fDoNamespaces ? fReaderMgr.getQName(bbName.getBuffer(), &colonPosition) : - fReaderMgr.getName(bbName.getBuffer()); - if (!validName) - { - if (bbName.isEmpty()) - emitError(XMLErrs::ExpectedEntityRefName); - else - emitError(XMLErrs::InvalidEntityRefName, bbName.getRawBuffer()); - return EntityExp_Failed; - } - - // Next char must be a semi-colon. But if its not, just emit - // an error and try to continue. 
- if (!fReaderMgr.skippedChar(chSemiColon)) - emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer()); - - // Make sure we ended up on the same entity reader as the & char - if (curReader != fReaderMgr.getCurrentReaderNum()) - emitError(XMLErrs::PartialMarkupInEntity); - - // Look up the name in the general entity pool - XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer()); - - // If it does not exist, then obviously an error - if (!decl) - { - // XML 1.0 Section 4.1 - // Well-formedness Constraint for entity not found: - // In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references, - // or a document with "standalone='yes'", for an entity reference that does not occur within the external subset - // or a parameter entity - // - // Else it's Validity Constraint - if (fStandalone || fHasNoDTD) - emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer()); - else { - if (fValidate) - fValidator->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer()); - } - - return EntityExp_Failed; - } - - // XML 1.0 Section 4.1 - // If we are a standalone document, then it has to have been declared - // in the internal subset. 
- if (fStandalone && !decl->getDeclaredInIntSubset()) - emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer()); - - if (decl->isExternal()) - { - // If its unparsed, then its not valid here - if (decl->isUnparsed()) - { - emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer()); - return EntityExp_Failed; - } - - // If we are in an attribute value, then not valid but keep going - if (inAttVal) - emitError(XMLErrs::NoExtRefsInAttValue); - - // And now create a reader to read this entity - InputSource* srcUsed; - XMLReader* reader = fReaderMgr.createReader - ( - decl->getBaseURI() - , decl->getSystemId() - , decl->getPublicId() - , false - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , XMLReader::Source_External - , srcUsed - , fCalculateSrcOfs - , fLowWaterMark - , fDisableDefaultEntityResolution - ); - - // Put a janitor on the source so it gets cleaned up on exit - Janitor janSrc(srcUsed); - - // If the creation failed, and its not because the source was empty, - // then emit an error and return. - if (!reader) - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager); - - // Push the reader. If its a recursive expansion, then emit an error - // and return an failure. - if (!fReaderMgr.pushReader(reader, decl)) - { - emitError(XMLErrs::RecursiveEntity, decl->getName()); - return EntityExp_Failed; - } - - // here's where we need to check if there's a SecurityManager, - // how many entity references we've had - if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) { - XMLCh expLimStr[32]; - XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager); - emitError - ( - XMLErrs::EntityExpansionLimitExceeded - , expLimStr - ); - // there seems nothing better to do than reset the entity expansion counter - fEntityExpansionCount = 0; - } - - // Do a start entity reference event. 
- // - // For now, we supress them in att values. Later, when - // the stuff is in place to correctly allow DOM to handle them - // we'll turn this back on. - if (fDocHandler && !inAttVal) - fDocHandler->startEntityReference(*decl); - - // If it starts with the XML string, then parse a text decl - if (checkXMLDecl(true)) - scanXMLDecl(Decl_Text); - } - else - { - // If its one of the special char references, then we can return - // it as a character, and its considered escaped. - if (decl->getIsSpecialChar()) - { - firstCh = decl->getValue()[0]; - escaped = true; - return EntityExp_Returned; - } - - // Create a reader over a memory stream over the entity value - // We force it to assume UTF-16 by passing in an encoding - // string. This way it won't both trying to predecode the - // first line, looking for an XML/TextDecl. - XMLReader* valueReader = fReaderMgr.createIntEntReader - ( - decl->getName() - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , decl->getValue() - , decl->getValueLen() - , false - ); - - // Try to push the entity reader onto the reader manager stack, - // where it will become the subsequent input. If it fails, that - // means the entity is recursive, so issue an error. The reader - // will have just been discarded, but we just keep going. - if (!fReaderMgr.pushReader(valueReader, decl)) - emitError(XMLErrs::RecursiveEntity, decl->getName()); - - // here's where we need to check if there's a SecurityManager, - // how many entity references we've had - if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) { - XMLCh expLimStr[32]; - XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager); - emitError - ( - XMLErrs::EntityExpansionLimitExceeded - , expLimStr - ); - } - - // Do a start entity reference event. - // - // For now, we supress them in att values. Later, when - // the stuff is in place to correctly allow DOM to handle them - // we'll turn this back on. 
- if (fDocHandler && !inAttVal) - fDocHandler->startEntityReference(*decl); - - // If it starts with the XML string, then it's an error - if (checkXMLDecl(true)) { - emitError(XMLErrs::TextDeclNotLegalHere); - fReaderMgr.skipPastChar(chCloseAngle); - } - } - return EntityExp_Pushed; -} - - -} From 523adfdc97e7ff49754328c790d5def735406fed Mon Sep 17 00:00:00 2001 From: johnjamesmccann <98098904+johnjamesmccann@users.noreply.github.com> Date: Fri, 21 Jan 2022 13:47:11 +0000 Subject: [PATCH 3/5] Delete IGXMLScanner.cpp --- IGXMLScanner.cpp | 3275 ---------------------------------------------- 1 file changed, 3275 deletions(-) delete mode 100644 IGXMLScanner.cpp diff --git a/IGXMLScanner.cpp b/IGXMLScanner.cpp deleted file mode 100644 index 5fe3da6f1..000000000 --- a/IGXMLScanner.cpp +++ /dev/null @@ -1,3275 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * $Id$ - */ - -// SPDX-FileCopyrightText: Portions Copyright 2021 Siemens -// Modified on 15-Jul-2021 by Siemens and/or its affiliates to fix CVE-2018-1311: Apache Xerces-C use-after-free vulnerability scanning external DTD. Copyright 2021 Siemens. 
- -// --------------------------------------------------------------------------- -// Includes -// --------------------------------------------------------------------------- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace XERCES_CPP_NAMESPACE { - - -typedef JanitorMemFunCall CleanupType; -typedef JanitorMemFunCall ReaderMgrResetType; - - -// --------------------------------------------------------------------------- -// IGXMLScanner: Constructors and Destructor -// --------------------------------------------------------------------------- -IGXMLScanner::IGXMLScanner( XMLValidator* const valToAdopt - , GrammarResolver* const grammarResolver - , MemoryManager* const manager) : - - XMLScanner(valToAdopt, grammarResolver, manager) - , fSeeXsi(false) - , fGrammarType(Grammar::UnKnown) - , fElemStateSize(16) - , fElemState(0) - , fElemLoopState(0) - , fContent(1023, manager) - , fRawAttrList(0) - , fRawAttrColonListSize(32) - , fRawAttrColonList(0) - , fDTDValidator(0) - , fSchemaValidator(0) - , fDTDGrammar(0) - , fICHandler(0) - , fLocationPairs(0) - , fDTDElemNonDeclPool(0) - , fSchemaElemNonDeclPool(0) - , fElemCount(0) - , fAttDefRegistry(0) - , fUndeclaredAttrRegistry(0) - , fPSVIAttrList(0) - , fModel(0) - , fPSVIElement(0) - , fErrorStack(0) - , fSchemaInfoList(0) - , fCachedSchemaInfoList (0) -{ - CleanupType cleanup(this, &IGXMLScanner::cleanUp); - - try - { - commonInit(); - } - catch(const OutOfMemoryException&) - { - // Don't cleanup when out of memory, since executing the - // code can cause problems. 
- cleanup.release(); - - throw; - } - - cleanup.release(); -} - -IGXMLScanner::IGXMLScanner( XMLDocumentHandler* const docHandler - , DocTypeHandler* const docTypeHandler - , XMLEntityHandler* const entityHandler - , XMLErrorReporter* const errHandler - , XMLValidator* const valToAdopt - , GrammarResolver* const grammarResolver - , MemoryManager* const manager) : - - XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager) - , fSeeXsi(false) - , fGrammarType(Grammar::UnKnown) - , fElemStateSize(16) - , fElemState(0) - , fElemLoopState(0) - , fContent(1023, manager) - , fRawAttrList(0) - , fRawAttrColonListSize(32) - , fRawAttrColonList(0) - , fDTDValidator(0) - , fSchemaValidator(0) - , fDTDGrammar(0) - , fICHandler(0) - , fLocationPairs(0) - , fDTDElemNonDeclPool(0) - , fSchemaElemNonDeclPool(0) - , fElemCount(0) - , fAttDefRegistry(0) - , fUndeclaredAttrRegistry(0) - , fPSVIAttrList(0) - , fModel(0) - , fPSVIElement(0) - , fErrorStack(0) - , fSchemaInfoList(0) - , fCachedSchemaInfoList (0) -{ - CleanupType cleanup(this, &IGXMLScanner::cleanUp); - - try - { - commonInit(); - } - catch(const OutOfMemoryException&) - { - // Don't cleanup when out of memory, since executing the - // code can cause problems. 
- cleanup.release(); - - throw; - } - - cleanup.release(); -} - -IGXMLScanner::~IGXMLScanner() -{ - cleanUp(); -} - -// --------------------------------------------------------------------------- -// XMLScanner: Getter methods -// --------------------------------------------------------------------------- -NameIdPool* IGXMLScanner::getEntityDeclPool() -{ - if(!fDTDGrammar) - return 0; - return fDTDGrammar->getEntityDeclPool(); -} - -const NameIdPool* IGXMLScanner::getEntityDeclPool() const -{ - if(!fDTDGrammar) - return 0; - return fDTDGrammar->getEntityDeclPool(); -} - -// --------------------------------------------------------------------------- -// IGXMLScanner: Main entry point to scan a document -// --------------------------------------------------------------------------- -void IGXMLScanner::scanDocument(const InputSource& src) -{ - // Bump up the sequence id for this parser instance. This will invalidate - // any previous progressive scan tokens. - fSequenceId++; - - ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); - - try - { - // Reset the scanner and its plugged in stuff for a new run. This - // resets all the data structures, creates the initial reader and - // pushes it on the stack, and sets up the base document path. - scanReset(src); - - // If we have a document handler, then call the start document - if (fDocHandler) - fDocHandler->startDocument(); - - // Scan the prolog part, which is everything before the root element - // including the DTD subsets. - scanProlog(); - - // If we got to the end of input, then its not a valid XML file. - // Else, go on to scan the content. - if (fReaderMgr.atEOF()) - { - emitError(XMLErrs::EmptyMainEntity); - } - else - { - // Scan content, and tell it its not an external entity - if (scanContent()) - { - // Do post-parse validation if required - if (fValidate) - { - // We handle ID reference semantics at this level since - // its required by XML 1.0. 
- checkIDRefs(); - - // Then allow the validator to do any extra stuff it wants -// fValidator->postParseValidation(); - } - - // That went ok, so scan for any miscellaneous stuff - if (!fReaderMgr.atEOF()) - scanMiscellaneous(); - } - } - - // If we have a document handler, then call the end document - if (fDocHandler) - fDocHandler->endDocument(); - - //cargill debug: - //fGrammarResolver->getXSModel(); - } - // NOTE: - // - // In all of the error processing below, the emitError() call MUST come - // before the flush of the reader mgr, or it will fail because it tries - // to find out the position in the XML source of the error. - catch(const XMLErrs::Codes) - { - // This is a 'first failure' exception, so fall through - } - catch(const XMLValid::Codes) - { - // This is a 'first fatal error' type exit, so fall through - } - catch(const XMLException& excToCatch) - { - // Emit the error and catch any user exception thrown from here. Make - // sure in all cases we flush the reader manager. - fInException = true; - try - { - if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) - emitError - ( - XMLErrs::XMLException_Warning - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) - emitError - ( - XMLErrs::XMLException_Fatal - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else - emitError - ( - XMLErrs::XMLException_Error - , excToCatch.getCode() - , excToCatch.getMessage() - ); - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. - resetReaderMgr.release(); - - throw; - } - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. 
- resetReaderMgr.release(); - - throw; - } -} - - -bool IGXMLScanner::scanNext(XMLPScanToken& token) -{ - // Make sure this token is still legal - if (!isLegalToken(token)) - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager); - - // Find the next token and remember the reader id - XMLSize_t orgReader; - XMLTokens curToken; - - ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); - - bool retVal = true; - - try - { - while (true) - { - // We have to handle any end of entity exceptions that happen here. - // We could be at the end of X nested entities, each of which will - // generate an end of entity exception as we try to move forward. - try - { - curToken = senseNextToken(orgReader); - break; - } - catch(const EndOfEntityException& toCatch) - { - // Send an end of entity reference event - if (fDocHandler) - fDocHandler->endEntityReference(toCatch.getEntity()); - } - } - - if (curToken == Token_CharData) - { - scanCharData(fCDataBuf); - } - else if (curToken == Token_EOF) - { - if (!fElemStack.isEmpty()) - { - const ElemStack::StackElem* topElem = fElemStack.popTop(); - emitError - ( - XMLErrs::EndedWithTagsOnStack - , topElem->fThisElement->getFullName() - ); - } - - retVal = false; - } - else - { - // Its some sort of markup - bool gotData = true; - switch(curToken) - { - case Token_CData : - // Make sure we are within content - if (fElemStack.isEmpty()) - emitError(XMLErrs::CDATAOutsideOfContent); - scanCDSection(); - break; - - case Token_Comment : - scanComment(); - break; - - case Token_EndTag : - scanEndTag(gotData); - break; - - case Token_PI : - scanPI(); - break; - - case Token_StartTag : - if (fDoNamespaces) - scanStartTagNS(gotData); - else - scanStartTag(gotData); - break; - - default : - fReaderMgr.skipToChar(chOpenAngle); - break; - } - - if (orgReader != fReaderMgr.getCurrentReaderNum()) - emitError(XMLErrs::PartialMarkupInEntity); - - // If we hit the end, then do the miscellaneous part - if 
(!gotData) - { - // Do post-parse validation if required - if (fValidate) - { - // We handle ID reference semantics at this level since - // its required by XML 1.0. - checkIDRefs(); - - // Then allow the validator to do any extra stuff it wants -// fValidator->postParseValidation(); - } - - // That went ok, so scan for any miscellaneous stuff - scanMiscellaneous(); - - if (toCheckIdentityConstraint()) - fICHandler->endDocument(); - - if (fDocHandler) - fDocHandler->endDocument(); - } - } - } - // NOTE: - // - // In all of the error processing below, the emitError() call MUST come - // before the flush of the reader mgr, or it will fail because it tries - // to find out the position in the XML source of the error. - catch(const XMLErrs::Codes) - { - // This is a 'first failure' exception so return failure - retVal = false; - } - catch(const XMLValid::Codes) - { - // This is a 'first fatal error' type exit, so return failure - retVal = false; - } - catch(const XMLException& excToCatch) - { - // Emit the error and catch any user exception thrown from here. Make - // sure in all cases we flush the reader manager. - fInException = true; - try - { - if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) - emitError - ( - XMLErrs::XMLException_Warning - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) - emitError - ( - XMLErrs::XMLException_Fatal - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else - emitError - ( - XMLErrs::XMLException_Error - , excToCatch.getCode() - , excToCatch.getMessage() - ); - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. 
- resetReaderMgr.release(); - - throw; - } - - retVal = false; - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. - resetReaderMgr.release(); - - throw; - } - - // If we are not at the end, release the object that will - // reset the ReaderMgr. - if (retVal) - resetReaderMgr.release(); - - return retVal; -} - - - -// --------------------------------------------------------------------------- -// IGXMLScanner: Private helper methods. Most of these are implemented in -// IGXMLScanner2.Cpp. -// --------------------------------------------------------------------------- - -// This method handles the common initialization, to avoid having to do -// it redundantly in multiple constructors. -void IGXMLScanner::commonInit() -{ - - // Create the element state array - fElemState = (unsigned int*) fMemoryManager->allocate - ( - fElemStateSize * sizeof(unsigned int) - ); //new unsigned int[fElemStateSize]; - fElemLoopState = (unsigned int*) fMemoryManager->allocate - ( - fElemStateSize * sizeof(unsigned int) - ); //new unsigned int[fElemStateSize]; - - // And we need one for the raw attribute scan. This just stores key/ - // value string pairs (prior to any processing.) 
- fRawAttrList = new (fMemoryManager) RefVectorOf(32, true, fMemoryManager); - fRawAttrColonList = (int*) fMemoryManager->allocate - ( - fRawAttrColonListSize * sizeof(int) - ); - - // Create the Validator and init them - fDTDValidator = new (fMemoryManager) DTDValidator(); - initValidator(fDTDValidator); - fSchemaValidator = new (fMemoryManager) SchemaValidator(0, fMemoryManager); - initValidator(fSchemaValidator); - - // Create IdentityConstraint info - fICHandler = new (fMemoryManager) IdentityConstraintHandler(this, fMemoryManager); - - // Create schemaLocation pair info - fLocationPairs = new (fMemoryManager) ValueVectorOf(8, fMemoryManager); - // create pools for undeclared elements - fDTDElemNonDeclPool = new (fMemoryManager) NameIdPool(29, 128, fMemoryManager); - fSchemaElemNonDeclPool = new (fMemoryManager) RefHash3KeysIdPool(29, true, 128, fMemoryManager); - fAttDefRegistry = new (fMemoryManager) RefHashTableOf - ( - 131, false, fMemoryManager - ); - fUndeclaredAttrRegistry = new (fMemoryManager) Hash2KeysSetOf(7, fMemoryManager); - fPSVIAttrList = new (fMemoryManager) PSVIAttributeList(fMemoryManager); - - fSchemaInfoList = new (fMemoryManager) RefHash2KeysTableOf(29, fMemoryManager); - fCachedSchemaInfoList = new (fMemoryManager) RefHash2KeysTableOf(29, fMemoryManager); - - // use fDTDValidator as the default validator - if (!fValidator) - fValidator = fDTDValidator; -} - -void IGXMLScanner::cleanUp() -{ - fMemoryManager->deallocate(fElemState); //delete [] fElemState; - fMemoryManager->deallocate(fElemLoopState); //delete [] fElemLoopState; - delete fRawAttrList; - fMemoryManager->deallocate(fRawAttrColonList); - delete fDTDValidator; - delete fSchemaValidator; - delete fICHandler; - delete fLocationPairs; - delete fDTDElemNonDeclPool; - delete fSchemaElemNonDeclPool; - delete fAttDefRegistry; - delete fUndeclaredAttrRegistry; - delete fPSVIAttrList; - delete fPSVIElement; - delete fErrorStack; - delete fSchemaInfoList; - delete fCachedSchemaInfoList; 
-} - -// --------------------------------------------------------------------------- -// IGXMLScanner: Private scanning methods -// --------------------------------------------------------------------------- - -// This method is called from scanStartTag() to handle the very raw initial -// scan of the attributes. It just fills in the passed collection with -// key/value pairs for each attribute. No processing is done on them at all. -XMLSize_t -IGXMLScanner::rawAttrScan(const XMLCh* const elemName - , RefVectorOf& toFill - , bool& isEmpty) -{ - // Keep up with how many attributes we've seen so far, and how many - // elements are available in the vector. This way we can reuse old - // elements until we run out and then expand it. - XMLSize_t attCount = 0; - XMLSize_t curVecSize = toFill.size(); - - // Assume it is not empty - isEmpty = false; - - // We loop until we either see a /> or >, handling key/value pairs util - // we get there. We place them in the passed vector, which we will expand - // as required to hold them. - while (true) - { - // Get the next character, which should be non-space - XMLCh nextCh = fReaderMgr.peekNextChar(); - - // If the next character is not a slash or closed angle bracket, - // then it must be whitespace, since whitespace is required - // between the end of the last attribute and the name of the next - // one. - // - if (attCount) - { - if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) - { - bool bFoundSpace; - fReaderMgr.skipPastSpaces(bFoundSpace); - if (!bFoundSpace) - { - // Emit the error but keep on going - emitError(XMLErrs::ExpectedWhitespace); - } - // Ok, peek another char - nextCh = fReaderMgr.peekNextChar(); - } - } - - // Ok, here we first check for any of the special case characters. - // If its not one, then we do the normal case processing, which - // assumes that we've hit an attribute value, Otherwise, we do all - // the special case checks. 
- if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) - { - // Assume it's going to be an attribute, so get a name from - // the input. - int colonPosition; - if (!fReaderMgr.getQName(fAttNameBuf, &colonPosition)) - { - if (fAttNameBuf.isEmpty()) - emitError(XMLErrs::ExpectedAttrName); - else - emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer()); - fReaderMgr.skipPastChar(chCloseAngle); - return attCount; - } - - const XMLCh* curAttNameBuf = fAttNameBuf.getRawBuffer(); - - // And next must be an equal sign - if (!scanEq()) - { - static const XMLCh tmpList[] = - { - chSingleQuote, chDoubleQuote, chCloseAngle - , chOpenAngle, chForwardSlash, chNull - }; - - emitError(XMLErrs::ExpectedEqSign); - - // Try to sync back up by skipping forward until we either - // hit something meaningful. - const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); - - if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) - { - // Jump back to top for normal processing of these - continue; - } - else if ((chFound == chSingleQuote) - || (chFound == chDoubleQuote) - || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) - { - // Just fall through assuming that the value is to follow - } - else if (chFound == chOpenAngle) - { - // Assume a malformed tag and that new one is starting - emitError(XMLErrs::UnterminatedStartTag, elemName); - return attCount; - } - else - { - // Something went really wrong - return attCount; - } - } - - // Next should be the quoted attribute value. We just do a simple - // and stupid scan of this value. The only thing we do here - // is to expand entity references. - if (!basicAttrValueScan(curAttNameBuf, fAttValueBuf)) - { - static const XMLCh tmpList[] = - { - chCloseAngle, chOpenAngle, chForwardSlash, chNull - }; - - emitError(XMLErrs::ExpectedAttrValue); - - // It failed, so lets try to get synced back up. We skip - // forward until we find some whitespace or one of the - // chars in our list. 
- const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); - - if ((chFound == chCloseAngle) - || (chFound == chForwardSlash) - || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) - { - // Just fall through and process this attribute, though - // the value will be "". - } - else if (chFound == chOpenAngle) - { - // Assume a malformed tag and that new one is starting - emitError(XMLErrs::UnterminatedStartTag, elemName); - return attCount; - } - else - { - // Something went really wrong - return attCount; - } - } - - // And now lets add it to the passed collection. If we have not - // filled it up yet, then we use the next element. Else we add - // a new one. - KVStringPair* curPair = 0; - if (attCount >= curVecSize) - { - curPair = new (fMemoryManager) KVStringPair - ( - curAttNameBuf - , fAttNameBuf.getLen() - , fAttValueBuf.getRawBuffer() - , fAttValueBuf.getLen() - , fMemoryManager - ); - toFill.addElement(curPair); - } - else - { - curPair = toFill.elementAt(attCount); - curPair->set - ( - curAttNameBuf, - fAttNameBuf.getLen(), - fAttValueBuf.getRawBuffer(), - fAttValueBuf.getLen() - ); - } - - if (attCount >= fRawAttrColonListSize) { - resizeRawAttrColonList(); - } - // Set the position of the colon and bump the count of attributes we've gotten - fRawAttrColonList[attCount++] = colonPosition; - - // And go to the top again for another attribute - continue; - } - - // It was some special case character so do all of the checks and - // deal with it. 
- if (!nextCh) - ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); - - if (nextCh == chForwardSlash) - { - fReaderMgr.getNextChar(); - isEmpty = true; - if (!fReaderMgr.skippedChar(chCloseAngle)) - emitError(XMLErrs::UnterminatedStartTag, elemName); - break; - } - else if (nextCh == chCloseAngle) - { - fReaderMgr.getNextChar(); - break; - } - else if (nextCh == chOpenAngle) - { - // Check for this one specially, since its going to be common - // and it is kind of auto-recovering since we've already hit the - // next open bracket, which is what we would have seeked to (and - // skipped this whole tag.) - emitError(XMLErrs::UnterminatedStartTag, elemName); - break; - } - else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) - { - // Check for this one specially, which is probably a missing - // attribute name, e.g. ="value". Just issue expected name - // error and eat the quoted string, then jump back to the - // top again. - emitError(XMLErrs::ExpectedAttrName); - fReaderMgr.getNextChar(); - fReaderMgr.skipQuotedString(nextCh); - fReaderMgr.skipPastSpaces(); - continue; - } - } - - return attCount; -} - - -// This method will kick off the scanning of the primary content of the -// document, i.e. the elements. -bool IGXMLScanner::scanContent() -{ - // Go into a loop until we hit the end of the root element, or we fall - // out because there is no root element. - // - // We have to do kind of a deeply nested double loop here in order to - // avoid doing the setup/teardown of the exception handler on each - // round. Doing it this way we only do it when an exception actually - // occurs. - bool gotData = true; - bool inMarkup = false; - while (gotData) - { - try - { - while (gotData) - { - // Sense what the next top level token is. According to what - // this tells us, we will call something to handle that kind - // of thing. 
- XMLSize_t orgReader; - const XMLTokens curToken = senseNextToken(orgReader); - - // Handle character data and end of file specially. Char data - // is not markup so we don't want to handle it in the loop - // below. - if (curToken == Token_CharData) - { - // Scan the character data and call appropriate events. Let - // him use our local character data buffer for efficiency. - scanCharData(fCDataBuf); - continue; - } - else if (curToken == Token_EOF) - { - // The element stack better be empty at this point or we - // ended prematurely before all elements were closed. - if (!fElemStack.isEmpty()) - { - const ElemStack::StackElem* topElem = fElemStack.popTop(); - emitError - ( - XMLErrs::EndedWithTagsOnStack - , topElem->fThisElement->getFullName() - ); - } - - // Its the end of file, so clear the got data flag - gotData = false; - continue; - } - - // We are in some sort of markup now - inMarkup = true; - - // According to the token we got, call the appropriate - // scanning method. - switch(curToken) - { - case Token_CData : - // Make sure we are within content - if (fElemStack.isEmpty()) - emitError(XMLErrs::CDATAOutsideOfContent); - scanCDSection(); - break; - - case Token_Comment : - scanComment(); - break; - - case Token_EndTag : - scanEndTag(gotData); - break; - - case Token_PI : - scanPI(); - break; - - case Token_StartTag : - if (fDoNamespaces) - scanStartTagNS(gotData); - else - scanStartTag(gotData); - break; - - default : - fReaderMgr.skipToChar(chOpenAngle); - break; - } - - if (orgReader != fReaderMgr.getCurrentReaderNum()) - emitError(XMLErrs::PartialMarkupInEntity); - - // And we are back out of markup again - inMarkup = false; - } - } - catch(const EndOfEntityException& toCatch) - { - // If we were in some markup when this happened, then its a - // partial markup error. 
- if (inMarkup) - emitError(XMLErrs::PartialMarkupInEntity); - - // Send an end of entity reference event - if (fDocHandler) - fDocHandler->endEntityReference(toCatch.getEntity()); - - inMarkup = false; - } - } - - // It went ok, so return success - return true; -} - - -void IGXMLScanner::scanEndTag(bool& gotData) -{ - // Assume we will still have data until proven otherwise. It will only - // ever be false if this is the end of the root element. - gotData = true; - - // Check if the element stack is empty. If so, then this is an unbalanced - // element (i.e. more ends than starts, perhaps because of bad text - // causing one to be skipped.) - if (fElemStack.isEmpty()) - { - emitError(XMLErrs::MoreEndThanStartTags); - fReaderMgr.skipPastChar(chCloseAngle); - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager); - } - - // Pop the stack of the element we are supposed to be ending. Remember - // that we don't own this. The stack just keeps them and reuses them. - unsigned int uriId = (fDoNamespaces) - ? 
fElemStack.getCurrentURI() : fEmptyNamespaceId; - - // these get initialized below - const ElemStack::StackElem* topElem = 0; - const XMLCh *elemName = 0; - - // Make sure that its the end of the element that we expect - // special case for schema validation, whose element decls, - // obviously don't contain prefix information - if(fGrammarType == Grammar::SchemaGrammarType) - { - elemName = fElemStack.getCurrentSchemaElemName(); - topElem = fElemStack.topElement(); - } - else - { - topElem = fElemStack.topElement(); - elemName = topElem->fThisElement->getFullName(); - } - if (!fReaderMgr.skippedStringLong(elemName)) - { - emitError - ( - XMLErrs::ExpectedEndOfTagX - , elemName - ); - fReaderMgr.skipPastChar(chCloseAngle); - fElemStack.popTop(); - return; - } - - // Make sure we are back on the same reader as where we started - if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum()) - emitError(XMLErrs::PartialTagMarkupError); - - // Skip optional whitespace - fReaderMgr.skipPastSpaces(); - - // Make sure we find the closing bracket - if (!fReaderMgr.skippedChar(chCloseAngle)) - { - emitError - ( - XMLErrs::UnterminatedEndTag - , topElem->fThisElement->getFullName() - ); - } - - if (fGrammarType == Grammar::SchemaGrammarType) - { - // reset error occurred - fPSVIElemContext.fErrorOccurred = fErrorStack->pop(); - if (fValidate && topElem->fThisElement->isDeclared()) - { - fPSVIElemContext.fCurrentTypeInfo = ((SchemaValidator*) fValidator)->getCurrentTypeInfo(); - if(!fPSVIElemContext.fCurrentTypeInfo) - fPSVIElemContext.fCurrentDV = ((SchemaValidator*) fValidator)->getCurrentDatatypeValidator(); - else - fPSVIElemContext.fCurrentDV = 0; - if(fPSVIHandler) - { - fPSVIElemContext.fNormalizedValue = ((SchemaValidator*) fValidator)->getNormalizedValue(); - - if (XMLString::equals(fPSVIElemContext.fNormalizedValue, XMLUni::fgZeroLenString)) - fPSVIElemContext.fNormalizedValue = 0; - } - } - else - { - fPSVIElemContext.fCurrentDV = 0; - 
fPSVIElemContext.fCurrentTypeInfo = 0; - fPSVIElemContext.fNormalizedValue = 0; - } - } - - // If validation is enabled, then lets pass him the list of children and - // this element and let him validate it. - DatatypeValidator* psviMemberType = 0; - if (fValidate) - { - - // - // XML1.0-3rd - // Validity Constraint: - // The declaration matches EMPTY and the element has no content (not even - // entity references, comments, PIs or white space). - // - if ( (fGrammarType == Grammar::DTDGrammarType) && - (topElem->fCommentOrPISeen) && - (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Empty)) - { - fValidator->emitError - ( - XMLValid::EmptyElemHasContent - , topElem->fThisElement->getFullName() - ); - } - - // - // XML1.0-3rd - // Validity Constraint: - // - // The declaration matches children and the sequence of child elements - // belongs to the language generated by the regular expression in the - // content model, with optional white space, comments and PIs - // (i.e. markup matching production [27] Misc) between the start-tag and - // the first child element, between child elements, or between the last - // child element and the end-tag. - // - // Note that - // a CDATA section containing only white space or - // a reference to an entity whose replacement text is character references - // expanding to white space do not match the nonterminal S, and hence - // cannot appear in these positions; however, - // a reference to an internal entity with a literal value consisting - // of character references expanding to white space does match S, - // since its replacement text is the white space resulting from expansion - // of the character references. 
- // - if ( (fGrammarType == Grammar::DTDGrammarType) && - (topElem->fReferenceEscaped) && - (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Children)) - { - fValidator->emitError - ( - XMLValid::ElemChildrenHasInvalidWS - , topElem->fThisElement->getFullName() - ); - } - XMLSize_t failure; - bool res = fValidator->checkContent - ( - topElem->fThisElement - , topElem->fChildren - , topElem->fChildCount - , &failure - ); - - if (!res) - { - // One of the elements is not valid for the content. NOTE that - // if no children were provided but the content model requires - // them, it comes back with a zero value. But we cannot use that - // to index the child array in this case, and have to put out a - // special message. - if (!topElem->fChildCount) - { - fValidator->emitError - ( - XMLValid::EmptyNotValidForContent - , topElem->fThisElement->getFormattedContentModel() - ); - } - else if (failure >= topElem->fChildCount) - { - fValidator->emitError - ( - XMLValid::NotEnoughElemsForCM - , topElem->fThisElement->getFormattedContentModel() - ); - } - else - { - fValidator->emitError - ( - XMLValid::ElementNotValidForContent - , topElem->fChildren[failure]->getRawName() - , topElem->fThisElement->getFormattedContentModel() - ); - } - } - - - if (fGrammarType == Grammar::SchemaGrammarType) { - if (((SchemaValidator*) fValidator)->getErrorOccurred()) - fPSVIElemContext.fErrorOccurred = true; - else if (fPSVIElemContext.fCurrentDV && fPSVIElemContext.fCurrentDV->getType() == DatatypeValidator::Union) - psviMemberType = fValidationContext->getValidatingMemberType(); - - if (fPSVIHandler) - { - fPSVIElemContext.fIsSpecified = ((SchemaValidator*) fValidator)->getIsElemSpecified(); - if(fPSVIElemContext.fIsSpecified) - fPSVIElemContext.fNormalizedValue = ((SchemaElementDecl *)topElem->fThisElement)->getDefaultValue(); - } - - // call matchers and de-activate context - if (toCheckIdentityConstraint()) - { - fICHandler->deactivateContext - ( - 
(SchemaElementDecl *) topElem->fThisElement - , fContent.getRawBuffer() - , fValidationContext - , fPSVIElemContext.fCurrentDV - ); - } - - } - } - - // QName dv needed topElem to resolve URIs on the checkContent - fElemStack.popTop(); - - // See if it was the root element, to avoid multiple calls below - const bool isRoot = fElemStack.isEmpty(); - - if (fGrammarType == Grammar::SchemaGrammarType) - { - if (fPSVIHandler) - { - endElementPSVI( - (SchemaElementDecl*)topElem->fThisElement, psviMemberType); - } - // now we can reset the datatype buffer, since the - // application has had a chance to copy the characters somewhere else - ((SchemaValidator *)fValidator)->clearDatatypeBuffer(); - } - - // If we have a doc handler, tell it about the end tag - if (fDocHandler) - { - if (fGrammarType == Grammar::SchemaGrammarType) { - if (topElem->fPrefixColonPos != -1) - fPrefixBuf.set(elemName, topElem->fPrefixColonPos); - else - fPrefixBuf.reset(); - } - else { - fPrefixBuf.set(topElem->fThisElement->getElementName()->getPrefix()); - } - fDocHandler->endElement - ( - *topElem->fThisElement - , uriId - , isRoot - , fPrefixBuf.getRawBuffer() - ); - } - - if (fGrammarType == Grammar::SchemaGrammarType) { - if (!isRoot) - { - // update error information - fErrorStack->push((fErrorStack->size() && fErrorStack->pop()) || fPSVIElemContext.fErrorOccurred); - - - } - } - - // If this was the root, then done with content - gotData = !isRoot; - - if (gotData) { - if (fDoNamespaces) { - // Restore the grammar - fGrammar = fElemStack.getCurrentGrammar(); - fGrammarType = fGrammar->getGrammarType(); - if (fGrammarType == Grammar::SchemaGrammarType && !fValidator->handlesSchema()) { - if (fValidatorFromUser) - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoSchemaValidator, fMemoryManager); - else { - fValidator = fSchemaValidator; - } - } - else if (fGrammarType == Grammar::DTDGrammarType && !fValidator->handlesDTD()) { - if (fValidatorFromUser) - 
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); - else { - fValidator = fDTDValidator; - } - } - - fValidator->setGrammar(fGrammar); - } - - // Restore the validation flag - fValidate = fElemStack.getValidationFlag(); - } -} - - -// This method handles the high level logic of scanning the DOCType -// declaration. This calls the DTDScanner and kicks off both the scanning of -// the internal subset and the scanning of the external subset, if any. -// -// When we get here the 'resetDocType(); - - // There must be some space after DOCTYPE - bool skippedSomething; - fReaderMgr.skipPastSpaces(skippedSomething); - if (!skippedSomething) - { - emitError(XMLErrs::ExpectedWhitespace); - - // Just skip the Doctype declaration and return - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Get a buffer for the root element - XMLBufBid bbRootName(&fBufMgr); - - // Get a name from the input, which should be the name of the root - // element of the upcoming content. - int colonPosition; - bool validName = fDoNamespaces ? fReaderMgr.getQName(bbRootName.getBuffer(), &colonPosition) : - fReaderMgr.getName(bbRootName.getBuffer()); - if (!validName) - { - if (bbRootName.isEmpty()) - emitError(XMLErrs::NoRootElemInDOCTYPE); - else - emitError(XMLErrs::InvalidRootElemInDOCTYPE, bbRootName.getRawBuffer()); - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Store the root element name for later check - setRootElemName(bbRootName.getRawBuffer()); - - // This element obviously is not going to exist in the element decl - // pool yet, but we need to call docTypeDecl. So force it into - // the element decl pool, marked as being there because it was in - // the DOCTYPE. Later, when its declared, the status will be updated. - // - // Only do this if we are not reusing the validator! If we are reusing, - // then look it up instead. It has to exist! - MemoryManager* const rootDeclMgr = - fUseCachedGrammar ? 
fMemoryManager : fGrammarPoolMemoryManager; - - DTDElementDecl* rootDecl = new (rootDeclMgr) DTDElementDecl - ( - bbRootName.getRawBuffer() - , fEmptyNamespaceId - , DTDElementDecl::Any - , rootDeclMgr - ); - - Janitor rootDeclJanitor(rootDecl); - rootDecl->setCreateReason(DTDElementDecl::AsRootElem); - rootDecl->setExternalElemDeclaration(true); - if(!fUseCachedGrammar) - { - fGrammar->putElemDecl(rootDecl); - rootDeclJanitor.release(); - } else - { - // attach this to the undeclared element pool so that it gets deleted - XMLElementDecl* elemDecl = fDTDElemNonDeclPool->getByKey(bbRootName.getRawBuffer()); - if (elemDecl) - { - rootDecl->setId(elemDecl->getId()); - } - else - { - rootDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)rootDecl)); - rootDeclJanitor.release(); - } - } - - // Skip any spaces after the name - fReaderMgr.skipPastSpaces(); - - // And now if we are looking at a >, then we are done. It is not - // required to have an internal or external subset, though why you - // would not escapes me. - if (fReaderMgr.skippedChar(chCloseAngle)) { - - // If we have a doc type handler and advanced callbacks are enabled, - // call the doctype event. - if (fDocTypeHandler) - fDocTypeHandler->doctypeDecl(*rootDecl, 0, 0, false); - return; - } - - // either internal/external subset - if (fValScheme == Val_Auto && !fValidate) - fValidate = true; - - bool hasIntSubset = false; - bool hasExtSubset = false; - XMLCh* sysId = 0; - XMLCh* pubId = 0; - - DTDScanner dtdScanner - ( - (DTDGrammar*) fGrammar - , fDocTypeHandler - , fGrammarPoolMemoryManager - , fMemoryManager - ); - dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); - - // If the next character is '[' then we have no external subset cause - // there is no system id, just the opening character of the internal - // subset. Else, has to be an id. - // - // Just look at the next char, don't eat it. 
- if (fReaderMgr.peekNextChar() == chOpenSquare) - { - hasIntSubset = true; - } - else - { - // Indicate we have an external subset - hasExtSubset = true; - fHasNoDTD = false; - - // Get buffers for the ids - XMLBufBid bbPubId(&fBufMgr); - XMLBufBid bbSysId(&fBufMgr); - - // Get the external subset id - if (!dtdScanner.scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), DTDScanner::IDType_External)) - { - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Get copies of the ids we got - pubId = XMLString::replicate(bbPubId.getRawBuffer(), fMemoryManager); - sysId = XMLString::replicate(bbSysId.getRawBuffer(), fMemoryManager); - } - - // Insure that the ids get cleaned up, if they got allocated - ArrayJanitor janSysId(sysId, fMemoryManager); - ArrayJanitor janPubId(pubId, fMemoryManager); - - if (hasExtSubset) - { - // Skip spaces and check again for the opening of an internal subset - fReaderMgr.skipPastSpaces(); - - // Just look at the next char, don't eat it. - if (fReaderMgr.peekNextChar() == chOpenSquare) { - hasIntSubset = true; - } - } - - // If we have a doc type handler and advanced callbacks are enabled, - // call the doctype event. - if (fDocTypeHandler) - fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset, hasExtSubset); - - // Ok, if we had an internal subset, we are just past the [ character - // and need to parse that first. - if (hasIntSubset) - { - // Eat the opening square bracket - fReaderMgr.getNextChar(); - - checkInternalDTD(hasExtSubset, sysId, pubId); - - // And try to scan the internal subset. If we fail, try to recover - // by skipping forward tot he close angle and returning. - if (!dtdScanner.scanInternalSubset()) - { - fReaderMgr.skipPastChar(chCloseAngle); - return; - } - - // Do a sanity check that some expanded PE did not propogate out of - // the doctype. This could happen if it was terminated early by bad - // syntax. 
- if (fReaderMgr.getReaderDepth() > 1) - { - emitError(XMLErrs::PEPropogated); - - // Ask the reader manager to pop back down to the main level - fReaderMgr.cleanStackBackTo(1); - } - - fReaderMgr.skipPastSpaces(); - } - - // And that should leave us at the closing > of the DOCTYPE line - if (!fReaderMgr.skippedChar(chCloseAngle)) - { - // Do a special check for the common scenario of an extra ] char at - // the end. This is easy to recover from. - if (fReaderMgr.skippedChar(chCloseSquare) - && fReaderMgr.skippedChar(chCloseAngle)) - { - emitError(XMLErrs::ExtraCloseSquare); - } - else - { - emitError(XMLErrs::UnterminatedDOCTYPE); - fReaderMgr.skipPastChar(chCloseAngle); - } - } - - // If we had an external subset, then we need to deal with that one - // next. If we are reusing the validator, then don't scan it. - if (hasExtSubset) { - - InputSource* srcUsed=0; - Janitor janSrc(srcUsed); - // If we had an internal subset and we're using the cached grammar, it - // means that the ignoreCachedDTD is set, so we ignore the cached - // grammar - if (fUseCachedGrammar && !hasIntSubset) - { - srcUsed = resolveSystemId(sysId, pubId); - if (srcUsed) { - janSrc.reset(srcUsed); - Grammar* grammar = fGrammarResolver->getGrammar(srcUsed->getSystemId()); - - if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) { - - fDTDGrammar = (DTDGrammar*) grammar; - fGrammar = fDTDGrammar; - fValidator->setGrammar(fGrammar); - // If we don't report at least the external subset boundaries, - // an advanced document handler cannot know when the DTD end, - // since we've already sent a doctype decl that indicates there's - // there's an external subset. 
- if (fDocTypeHandler) - { - fDocTypeHandler->startExtSubset(); - fDocTypeHandler->endExtSubset(); - } - - return; - } - } - } - - if (fLoadExternalDTD || fValidate) - { - // And now create a reader to read this entity - XMLReader* reader; - if (srcUsed) { - reader = fReaderMgr.createReader - ( - *srcUsed - , false - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , XMLReader::Source_External - , fCalculateSrcOfs - , fLowWaterMark - ); - } - else { - reader = fReaderMgr.createReader - ( - sysId - , pubId - , false - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , XMLReader::Source_External - , srcUsed - , fCalculateSrcOfs - , fLowWaterMark - , fDisableDefaultEntityResolution - ); - janSrc.reset(srcUsed); - } - // If it failed then throw an exception - if (!reader) - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenDTD, srcUsed ? srcUsed->getSystemId() : sysId, fMemoryManager); - - if (fToCacheGrammar) { - - unsigned int stringId = fGrammarResolver->getStringPool()->addOrFind(srcUsed->getSystemId()); - const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(stringId); - - fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); - ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); - fGrammarResolver->putGrammar(fGrammar); - } - - // In order to make the processing work consistently, we have to - // make this look like an external entity. So create an entity - // decl and fill it in and push it with the reader, as happens - // with an external entity. Put a janitor on it to insure it gets - // cleaned up. The reader manager does not adopt them. 
- const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; - DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); - declDTD->setSystemId(sysId); - declDTD->setIsExternal(true); - - // Mark this one as a throw at end - reader->setThrowAtEnd(true); - - // And push it onto the stack, with its pseudo name - fReaderMgr.pushReader(reader, declDTD); - - // Tell it its not in an include section - dtdScanner.scanExtSubsetDecl(false, true); - } - } -} - -bool IGXMLScanner::scanStartTag(bool& gotData) -{ - // Assume we will still have data until proven otherwise. It will only - // ever be false if this is the root and its empty. - gotData = true; - - // Get the QName. In this case, we are not doing namespaces, so we just - // use it as is and don't have to break it into parts. - if (!fReaderMgr.getName(fQNameBuf)) - { - emitError(XMLErrs::ExpectedElementName); - fReaderMgr.skipToChar(chOpenAngle); - return false; - } - - // Assume it won't be an empty tag - bool isEmpty = false; - - // Lets try to look up the element in the validator's element decl pool - // We can pass bogus values for the URI id and the base name. We know that - // this can only be called if we are doing a DTD style validator and that - // he will only look at the QName. - // - // We tell him to fault in a decl if he does not find one. - // Actually, we *don't* tell him to fault in a decl if he does not find one- NG - bool wasAdded = false; - const XMLCh *rawQName = fQNameBuf.getRawBuffer(); - XMLElementDecl* elemDecl = fGrammar->getElemDecl - ( - fEmptyNamespaceId - , 0 - , rawQName - , Grammar::TOP_LEVEL_SCOPE - ); - // look for it in the undeclared pool: - if(!elemDecl) - { - elemDecl = fDTDElemNonDeclPool->getByKey(rawQName); - } - if(!elemDecl) - { - // we're assuming this must be a DTD element. DTD's can be - // used with or without namespaces, but schemas cannot be used without - // namespaces. 
- wasAdded = true; - elemDecl = new (fMemoryManager) DTDElementDecl - ( - rawQName - , fEmptyNamespaceId - , DTDElementDecl::Any - , fMemoryManager - ); - elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl)); - } - - // We do something different here according to whether we found the - // element or not. - if (wasAdded) - { - // If validating then emit an error - if (fValidate) - { - // This is to tell the reuse Validator that this element was - // faulted-in, was not an element in the validator pool originally - elemDecl->setCreateReason(XMLElementDecl::JustFaultIn); - - fValidator->emitError - ( - XMLValid::ElementNotDefined - , elemDecl->getFullName() - ); - } - } - else - { - // If its not marked declared and validating, then emit an error - if (fValidate && !elemDecl->isDeclared()) - { - fValidator->emitError - ( - XMLValid::ElementNotDefined - , elemDecl->getFullName() - ); - } - } - - // See if its the root element - const bool isRoot = fElemStack.isEmpty(); - - // Expand the element stack and add the new element - fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum()); - fElemStack.setValidationFlag(fValidate); - - // Validate the element - if (fValidate) - fValidator->validateElement(elemDecl); - - // If this is the first element and we are validating, check the root - // element. - if (isRoot) - { - fRootGrammar = fGrammar; - - if (fValidate) - { - // If a DocType exists, then check if it matches the root name there. - if (fRootElemName && !XMLString::equals(fQNameBuf.getRawBuffer(), fRootElemName)) - fValidator->emitError(XMLValid::RootElemNotLikeDocType); - } - } - else - { - // If the element stack is not empty, then add this element as a - // child of the previous top element. If its empty, this is the root - // elem and is not the child of anything. 
- fElemStack.addChild(elemDecl->getElementName(), true); - } - - // Skip any whitespace after the name - fReaderMgr.skipPastSpaces(); - - // We loop until we either see a /> or >, handling attribute/value - // pairs until we get there. - XMLSize_t attCount = 0; - XMLSize_t curAttListSize = fAttrList->size(); - wasAdded = false; - - fElemCount++; - - while (true) - { - // And get the next non-space character - XMLCh nextCh = fReaderMgr.peekNextChar(); - - // If the next character is not a slash or closed angle bracket, - // then it must be whitespace, since whitespace is required - // between the end of the last attribute and the name of the next - // one. - if (attCount) - { - if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) - { - bool bFoundSpace; - fReaderMgr.skipPastSpaces(bFoundSpace); - if (!bFoundSpace) - { - // Emit the error but keep on going - emitError(XMLErrs::ExpectedWhitespace); - } - // Ok, peek another char - nextCh = fReaderMgr.peekNextChar(); - } - } - - // Ok, here we first check for any of the special case characters. - // If its not one, then we do the normal case processing, which - // assumes that we've hit an attribute value, Otherwise, we do all - // the special case checks. - if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) - { - // Assume its going to be an attribute, so get a name from - // the input. - if (!fReaderMgr.getName(fAttNameBuf)) - { - emitError(XMLErrs::ExpectedAttrName); - fReaderMgr.skipPastChar(chCloseAngle); - return false; - } - - // And next must be an equal sign - if (!scanEq()) - { - static const XMLCh tmpList[] = - { - chSingleQuote, chDoubleQuote, chCloseAngle - , chOpenAngle, chForwardSlash, chNull - }; - - emitError(XMLErrs::ExpectedEqSign); - - // Try to sync back up by skipping forward until we either - // hit something meaningful. 
- const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); - - if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) - { - // Jump back to top for normal processing of these - continue; - } - else if ((chFound == chSingleQuote) - || (chFound == chDoubleQuote) - || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) - { - // Just fall through assuming that the value is to follow - } - else if (chFound == chOpenAngle) - { - // Assume a malformed tag and that new one is starting - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - return false; - } - else - { - // Something went really wrong - return false; - } - } - // See if this attribute is declared for this element. If we are - // not validating of course it will not be at first, but we will - // fault it into the pool (to avoid lots of redundant errors.) - XMLCh * namePtr = fAttNameBuf.getRawBuffer(); - XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr); - - // Add this attribute to the attribute list that we use to - // pass them to the handler. We reuse its existing elements - // but expand it as required. - // Note that we want to this first since this will - // make a copy of the namePtr; we can then make use of - // that copy in the hashtable lookup that checks - // for duplicates. This will mean we may have to update - // the type of the XMLAttr later. 
- XMLAttr* curAtt; - if (attCount >= curAttListSize) - { - curAtt = new (fMemoryManager) XMLAttr - ( - 0 - , namePtr - , XMLUni::fgZeroLenString - , XMLUni::fgZeroLenString - , (attDef)?attDef->getType():XMLAttDef::CData - , true - , fMemoryManager - ); - fAttrList->addElement(curAtt); - } - else - { - curAtt = fAttrList->elementAt(attCount); - curAtt->set - ( - 0 - , namePtr - , XMLUni::fgZeroLenString - , XMLUni::fgZeroLenString - , (attDef)?attDef->getType():XMLAttDef::CData - ); - curAtt->setSpecified(true); - } - // reset namePtr so it refers to newly-allocated memory - namePtr = (XMLCh *)curAtt->getName(); - - if (!attDef) - { - // If there is a validation handler, then we are validating - // so emit an error. - if (fValidate) - { - fValidator->emitError - ( - XMLValid::AttNotDefinedForElement - , fAttNameBuf.getRawBuffer() - , elemDecl->getFullName() - ); - } - if(!fUndeclaredAttrRegistry->putIfNotPresent(namePtr, 0)) - { - emitError - ( - XMLErrs::AttrAlreadyUsedInSTag - , namePtr - , elemDecl->getFullName() - ); - } - } - else - { - // prepare for duplicate detection - unsigned int *curCountPtr = fAttDefRegistry->get(attDef); - if(!curCountPtr) - { - curCountPtr = getNewUIntPtr(); - *curCountPtr = fElemCount; - fAttDefRegistry->put(attDef, curCountPtr); - } - else if(*curCountPtr < fElemCount) - *curCountPtr = fElemCount; - else - { - emitError - ( - XMLErrs::AttrAlreadyUsedInSTag - , attDef->getFullName() - , elemDecl->getFullName() - ); - } - } - - // Skip any whitespace before the value and then scan the att - // value. This will come back normalized with entity refs and - // char refs expanded. - fReaderMgr.skipPastSpaces(); - if (!scanAttValue(attDef, namePtr, fAttValueBuf)) - { - static const XMLCh tmpList[] = - { - chCloseAngle, chOpenAngle, chForwardSlash, chNull - }; - - emitError(XMLErrs::ExpectedAttrValue); - - // It failed, so lets try to get synced back up. 
We skip - // forward until we find some whitespace or one of the - // chars in our list. - const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); - - if ((chFound == chCloseAngle) - || (chFound == chForwardSlash) - || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) - { - // Just fall through and process this attribute, though - // the value will be "". - } - else if (chFound == chOpenAngle) - { - // Assume a malformed tag and that new one is starting - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - return false; - } - else - { - // Something went really wrong - return false; - } - } - // must set the newly-minted value on the XMLAttr: - curAtt->setValue(fAttValueBuf.getRawBuffer()); - - // Now that its all stretched out, lets look at its type and - // determine if it has a valid value. It will output any needed - // errors, but we just keep going. We only need to do this if - // we are validating. - if (attDef) - { - // Let the validator pass judgement on the attribute value - if (fValidate) - { - fValidator->validateAttrValue - ( - attDef - , fAttValueBuf.getRawBuffer() - , false - , elemDecl - ); - } - } - - attCount++; - // And jump back to the top of the loop - continue; - } - - // It was some special case character so do all of the checks and - // deal with it. - if (!nextCh) - ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); - - if (nextCh == chForwardSlash) - { - fReaderMgr.getNextChar(); - isEmpty = true; - if (!fReaderMgr.skippedChar(chCloseAngle)) - emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - break; - } - else if (nextCh == chCloseAngle) - { - fReaderMgr.getNextChar(); - break; - } - else if (nextCh == chOpenAngle) - { - // Check for this one specially, since its going to be common - // and it is kind of auto-recovering since we've already hit the - // next open bracket, which is what we would have seeked to (and - // skipped this whole tag.) 
- emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName()); - break; - } - else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) - { - // Check for this one specially, which is probably a missing - // attribute name, e.g. ="value". Just issue expected name - // error and eat the quoted string, then jump back to the - // top again. - emitError(XMLErrs::ExpectedAttrName); - fReaderMgr.getNextChar(); - fReaderMgr.skipQuotedString(nextCh); - fReaderMgr.skipPastSpaces(); - continue; - } - } - - if(attCount) - { - // clean up after ourselves: - // clear the map used to detect duplicate attributes - fUndeclaredAttrRegistry->removeAll(); - } - - // Ok, so lets get an enumerator for the attributes of this element - // and run through them for well formedness and validity checks. But - // make sure that we had any attributes before we do it, since the list - // would have have gotten faulted in anyway. - if (elemDecl->hasAttDefs()) - { - // N.B.: this assumes DTD validation. - XMLAttDefList& attDefList = elemDecl->getAttDefList(); - for(XMLSize_t i=0; iget(&curDef); - if (!attCountPtr || *attCountPtr < fElemCount) - { // did not occur - if (fValidate) - { - // If we are validating and its required, then an error - if (defType == XMLAttDef::Required) - { - fValidator->emitError - ( - XMLValid::RequiredAttrNotProvided - , curDef.getFullName() - ); - } - else if ((defType == XMLAttDef::Default) || - (defType == XMLAttDef::Fixed) ) - { - if (fStandalone && curDef.isExternal()) - { - // XML 1.0 Section 2.9 - // Document is standalone, so attributes must not be defaulted. 
- fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName()); - - } - } - } - - // Fault in the value if needed, and bump the att count - if ((defType == XMLAttDef::Default) - || (defType == XMLAttDef::Fixed)) - { - // Let the validator pass judgement on the attribute value - if (fValidate) - { - fValidator->validateAttrValue - ( - &curDef - , curDef.getValue() - , false - , elemDecl - ); - } - - XMLAttr* curAtt; - if (attCount >= curAttListSize) - { - curAtt = new (fMemoryManager) XMLAttr - ( - 0 - , curDef.getFullName() - , XMLUni::fgZeroLenString - , curDef.getValue() - , curDef.getType() - , false - , fMemoryManager - ); - fAttrList->addElement(curAtt); - curAttListSize++; - } - else - { - curAtt = fAttrList->elementAt(attCount); - curAtt->set - ( - 0 - , curDef.getFullName() - , XMLUni::fgZeroLenString - , curDef.getValue() - , curDef.getType() - ); - curAtt->setSpecified(false); - } - attCount++; - } - } - } - } - - // If empty, validate content right now if we are validating and then - // pop the element stack top. Else, we have to update the current stack - // top's namespace mapping elements. - if (isEmpty) - { - // If validating, then insure that its legal to have no content - if (fValidate) - { - XMLSize_t failure; - bool res = fValidator->checkContent(elemDecl, 0, 0, &failure); - if (!res) - { - fValidator->emitError - ( - XMLValid::ElementNotValidForContent - , elemDecl->getFullName() - , elemDecl->getFormattedContentModel() - ); - } - } - - // Pop the element stack back off since it'll never be used now - fElemStack.popTop(); - - // If the elem stack is empty, then it was an empty root - if (isRoot) - gotData = false; - else { - // Restore the validation flag - fValidate = fElemStack.getValidationFlag(); - } - } - - // If we have a document handler, then tell it about this start tag. We - // don't have any URI id to send along, so send fEmptyNamespaceId. 
We also do not send - // any prefix since its just one big name if we are not doing namespaces. - if (fDocHandler) - { - fDocHandler->startElement - ( - *elemDecl - , fEmptyNamespaceId - , 0 - , *fAttrList - , attCount - , isEmpty - , isRoot - ); - } - - return true; -} - - -// This method is called to scan a start tag when we are processing -// namespaces. There are two different versions of this method, one for -// namespace aware processing and one for non-namespace aware processing. -// -// This method is called after we've scanned the < of a start tag. So we -// have to get the element name, then scan the attributes, after which -// we are either going to see >, />, or attributes followed by one of those -// sequences. -bool IGXMLScanner::scanStartTagNS(bool& gotData) -{ - // Assume we will still have data until proven otherwise. It will only - // ever be false if this is the root and its empty. - gotData = true; - - // Reset element content buffer - fContent.reset(); - - // The current position is after the open bracket, so we need to read in - // in the element name. - int prefixColonPos; - if (!fReaderMgr.getQName(fQNameBuf, &prefixColonPos)) - { - if (fQNameBuf.isEmpty()) - emitError(XMLErrs::ExpectedElementName); - else - emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer()); - fReaderMgr.skipToChar(chOpenAngle); - return false; - } - - // See if its the root element - const bool isRoot = fElemStack.isEmpty(); - - // Skip any whitespace after the name - fReaderMgr.skipPastSpaces(); - - // First we have to do the rawest attribute scan. We don't do any - // normalization of them at all, since we don't know yet what type they - // might be (since we need the element decl in order to do that.) 
- bool isEmpty; - XMLSize_t attCount = rawAttrScan - ( - fQNameBuf.getRawBuffer() - , *fRawAttrList - , isEmpty - ); - - // save the contentleafname and currentscope before addlevel, for later use - ContentLeafNameTypeVector* cv = 0; - XMLContentModel* cm = 0; - unsigned int currentScope = Grammar::TOP_LEVEL_SCOPE; - bool laxThisOne = false; - - if (!isRoot && fGrammarType == Grammar::SchemaGrammarType) - { - // schema validator will have correct type if validating - SchemaElementDecl* tempElement = (SchemaElementDecl*) - fElemStack.topElement()->fThisElement; - SchemaElementDecl::ModelTypes modelType = tempElement->getModelType(); - ComplexTypeInfo *currType = 0; - - if (fValidate) - { - currType = ((SchemaValidator*)fValidator)->getCurrentTypeInfo(); - if (currType) - modelType = (SchemaElementDecl::ModelTypes)currType->getContentType(); - else // something must have gone wrong - modelType = SchemaElementDecl::Any; - } - else - { - currType = tempElement->getComplexTypeInfo(); - } - - if ((modelType == SchemaElementDecl::Mixed_Simple) - || (modelType == SchemaElementDecl::Mixed_Complex) - || (modelType == SchemaElementDecl::Children)) - { - cm = currType->getContentModel(); - cv = cm->getContentLeafNameTypeVector(); - currentScope = fElemStack.getCurrentScope(); - } - else if (modelType == SchemaElementDecl::Any) { - laxThisOne = true; - } - } - - // Now, since we might have to update the namespace map for this element, - // but we don't have the element decl yet, we just tell the element stack - // to expand up to get ready. 
- XMLSize_t elemDepth = fElemStack.addLevel(); - fElemStack.setValidationFlag(fValidate); - fElemStack.setPrefixColonPos(prefixColonPos); - - // Check if there is any external schema location specified, and if we are at root, - // go through them first before scanning those specified in the instance document - if (isRoot && fDoSchema - && (fExternalSchemaLocation || fExternalNoNamespaceSchemaLocation)) { - - if (fExternalSchemaLocation) - parseSchemaLocation(fExternalSchemaLocation, true); - if (fExternalNoNamespaceSchemaLocation) - resolveSchemaGrammar(fExternalNoNamespaceSchemaLocation, XMLUni::fgZeroLenString, true); - } - - // Make an initial pass through the list and find any xmlns attributes or - // schema attributes. - if (attCount) { - scanRawAttrListforNameSpaces(attCount); - } - - // Also find any default or fixed xmlns attributes in DTD defined for - // this element. - XMLElementDecl* elemDecl = 0; - const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer(); - - if (fGrammarType == Grammar::DTDGrammarType) { - - if (!fSkipDTDValidation) { - elemDecl = fGrammar->getElemDecl( - fEmptyNamespaceId, 0, qnameRawBuf, Grammar::TOP_LEVEL_SCOPE - ); - - if (elemDecl) { - if (elemDecl->hasAttDefs()) { - XMLAttDefList& attDefList = elemDecl->getAttDefList(); - for(XMLSize_t i=0; igetByKey(qnameRawBuf); - } - } - - // Resolve the qualified name to a URI and name so that we can look up - // the element decl for this element. We have now update the prefix to - // namespace map so we should get the correct element now. - unsigned int uriId = resolveQNameWithColon( - qnameRawBuf, fPrefixBuf, ElemStack::Mode_Element, prefixColonPos - ); - - //if schema, check if we should lax or skip the validation of this element - bool parentValidation = fValidate; - if (cv) { - QName element(fPrefixBuf.getRawBuffer(), &qnameRawBuf[prefixColonPos + 1], uriId, fMemoryManager); - // elementDepth will be > 0, as cv is only constructed if element is not - // root. 
- laxThisOne = laxElementValidation(&element, cv, cm, elemDepth - 1); - } - - // Look up the element now in the grammar. This will get us back a - // generic element decl object. We tell him to fault one in if he does - // not find it. - bool wasAdded = false; - const XMLCh* nameRawBuf = &qnameRawBuf[prefixColonPos + 1]; - - if (fDoSchema) { - - if (fGrammarType == Grammar::DTDGrammarType) { - if (!switchGrammar(getURIText(uriId))) { - fValidator->emitError( - XMLValid::GrammarNotFound, getURIText(uriId) - ); - } - } - - if (fGrammarType == Grammar::SchemaGrammarType) { - elemDecl = fGrammar->getElemDecl( - uriId, nameRawBuf, qnameRawBuf, currentScope - ); - - // if not found, then it may be a reference, try TOP_LEVEL_SCOPE - if (!elemDecl) { - bool checkTopLevel = (currentScope != Grammar::TOP_LEVEL_SCOPE); - const XMLCh* original_uriStr = fGrammar->getTargetNamespace(); - unsigned int orgGrammarUri = fURIStringPool->getId(original_uriStr); - - if (orgGrammarUri != uriId) { - if (switchGrammar(getURIText(uriId))) { - checkTopLevel = true; - } - else { - // the laxElementValidation routine (called above) will - // set fValidate to false for a "skipped" element - if (!laxThisOne && fValidate) { - fValidator->emitError( - XMLValid::GrammarNotFound, getURIText(uriId) - ); - } - checkTopLevel = false; - } - } - - if (checkTopLevel) { - elemDecl = fGrammar->getElemDecl( - uriId, nameRawBuf, qnameRawBuf, Grammar::TOP_LEVEL_SCOPE - ); - } - - if (!elemDecl && currentScope != Grammar::TOP_LEVEL_SCOPE) { - - if (orgGrammarUri == uriId) { - // still not found in specified uri - // try emptyNamespace see if element should be - // un-qualified. 
- // Use a temp variable until we decide this is the case - if (uriId != fEmptyNamespaceId) { - XMLElementDecl* tempElemDecl = fGrammar->getElemDecl( - fEmptyNamespaceId, nameRawBuf, qnameRawBuf, currentScope - ); - - if (tempElemDecl && tempElemDecl->getCreateReason() != XMLElementDecl::JustFaultIn && fValidate) { - fValidator->emitError( - XMLValid::ElementNotUnQualified, qnameRawBuf - ); - elemDecl = tempElemDecl; - } - } - } - // still Not found in specified uri - // go to original Grammar again to see if element needs - // to be fully qualified. - // Use a temp variable until we decide this is the case - else if (uriId == fEmptyNamespaceId) { - - if (switchGrammar(original_uriStr)) { - XMLElementDecl* tempElemDecl = fGrammar->getElemDecl( - orgGrammarUri, nameRawBuf, qnameRawBuf, currentScope - ); - if (tempElemDecl && tempElemDecl->getCreateReason() != XMLElementDecl::JustFaultIn && fValidate) { - fValidator->emitError( - XMLValid::ElementNotQualified, qnameRawBuf - ); - elemDecl = tempElemDecl; - } - } - else if (!laxThisOne && fValidate) { - fValidator->emitError( - XMLValid::GrammarNotFound,original_uriStr - ); - } - } - } - - if (!elemDecl) { - // still not found - // switch back to original grammar first if necessary - if (orgGrammarUri != uriId) { - switchGrammar(original_uriStr); - } - - // look in the list of undeclared elements, as would have been - // done before we made grammars stateless: - elemDecl = fSchemaElemNonDeclPool->getByKey( - nameRawBuf, uriId, (int)Grammar::TOP_LEVEL_SCOPE - ); - } - } - } - } - - if (!elemDecl) { - - if (fGrammarType == Grammar::DTDGrammarType) { - elemDecl = new (fMemoryManager) DTDElementDecl( - qnameRawBuf, uriId, DTDElementDecl::Any, fMemoryManager - ); - elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl)); - } - else if (fGrammarType == Grammar::SchemaGrammarType) { - elemDecl = new (fMemoryManager) SchemaElementDecl( - fPrefixBuf.getRawBuffer(), nameRawBuf, uriId - , SchemaElementDecl::Any, 
Grammar::TOP_LEVEL_SCOPE - , fMemoryManager - ); - elemDecl->setId( - fSchemaElemNonDeclPool->put((void*)elemDecl->getBaseName() - , uriId, (int)Grammar::TOP_LEVEL_SCOPE, (SchemaElementDecl*)elemDecl) - ); - } else { - fValidator->emitError( - XMLValid::GrammarNotFound, getURIText(uriId) - ); - } - wasAdded = true; - } - - // this info needed for DOMTypeInfo - fPSVIElemContext.fErrorOccurred = false; - - // We do something different here according to whether we found the - // element or not. - bool bXsiTypeSet= (fValidator && fGrammarType == Grammar::SchemaGrammarType)?((SchemaValidator*)fValidator)->getIsXsiTypeSet():false; - if (wasAdded) - { - if (laxThisOne && !bXsiTypeSet) { - fValidate = false; - fElemStack.setValidationFlag(fValidate); - } - else if (fValidate) - { - // If validating then emit an error - - // This is to tell the reuse Validator that this element was - // faulted-in, was not an element in the grammar pool originally - elemDecl->setCreateReason(XMLElementDecl::JustFaultIn); - - // xsi:type was specified, don't complain about missing definition - if(!bXsiTypeSet) - { - fValidator->emitError - ( - XMLValid::ElementNotDefined - , elemDecl->getFullName() - ); - - if(fGrammarType == Grammar::SchemaGrammarType) - { - fPSVIElemContext.fErrorOccurred = true; - } - } - } - } - else - { - // If its not marked declared and validating, then emit an error - if (!elemDecl->isDeclared()) { - if(elemDecl->getCreateReason() == XMLElementDecl::NoReason) { - if(!bXsiTypeSet && fGrammarType == Grammar::SchemaGrammarType) { - fPSVIElemContext.fErrorOccurred = true; - } - } - - if (laxThisOne) { - fValidate = false; - fElemStack.setValidationFlag(fValidate); - } - else if (fValidate && !bXsiTypeSet) - { - fValidator->emitError - ( - XMLValid::ElementNotDefined - , elemDecl->getFullName() - ); - } - } - } - - // Now we can update the element stack to set the current element - // decl. 
We expanded the stack above, but couldn't store the element - // decl because we didn't know it yet. - fElemStack.setElement(elemDecl, fReaderMgr.getCurrentReaderNum()); - fElemStack.setCurrentURI(uriId); - - if (isRoot) - { - fRootGrammar = fGrammar; - if (fGrammarType == Grammar::SchemaGrammarType && !fRootElemName) - fRootElemName = XMLString::replicate(qnameRawBuf, fMemoryManager); - } - - if (fGrammarType == Grammar::SchemaGrammarType && fPSVIHandler) - { - - fPSVIElemContext.fElemDepth++; - if (elemDecl->isDeclared()) - { - fPSVIElemContext.fNoneValidationDepth = fPSVIElemContext.fElemDepth; - } - else - { - fPSVIElemContext.fFullValidationDepth = fPSVIElemContext.fElemDepth; - - /****** - * While we report an error for historical reasons, this should - * actually result in lax assessment - NG. - if (isRoot && fValidate) - fPSVIElemContext.fErrorOccurred = true; - *****/ - } - } - - // Validate the element - if (fValidate) - { - fValidator->validateElement(elemDecl); - if (fValidator->handlesSchema()) - { - if (((SchemaValidator*) fValidator)->getErrorOccurred()) - fPSVIElemContext.fErrorOccurred = true; - } - } - - if (fGrammarType == Grammar::SchemaGrammarType) { - - // squirrel away the element's QName, so that we can do an efficient - // end-tag match - fElemStack.setCurrentSchemaElemName(fQNameBuf.getRawBuffer()); - - ComplexTypeInfo* typeinfo = (fValidate) - ? 
((SchemaValidator*)fValidator)->getCurrentTypeInfo() - : ((SchemaElementDecl*) elemDecl)->getComplexTypeInfo(); - - if (typeinfo) { - currentScope = typeinfo->getScopeDefined(); - - // switch grammar if the typeinfo has a different grammar (happens when there is xsi:type) - XMLCh* typeName = typeinfo->getTypeName(); - const int comma = XMLString::indexOf(typeName, chComma); - if (comma > 0) { - XMLBuffer prefixBuf(comma+1, fMemoryManager); - prefixBuf.append(typeName, comma); - const XMLCh* uriStr = prefixBuf.getRawBuffer(); - - bool errorCondition = !switchGrammar(uriStr) && fValidate; - if (errorCondition && !laxThisOne) - { - fValidator->emitError - ( - XMLValid::GrammarNotFound - , prefixBuf.getRawBuffer() - ); - } - } - else if (comma == 0) { - bool errorCondition = !switchGrammar(XMLUni::fgZeroLenString) && fValidate; - if (errorCondition && !laxThisOne) - { - fValidator->emitError - ( - XMLValid::GrammarNotFound - , XMLUni::fgZeroLenString - ); - } - } - } - fElemStack.setCurrentScope(currentScope); - - // Set element next state - if (elemDepth >= fElemStateSize) { - resizeElemState(); - } - - fElemState[elemDepth] = 0; - fElemLoopState[elemDepth] = 0; - } - - fElemStack.setCurrentGrammar(fGrammar); - - // If this is the first element and we are validating, check the root - // element. - if (isRoot) - { - if (fValidate) - { - // If a DocType exists, then check if it matches the root name there. - if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName)) - fValidator->emitError(XMLValid::RootElemNotLikeDocType); - } - } - else if (parentValidation) - { - // If the element stack is not empty, then add this element as a - // child of the previous top element. If its empty, this is the root - // elem and is not the child of anything. - fElemStack.addChild(elemDecl->getElementName(), true); - } - - // PSVI handling: even if it turns out there are - // no attributes, we need to reset this list... 
- if(getPSVIHandler() && fGrammarType == Grammar::SchemaGrammarType ) - fPSVIAttrList->reset(); - - // Now lets get the fAttrList filled in. This involves faulting in any - // defaulted and fixed attributes and normalizing the values of any that - // we got explicitly. - // - // We update the attCount value with the total number of attributes, but - // it goes in with the number of values we got during the raw scan of - // explictly provided attrs above. - attCount = buildAttList(*fRawAttrList, attCount, elemDecl, *fAttrList); - if(attCount) - { - // clean up after ourselves: - // clear the map used to detect duplicate attributes - fUndeclaredAttrRegistry->removeAll(); - } - - // activate identity constraints - if (fGrammar && - fGrammarType == Grammar::SchemaGrammarType && - toCheckIdentityConstraint()) - { - fICHandler->activateIdentityConstraint - ( - (SchemaElementDecl*) elemDecl - , (int) elemDepth - , uriId - , fPrefixBuf.getRawBuffer() - , *fAttrList - , attCount - , fValidationContext - ); - } - - // Since the element may have default values, call start tag now regardless if it is empty or not - // If we have a document handler, then tell it about this start tag - if (fDocHandler) - { - fDocHandler->startElement - ( - *elemDecl - , uriId - , fPrefixBuf.getRawBuffer() - , *fAttrList - , attCount - , false - , isRoot - ); - } - - // if we have a PSVIHandler, now's the time to call - // its handleAttributesPSVI method: - if(fPSVIHandler && fGrammarType == Grammar::SchemaGrammarType) - { - QName *eName = elemDecl->getElementName(); - fPSVIHandler->handleAttributesPSVI - ( - eName->getLocalPart() - , fURIStringPool->getValueForId(eName->getURI()) - , fPSVIAttrList - ); - } - - // If empty, validate content right now if we are validating and then - // pop the element stack top. Else, we have to update the current stack - // top's namespace mapping elements. 
- if (isEmpty) - { - // Pop the element stack back off since it'll never be used now - fElemStack.popTop(); - - // reset current type info - DatatypeValidator* psviMemberType = 0; - if (fGrammarType == Grammar::SchemaGrammarType) - { - if (fValidate && elemDecl->isDeclared()) - { - fPSVIElemContext.fCurrentTypeInfo = ((SchemaValidator*) fValidator)->getCurrentTypeInfo(); - if(!fPSVIElemContext.fCurrentTypeInfo) - fPSVIElemContext.fCurrentDV = ((SchemaValidator*) fValidator)->getCurrentDatatypeValidator(); - else - fPSVIElemContext.fCurrentDV = 0; - if(fPSVIHandler) - { - fPSVIElemContext.fNormalizedValue = ((SchemaValidator*) fValidator)->getNormalizedValue(); - - if (XMLString::equals(fPSVIElemContext.fNormalizedValue, XMLUni::fgZeroLenString)) - fPSVIElemContext.fNormalizedValue = 0; - } - } - else - { - fPSVIElemContext.fCurrentDV = 0; - fPSVIElemContext.fCurrentTypeInfo = 0; - fPSVIElemContext.fNormalizedValue = 0; - } - } - - // If validating, then insure that its legal to have no content - if (fValidate) - { - XMLSize_t failure; - bool res = fValidator->checkContent(elemDecl, 0, 0, &failure); - if (!res) - { - fValidator->emitError - ( - XMLValid::ElementNotValidForContent - , elemDecl->getFullName() - , elemDecl->getFormattedContentModel() - ); - } - - if (fGrammarType == Grammar::SchemaGrammarType) { - - if (((SchemaValidator*) fValidator)->getErrorOccurred()) - { - fPSVIElemContext.fErrorOccurred = true; - } - else - { - if (fPSVIHandler) - { - fPSVIElemContext.fIsSpecified = ((SchemaValidator*) fValidator)->getIsElemSpecified(); - if(fPSVIElemContext.fIsSpecified) - fPSVIElemContext.fNormalizedValue = ((SchemaElementDecl *)elemDecl)->getDefaultValue(); - } - // note that if we're empty, won't be a current DV - if (fPSVIElemContext.fCurrentDV && fPSVIElemContext.fCurrentDV->getType() == DatatypeValidator::Union) - psviMemberType = fValidationContext->getValidatingMemberType(); - } - - // call matchers and de-activate context - if 
(toCheckIdentityConstraint()) - { - fICHandler->deactivateContext - ( - (SchemaElementDecl *) elemDecl - , fContent.getRawBuffer() - , fValidationContext - , fPSVIElemContext.fCurrentDV - ); - } - - } - } - else if (fGrammarType == Grammar::SchemaGrammarType) { - ((SchemaValidator*)fValidator)->resetNillable(); - } - - if (fGrammarType == Grammar::SchemaGrammarType) - { - if (fPSVIHandler) - { - endElementPSVI((SchemaElementDecl*)elemDecl, psviMemberType); - } - } - - // If we have a doc handler, tell it about the end tag - if (fDocHandler) - { - fDocHandler->endElement - ( - *elemDecl - , uriId - , isRoot - , fPrefixBuf.getRawBuffer() - ); - } - - // If the elem stack is empty, then it was an empty root - if (isRoot) - gotData = false; - else - { - // Restore the grammar - fGrammar = fElemStack.getCurrentGrammar(); - fGrammarType = fGrammar->getGrammarType(); - if (fGrammarType == Grammar::SchemaGrammarType && !fValidator->handlesSchema()) { - if (fValidatorFromUser) - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoSchemaValidator, fMemoryManager); - else { - fValidator = fSchemaValidator; - } - } - else if (fGrammarType == Grammar::DTDGrammarType && !fValidator->handlesDTD()) { - if (fValidatorFromUser) - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); - else { - fValidator = fDTDValidator; - } - } - - fValidator->setGrammar(fGrammar); - - // Restore the validation flag - fValidate = fElemStack.getValidationFlag(); - } - } - else if (fGrammarType == Grammar::SchemaGrammarType) - { - // send a partial element psvi - if (fPSVIHandler) - { - - ComplexTypeInfo* curTypeInfo = 0; - DatatypeValidator* curDV = 0; - XSTypeDefinition* typeDef = 0; - - if (fValidate && elemDecl->isDeclared()) - { - curTypeInfo = ((SchemaValidator*) fValidator)->getCurrentTypeInfo(); - - if (curTypeInfo) - { - typeDef = (XSTypeDefinition*) fModel->getXSObject(curTypeInfo); - } - else - { - curDV = ((SchemaValidator*) 
fValidator)->getCurrentDatatypeValidator(); - - if (curDV) - { - typeDef = (XSTypeDefinition*) fModel->getXSObject(curDV); - } - } - } - - fPSVIElement->reset - ( - PSVIElement::VALIDITY_NOTKNOWN - , PSVIElement::VALIDATION_NONE - , fRootElemName - , ((SchemaValidator*) fValidator)->getIsElemSpecified() - , (elemDecl->isDeclared()) ? (XSElementDeclaration*) fModel->getXSObject(elemDecl) : 0 - , typeDef - , 0 //memberType - , fModel - , ((SchemaElementDecl*)elemDecl)->getDefaultValue() - , 0 - , 0 - , 0 - ); - - - fPSVIHandler->handlePartialElementPSVI - ( - elemDecl->getBaseName() - , fURIStringPool->getValueForId(elemDecl->getURI()) - , fPSVIElement - ); - - } - - // not empty - fErrorStack->push(fPSVIElemContext.fErrorOccurred); - } - - return true; -} - - -// --------------------------------------------------------------------------- -// IGXMLScanner: Helper methos -// --------------------------------------------------------------------------- -void IGXMLScanner::resizeElemState() { - - unsigned int newSize = fElemStateSize * 2; - unsigned int* newElemState = (unsigned int*) fMemoryManager->allocate - ( - newSize * sizeof(unsigned int) - ); //new unsigned int[newSize]; - unsigned int* newElemLoopState = (unsigned int*) fMemoryManager->allocate - ( - newSize * sizeof(unsigned int) - ); //new unsigned int[newSize]; - - // Copy the existing values - unsigned int index = 0; - for (; index < fElemStateSize; index++) - { - newElemState[index] = fElemState[index]; - newElemLoopState[index] = fElemLoopState[index]; - } - - for (; index < newSize; index++) - newElemLoopState[index] = newElemState[index] = 0; - - // Delete the old array and udpate our members - fMemoryManager->deallocate(fElemState); //delete [] fElemState; - fMemoryManager->deallocate(fElemLoopState); //delete [] fElemState; - fElemState = newElemState; - fElemLoopState = newElemLoopState; - fElemStateSize = newSize; -} - -void IGXMLScanner::resizeRawAttrColonList() { - - unsigned int newSize = 
fRawAttrColonListSize * 2; - int* newRawAttrColonList = (int*) fMemoryManager->allocate - ( - newSize * sizeof(int) - ); //new int[newSize]; - - // Copy the existing values - unsigned int index = 0; - for (; index < fRawAttrColonListSize; index++) - newRawAttrColonList[index] = fRawAttrColonList[index]; - - // Delete the old array and udpate our members - fMemoryManager->deallocate(fRawAttrColonList); //delete [] fRawAttrColonList; - fRawAttrColonList = newRawAttrColonList; - fRawAttrColonListSize = newSize; -} - -// --------------------------------------------------------------------------- -// IGXMLScanner: Grammar preparsing -// --------------------------------------------------------------------------- -Grammar* IGXMLScanner::loadGrammar(const InputSource& src - , const short grammarType - , const bool toCache) -{ - Grammar* loadedGrammar = 0; - - ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); - - try - { - fGrammarResolver->cacheGrammarFromParse(false); - // if the new grammar has to be cached, better use the already cached - // grammars, or the an exception will be thrown when caching an already - // cached grammar - fGrammarResolver->useCachedGrammarInParse(toCache); - fRootGrammar = 0; - - if (fValScheme == Val_Auto) { - fValidate = true; - } - - // Reset some status flags - fInException = false; - fStandalone = false; - fErrorCount = 0; - fHasNoDTD = true; - fSeeXsi = false; - - if (grammarType == Grammar::SchemaGrammarType) { - loadedGrammar = loadXMLSchemaGrammar(src, toCache); - } - else if (grammarType == Grammar::DTDGrammarType) { - loadedGrammar = loadDTDGrammar(src, toCache); - } - } - // NOTE: - // - // In all of the error processing below, the emitError() call MUST come - // before the flush of the reader mgr, or it will fail because it tries - // to find out the position in the XML source of the error. 
- catch(const XMLErrs::Codes) - { - // This is a 'first fatal error' type exit, so fall through - } - catch(const XMLValid::Codes) - { - // This is a 'first fatal error' type exit, so fall through - } - catch(const XMLException& excToCatch) - { - // Emit the error and catch any user exception thrown from here. Make - // sure in all cases we flush the reader manager. - fInException = true; - try - { - if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) - emitError - ( - XMLErrs::XMLException_Warning - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) - emitError - ( - XMLErrs::XMLException_Fatal - , excToCatch.getCode() - , excToCatch.getMessage() - ); - else - emitError - ( - XMLErrs::XMLException_Error - , excToCatch.getCode() - , excToCatch.getMessage() - ); - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. - resetReaderMgr.release(); - - throw; - } - } - catch(const OutOfMemoryException&) - { - // This is a special case for out-of-memory - // conditions, because resetting the ReaderMgr - // can be problematic. 
- resetReaderMgr.release(); - - throw; - } - - return loadedGrammar; -} - -void IGXMLScanner::resetCachedGrammar () -{ - fCachedSchemaInfoList->removeAll (); -} - -Grammar* IGXMLScanner::loadDTDGrammar(const InputSource& src, - const bool toCache) -{ - // Reset the validators - fDTDValidator->reset(); - if (fValidatorFromUser) - fValidator->reset(); - - if (!fValidator->handlesDTD()) { - if (fValidatorFromUser && fValidate) - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); - else { - fValidator = fDTDValidator; - } - } - - fDTDGrammar = (DTDGrammar*) fGrammarResolver->getGrammar(XMLUni::fgDTDEntityString); - - if (fDTDGrammar) { - fDTDGrammar->reset(); - } - else { - fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager); - fGrammarResolver->putGrammar(fDTDGrammar); - } - - fGrammar = fDTDGrammar; - fGrammarType = fGrammar->getGrammarType(); - fValidator->setGrammar(fGrammar); - - // And for all installed handlers, send reset events. This gives them - // a chance to flush any cached data. - if (fDocHandler) - fDocHandler->resetDocument(); - if (fEntityHandler) - fEntityHandler->resetEntities(); - if (fErrorReporter) - fErrorReporter->resetErrors(); - - // Clear out the id reference list - resetValidationContext(); - // and clear out the darned undeclared DTD element pool... - fDTDElemNonDeclPool->removeAll(); - - if (toCache) { - - unsigned int sysId = fGrammarResolver->getStringPool()->addOrFind(src.getSystemId()); - const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(sysId); - - fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); - ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); - fGrammarResolver->putGrammar(fGrammar); - } - - // Handle the creation of the XML reader object for this input source. - // This will provide us with transcoding and basic lexing services. 
- XMLReader* newReader = fReaderMgr.createReader - ( - src - , false - , XMLReader::RefFrom_NonLiteral - , XMLReader::Type_General - , XMLReader::Source_External - , fCalculateSrcOfs - , fLowWaterMark - ); - if (!newReader) { - if (src.getIssueFatalErrorIfNotFound()) - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); - else - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); - } - - // In order to make the processing work consistently, we have to - // make this look like an external entity. So create an entity - // decl and fill it in and push it with the reader, as happens - // with an external entity. Put a janitor on it to insure it gets - // cleaned up. The reader manager does not adopt them. - const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; - DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); - declDTD->setSystemId(src.getSystemId()); - declDTD->setIsExternal(true); - - // Mark this one as a throw at end - newReader->setThrowAtEnd(true); - - // And push it onto the stack, with its pseudo name - fReaderMgr.pushReader(newReader, declDTD); - - // If we have a doc type handler and advanced callbacks are enabled, - // call the doctype event. 
- if (fDocTypeHandler) { - - // Create a dummy root - DTDElementDecl* rootDecl = new (fGrammarPoolMemoryManager) DTDElementDecl - ( - gDTDStr - , fEmptyNamespaceId - , DTDElementDecl::Any - , fGrammarPoolMemoryManager - ); - rootDecl->setCreateReason(DTDElementDecl::AsRootElem); - rootDecl->setExternalElemDeclaration(true); - Janitor janSrc(rootDecl); - - fDocTypeHandler->doctypeDecl(*rootDecl, src.getPublicId(), src.getSystemId(), false, true); - } - - // Create DTDScanner - DTDScanner dtdScanner - ( - (DTDGrammar*) fGrammar - , fDocTypeHandler - , fGrammarPoolMemoryManager - , fMemoryManager - ); - dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); - - // Tell it its not in an include section - dtdScanner.scanExtSubsetDecl(false, true); - - if (fValidate) { - // validate the DTD scan so far - fValidator->preContentValidation(false, true); - } - - if (toCache) - fGrammarResolver->cacheGrammars(); - - return fDTDGrammar; -} - -// --------------------------------------------------------------------------- -// IGXMLScanner: Helper methods -// --------------------------------------------------------------------------- -void IGXMLScanner::processSchemaLocation(XMLCh* const schemaLoc) -{ - XMLCh* locStr = schemaLoc; - XMLReader* curReader = fReaderMgr.getCurrentReader(); - - fLocationPairs->removeAllElements(); - while (*locStr) - { - do { - // Do we have an escaped character ? - if (*locStr == 0xFFFF) - continue; - - if (!curReader->isWhitespace(*locStr)) - break; - - *locStr = chNull; - } while (*++locStr); - - if (*locStr) { - - fLocationPairs->addElement(locStr); - - while (*++locStr) { - // Do we have an escaped character ? 
- if (*locStr == 0xFFFF) - continue; - if (curReader->isWhitespace(*locStr)) - break; - } - } - } -} - -void IGXMLScanner::endElementPSVI(SchemaElementDecl* const elemDecl, - DatatypeValidator* const memberDV) -{ - PSVIElement::ASSESSMENT_TYPE validationAttempted; - PSVIElement::VALIDITY_STATE validity = PSVIElement::VALIDITY_NOTKNOWN; - - if (fPSVIElemContext.fElemDepth > fPSVIElemContext.fFullValidationDepth) - validationAttempted = PSVIElement::VALIDATION_FULL; - else if (fPSVIElemContext.fElemDepth > fPSVIElemContext.fNoneValidationDepth) - validationAttempted = PSVIElement::VALIDATION_NONE; - else - { - validationAttempted = PSVIElement::VALIDATION_PARTIAL; - fPSVIElemContext.fFullValidationDepth = - fPSVIElemContext.fNoneValidationDepth = fPSVIElemContext.fElemDepth - 1; - } - - if (fValidate && elemDecl->isDeclared()) - { - validity = (fPSVIElemContext.fErrorOccurred) - ? PSVIElement::VALIDITY_INVALID : PSVIElement::VALIDITY_VALID; - } - - XSTypeDefinition* typeDef = 0; - bool isMixed = false; - if (fPSVIElemContext.fCurrentTypeInfo) - { - typeDef = (XSTypeDefinition*) fModel->getXSObject(fPSVIElemContext.fCurrentTypeInfo); - SchemaElementDecl::ModelTypes modelType = (SchemaElementDecl::ModelTypes)fPSVIElemContext.fCurrentTypeInfo->getContentType(); - isMixed = (modelType == SchemaElementDecl::Mixed_Simple - || modelType == SchemaElementDecl::Mixed_Complex); - } - else if (fPSVIElemContext.fCurrentDV) - typeDef = (XSTypeDefinition*) fModel->getXSObject(fPSVIElemContext.fCurrentDV); - - XMLCh* canonicalValue = 0; - if (fPSVIElemContext.fNormalizedValue && !isMixed && - validity == PSVIElement::VALIDITY_VALID) - { - if (memberDV) - canonicalValue = (XMLCh*) memberDV->getCanonicalRepresentation(fPSVIElemContext.fNormalizedValue, fMemoryManager); - else if (fPSVIElemContext.fCurrentDV) - canonicalValue = (XMLCh*) fPSVIElemContext.fCurrentDV->getCanonicalRepresentation(fPSVIElemContext.fNormalizedValue, fMemoryManager); - } - - fPSVIElement->reset - ( - validity 
- , validationAttempted - , fRootElemName - , fPSVIElemContext.fIsSpecified - , (elemDecl->isDeclared()) - ? (XSElementDeclaration*) fModel->getXSObject(elemDecl) : 0 - , typeDef - , (memberDV) ? (XSSimpleTypeDefinition*) fModel->getXSObject(memberDV) : 0 - , fModel - , elemDecl->getDefaultValue() - , fPSVIElemContext.fNormalizedValue - , canonicalValue - ); - - fPSVIHandler->handleElementPSVI - ( - elemDecl->getBaseName() - , fURIStringPool->getValueForId(elemDecl->getURI()) - , fPSVIElement - ); - - // decrease element depth - fPSVIElemContext.fElemDepth--; - -} - -void IGXMLScanner::resetPSVIElemContext() -{ - fPSVIElemContext.fIsSpecified = false; - fPSVIElemContext.fErrorOccurred = false; - fPSVIElemContext.fElemDepth = -1; - fPSVIElemContext.fFullValidationDepth = -1; - fPSVIElemContext.fNoneValidationDepth = -1; - fPSVIElemContext.fCurrentDV = 0; - fPSVIElemContext.fCurrentTypeInfo = 0; - fPSVIElemContext.fNormalizedValue = 0; -} - -} From a81c2f936071987375e1daedf26e543f4ac09295 Mon Sep 17 00:00:00 2001 From: johnjamesmccann <98098904+johnjamesmccann@users.noreply.github.com> Date: Fri, 21 Jan 2022 13:57:43 +0000 Subject: [PATCH 4/5] Update DGXMLScanner.cpp --- src/xercesc/internal/DGXMLScanner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xercesc/internal/DGXMLScanner.cpp b/src/xercesc/internal/DGXMLScanner.cpp index 38e58f49a..61fbb91e6 100644 --- a/src/xercesc/internal/DGXMLScanner.cpp +++ b/src/xercesc/internal/DGXMLScanner.cpp @@ -19,6 +19,8 @@ * $Id$ */ +// SPDX-FileCopyrightText: Portions Copyright 2021 Siemens +// Modified on 15-Jul-2021 by Siemens and/or its affiliates to fix CVE-2018-1311: Apache Xerces-C use-after-free vulnerability scanning external DTD. Copyright 2021 Siemens. 
// --------------------------------------------------------------------------- // Includes @@ -1052,7 +1054,6 @@ void DGXMLScanner::scanDocTypeDecl() DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); declDTD->setSystemId(sysId); declDTD->setIsExternal(true); - Janitor janDecl(declDTD); // Mark this one as a throw at end reader->setThrowAtEnd(true); @@ -2131,7 +2132,6 @@ Grammar* DGXMLScanner::loadDTDGrammar(const InputSource& src, DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); declDTD->setSystemId(src.getSystemId()); declDTD->setIsExternal(true); - Janitor janDecl(declDTD); // Mark this one as a throw at end newReader->setThrowAtEnd(true); From ce6643acbce5108c39dfe2392a96d53e6b37fecd Mon Sep 17 00:00:00 2001 From: johnjamesmccann <98098904+johnjamesmccann@users.noreply.github.com> Date: Fri, 21 Jan 2022 13:59:07 +0000 Subject: [PATCH 5/5] Update IGXMLScanner.cpp --- src/xercesc/internal/IGXMLScanner.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/xercesc/internal/IGXMLScanner.cpp b/src/xercesc/internal/IGXMLScanner.cpp index 4417d44a0..eb99f2cdf 100644 --- a/src/xercesc/internal/IGXMLScanner.cpp +++ b/src/xercesc/internal/IGXMLScanner.cpp @@ -19,6 +19,9 @@ * $Id$ */ +// SPDX-FileCopyrightText: Portions Copyright 2021 Siemens +// Modified on 15-Jul-2021 by Siemens and/or its affiliates to fix CVE-2018-1311: Apache Xerces-C use-after-free vulnerability scanning external DTD. Copyright 2021 Siemens. 
+ // --------------------------------------------------------------------------- // Includes // --------------------------------------------------------------------------- @@ -1535,7 +1538,6 @@ void IGXMLScanner::scanDocTypeDecl() DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); declDTD->setSystemId(sysId); declDTD->setIsExternal(true); - Janitor janDecl(declDTD); // Mark this one as a throw at end reader->setThrowAtEnd(true); @@ -3098,7 +3100,6 @@ Grammar* IGXMLScanner::loadDTDGrammar(const InputSource& src, DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); declDTD->setSystemId(src.getSystemId()); declDTD->setIsExternal(true); - Janitor janDecl(declDTD); // Mark this one as a throw at end newReader->setThrowAtEnd(true);