fune/parser/htmlparser/nsExpatDriver.h
Peter Van der Beken 4641d9bef3 Bug 1751796 - XML parsererror eats two first letters. r=bholley
We were calling XML_GetCurrentColumnNumber after ParseBuffer caused Expat
to consume some data. XML_GetCurrentColumnNumber uses the buffer that was
last passed to Expat. Before Expat was put in an RLBox sandbox the caller
of ParseBuffer would keep the data in the scanner string until after the
call to XML_GetCurrentColumnNumber. Now that we copy the data into the
RLBox sandbox the data is freed when the TransferBuffer in ParseBuffer
goes out of scope, so in the caller of ParseBuffer the call to
XML_GetCurrentColumnNumber would cause us to read freed memory inside the
sandbox. Moving the call to XML_GetCurrentColumnNumber to inside
ParseBuffer, when TransferBuffer is still in scope, solves the issue.

Differential Revision: https://phabricator.services.mozilla.com/D141795
2022-04-05 14:10:11 +00:00

255 lines
11 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef NS_EXPAT_DRIVER__
#define NS_EXPAT_DRIVER__
#include "expat_config.h"
#include "expat.h"
#include "nsCOMPtr.h"
#include "nsString.h"
#include "nsIDTD.h"
#include "nsIInputStream.h"
#include "nsIParser.h"
#include "nsCycleCollectionParticipant.h"
#include "rlbox_expat.h"
#include "nsRLBoxExpatDriver.h"
#include "mozilla/UniquePtr.h"
// Forward declarations of types that this header names without pulling in
// their full definitions.
struct nsCatalogData;
class nsIExpatSink;
class RLBoxExpatSandboxData;

namespace mozilla {
// Class template taking an element type and a compile-time length.
template <typename T, size_t Length>
class Array;
}  // namespace mozilla
/**
 * nsIDTD implementation that hands UTF-16 XML data to an Expat parser and
 * declares the handlers for Expat's callbacks. The parser is held as a tainted
 * RLBox handle (mExpatParser) and the sandbox it lives in comes from
 * mSandboxPoolData.
 */
class nsExpatDriver : public nsIDTD {
  // Declared in the class's default (private) section.
  // NOTE(review): presumably so that instances are only destroyed through the
  // reference counting declared below -- confirm.
  virtual ~nsExpatDriver();

 public:
  NS_DECL_CYCLE_COLLECTING_ISUPPORTS_FINAL
  NS_DECL_NSIDTD
  NS_DECL_CYCLE_COLLECTION_CLASS(nsExpatDriver)

  nsExpatDriver();

  nsresult Initialize(nsIURI* aURI, nsIContentSink* aSink);
  nsresult ResumeParse(nsScanner& aScanner, bool aIsFinalChunk);

  // Handler for Expat's external entity references. aBase is the base that was
  // handed to Expat (see GetExpatBaseURI/GetBaseURI below), not a real URI
  // string.
  int HandleExternalEntityRef(const char16_t* aOpenEntityNames,
                              const char16_t* aBase, const char16_t* aSystemId,
                              const char16_t* aPublicId);

  // Start/end element callbacks. These are static and receive the sandbox plus
  // tainted (sandbox-originated) arguments; RLBoxExpatSandboxData registers
  // them per driver. The *ForSystemPrincipal variants are separate entry
  // points with the same signature.
  static void HandleStartElement(rlbox_sandbox_expat& aSandbox,
                                 tainted_expat<void*> aUserData,
                                 tainted_expat<const char16_t*> aName,
                                 tainted_expat<const char16_t**> aAtts);
  static void HandleStartElementForSystemPrincipal(
      rlbox_sandbox_expat& aSandbox, tainted_expat<void*> aUserData,
      tainted_expat<const char16_t*> aName,
      tainted_expat<const char16_t**> aAtts);
  static void HandleEndElement(rlbox_sandbox_expat& aSandbox,
                               tainted_expat<void*> aUserData,
                               tainted_expat<const char16_t*> aName);
  static void HandleEndElementForSystemPrincipal(
      rlbox_sandbox_expat& aSandbox, tainted_expat<void*> aUserData,
      tainted_expat<const char16_t*> aName);

  // Remaining Expat handlers; these operate on already-unwrapped UTF-16 data
  // and report their outcome as an nsresult.
  nsresult HandleCharacterData(const char16_t* aCData, const uint32_t aLength);
  nsresult HandleComment(const char16_t* aName);
  nsresult HandleProcessingInstruction(const char16_t* aTarget,
                                       const char16_t* aData);
  nsresult HandleXMLDeclaration(const char16_t* aVersion,
                                const char16_t* aEncoding, int32_t aStandalone);
  nsresult HandleDefault(const char16_t* aData, const uint32_t aLength);
  nsresult HandleStartCdataSection();
  nsresult HandleEndCdataSection();
  nsresult HandleStartDoctypeDecl(const char16_t* aDoctypeName,
                                  const char16_t* aSysid,
                                  const char16_t* aPubid,
                                  bool aHasInternalSubset);
  nsresult HandleEndDoctypeDecl();

 private:
  // Load up an external stream to get external entity information
  nsresult OpenInputStreamFromExternalDTD(const char16_t* aFPIStr,
                                          const char16_t* aURLStr,
                                          nsIURI* aBaseURI,
                                          nsIInputStream** aStream,
                                          nsIURI** aAbsURI);

  // Finality of the data handed to ParseChunk (see its aIsFinal parameter).
  enum class ChunkOrBufferIsFinal {
    // Not the last chunk in a row passed to ParseChunk.
    None,
    // The last chunk in a row passed to ParseChunk.
    FinalChunk,
    // The last chunk and the last buffer: there will be no more calls to
    // ParseChunk for the document being parsed.
    FinalChunkAndBuffer,
  };

  /**
   * Pass a buffer to Expat. If Expat is blocked aBuffer should be null and
   * aLength should be 0. The result of the call will be stored in
   * mInternalState. Expat will parse as much of the buffer as it can and store
   * the rest in its internal buffer.
   *
   * @param aBuffer the buffer to pass to Expat. May be null.
   * @param aLength the length of the buffer to pass to Expat (in number of
   *                char16_t's). Must be 0 if aBuffer is null and > 0 if
   *                aBuffer is not null.
   * @param aIsFinal whether this is the last chunk in a row passed to
   *                 ParseChunk, and if so whether it's the last chunk and
   *                 buffer passed to ParseChunk (meaning there will be no more
   *                 calls to ParseChunk for the document being parsed).
   * @param aConsumed [out] the number of char16_t's that Expat consumed. This
   *                        doesn't include the char16_t's that Expat stored in
   *                        its buffer but didn't parse yet.
   * @param aLastLineLength [out] the length of the last line that Expat has
   *                              consumed. This will only be computed if
   *                              aIsFinal is not None or mInternalState is set
   *                              to a failure.
   */
  void ParseChunk(const char16_t* aBuffer, uint32_t aLength,
                  ChunkOrBufferIsFinal aIsFinal, uint32_t* aConsumed,
                  XML_Size* aLastLineLength);

  /**
   * Wrapper for ParseChunk. If the buffer is too large to be copied into the
   * sandbox all at once, splits it into chunks and invokes ParseChunk in a
   * loop.
   *
   * @param aBuffer the buffer to pass to Expat. May be null.
   * @param aLength the length of the buffer to pass to Expat (in number of
   *                char16_t's). Must be 0 if aBuffer is null and > 0 if
   *                aBuffer is not null.
   * @param aIsFinal whether there will definitely not be any more new buffers
   *                 passed in for parsing.
   * @param aPassedToExpat [out] presumably the number of char16_t's of aBuffer
   *                             that were handed to Expat -- TODO confirm
   *                             against the implementation.
   * @param aConsumed [out] the number of char16_t's that Expat consumed. This
   *                        doesn't include the char16_t's that Expat stored in
   *                        its buffer but didn't parse yet.
   * @param aLastLineLength [out] the length of the last line that Expat has
   *                              consumed.
   */
  void ChunkAndParseBuffer(const char16_t* aBuffer, uint32_t aLength,
                           bool aIsFinal, uint32_t* aPassedToExpat,
                           uint32_t* aConsumed, XML_Size* aLastLineLength);

  nsresult HandleError();

  void MaybeStopParser(nsresult aState);

  // True when mInternalState is NS_ERROR_HTMLPARSER_BLOCK or
  // NS_ERROR_HTMLPARSER_INTERRUPTED. Only reads state, hence const.
  bool BlockedOrInterrupted() const {
    return mInternalState == NS_ERROR_HTMLPARSER_BLOCK ||
           mInternalState == NS_ERROR_HTMLPARSER_INTERRUPTED;
  }

  // Expat allows us to set the base URI for entities. It doesn't use the base
  // URI itself, but just passes it along to all the entity handlers (just the
  // external entity reference handler for us). It does expect the base URI as a
  // null-terminated string, with the same character type as the parsed buffers
  // (char16_t in our case). Because nsIURI stores a UTF-8 string we have to do
  // a conversion to UTF-16 for Expat. We also RLBox the Expat parser, so we
  // also do 2 copies (into RLBox sandbox, and Expat does a copy into its pool).
  // Most of the time this base URI is unused (the external entity handler is
  // rarely called), but when it is we also convert it back to a nsIURI, so we
  // convert the string back to UTF-8.
  //
  // We'd rather not do any of these conversions and copies, so we use a (hacky)
  // workaround. We store all base URIs in an array of nsIURIs. Instead of
  // passing the real URI to Expat as a string, we pass it a null-terminated
  // 2-character buffer. The first character of that buffer stores the index of
  // the corresponding nsIURI in the array (incremented with 1 because 0 is used
  // to terminate a string). The entity handler can then use the index from the
  // base URI that Expat passes it to look up the right nsIURI from the array.
  //
  // GetExpatBaseURI pushes the nsIURI to the array, and creates the
  // two-character buffer for it.
  //
  // GetBaseURI looks up the right nsIURI in the array, based on the index from
  // the two-character buffer.
  using ExpatBaseURI = mozilla::Array<XML_Char, 2>;
  ExpatBaseURI GetExpatBaseURI(nsIURI* aURI);
  nsIURI* GetBaseURI(const XML_Char* aBase) const;

  // Accessors for the sandbox data and the sandbox itself.
  RLBoxExpatSandboxData* SandboxData() const;
  rlbox_sandbox_expat* Sandbox() const;

  // Destroy expat parser and return sandbox to pool
  void Destroy();

  // NOTE: the order of the data members below is significant (it fixes the
  // initialization and destruction order); do not reorder.
  mozilla::UniquePtr<mozilla::RLBoxSandboxPoolData> mSandboxPoolData;
  tainted_expat<XML_Parser> mExpatParser;

  nsString mLastLine;
  nsString mCDataText;
  // Various parts of a doctype
  nsString mDoctypeName;
  nsString mSystemID;
  nsString mPublicID;
  nsString mInternalSubset;
  bool mInCData;
  bool mInInternalSubset;
  bool mInExternalDTD;
  bool mMadeFinalCallToExpat;

  // Used to track if we're in the parser.
  bool mInParser;

  // Result of the most recent parsing step (see ParseChunk); also what
  // BlockedOrInterrupted inspects.
  nsresult mInternalState;

  // The length of the data in Expat's buffer (in number of char16_t's).
  uint32_t mExpatBuffered;

  uint16_t mTagDepth;

  // These sinks all refer the same conceptual object. mOriginalSink is
  // identical with the nsIContentSink* passed to WillBuildModel, and exists
  // only to avoid QI-ing back to nsIContentSink*.
  nsCOMPtr<nsIContentSink> mOriginalSink;
  nsCOMPtr<nsIExpatSink> mSink;

  const nsCatalogData* mCatalogData;  // weak
  // Base URIs handed out through GetExpatBaseURI (see the comment above it).
  nsTArray<nsCOMPtr<nsIURI>> mURIs;

  // Used for error reporting.
  uint64_t mInnerWindowID;
};
/**
 * Per-sandbox data for the RLBox'ed Expat parser: owns the sandbox and the
 * callbacks registered with it. nsExpatDriver and RLBoxExpatSandboxPool are
 * friends and can reach the private members directly.
 */
class RLBoxExpatSandboxData : public mozilla::RLBoxSandboxDataBase {
  friend class RLBoxExpatSandboxPool;
  friend class nsExpatDriver;

 public:
  // aSize is forwarded unchanged to the RLBoxSandboxDataBase constructor.
  explicit RLBoxExpatSandboxData(uint64_t aSize)
      : mozilla::RLBoxSandboxDataBase(aSize) {
    MOZ_COUNT_CTOR(RLBoxExpatSandboxData);
  }
  ~RLBoxExpatSandboxData();

  // Non-owning pointer to the sandbox held by mSandbox (null if unset).
  rlbox_sandbox_expat* Sandbox() const { return mSandbox.get(); }

  // After getting a sandbox from the pool we need to register the
  // Handle{Start,End}Element callbacks and associate the driver with the
  // sandbox.
  // NOTE(review): aIsSystemPrincipal presumably selects the
  // *ForSystemPrincipal handler variants of nsExpatDriver -- confirm against
  // the implementation.
  void AttachDriver(bool aIsSystemPrincipal, void* aDriver);
  void DetachDriver();

 private:
  // NOTE: member order is significant. mSandbox is declared first, so it is
  // constructed before and destroyed after the callback members below.
  mozilla::UniquePtr<rlbox_sandbox_expat> mSandbox;

  // Common expat callbacks that persist across calls to {Attach,Detach}Driver,
  // and consequently across sandbox reuses.
  sandbox_callback_expat<XML_XmlDeclHandler> mHandleXMLDeclaration;
  sandbox_callback_expat<XML_CharacterDataHandler> mHandleCharacterData;
  sandbox_callback_expat<XML_ProcessingInstructionHandler>
      mHandleProcessingInstruction;
  sandbox_callback_expat<XML_DefaultHandler> mHandleDefault;
  sandbox_callback_expat<XML_ExternalEntityRefHandler> mHandleExternalEntityRef;
  sandbox_callback_expat<XML_CommentHandler> mHandleComment;
  sandbox_callback_expat<XML_StartCdataSectionHandler> mHandleStartCdataSection;
  sandbox_callback_expat<XML_EndCdataSectionHandler> mHandleEndCdataSection;
  sandbox_callback_expat<XML_StartDoctypeDeclHandler> mHandleStartDoctypeDecl;
  sandbox_callback_expat<XML_EndDoctypeDeclHandler> mHandleEndDoctypeDecl;

  // Expat callbacks specific to each driver, and thus (re)set across sandbox
  // reuses.
  sandbox_callback_expat<XML_StartElementHandler> mHandleStartElement;
  sandbox_callback_expat<XML_EndElementHandler> mHandleEndElement;
};
#endif