From 44b22460e5046eaa2452e16771b45c74b401e1ea Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 16 Mar 2023 16:44:17 +0000 Subject: [PATCH] Bug 1552008 - Track column number in the HTML. r=smaug,nchevobbe Differential Revision: https://phabricator.services.mozilla.com/D170579 --- parser/html/javasrc/Tokenizer.java | 11 ++++ parser/html/nsHtml5Speculation.cpp | 2 + parser/html/nsHtml5Speculation.h | 9 ++- parser/html/nsHtml5StreamParser.cpp | 11 ++-- parser/html/nsHtml5Tokenizer.cpp | 3 + parser/html/nsHtml5Tokenizer.h | 12 ---- parser/html/nsHtml5TokenizerHSupplement.h | 57 +++++++++++++++++++ .../tests/mochitest/test_bug672453.html | 2 +- 8 files changed, 89 insertions(+), 18 deletions(-) diff --git a/parser/html/javasrc/Tokenizer.java b/parser/html/javasrc/Tokenizer.java index a197757a7e5d..a8047c7a71b6 100644 --- a/parser/html/javasrc/Tokenizer.java +++ b/parser/html/javasrc/Tokenizer.java @@ -1390,6 +1390,9 @@ public class Tokenizer implements Locator, Locator2 { public void start() throws SAXException { initializeWithoutStarting(); tokenHandler.startTokenization(this); + // CPPONLY: line = 0; + // CPPONLY: col = 1; + // CPPONLY: nextCharOnNewLine = true; // [NOCPP[ startErrorReporting(); // ]NOCPP] @@ -6328,6 +6331,8 @@ public class Tokenizer implements Locator, Locator2 { appendStrBuf('\n'); } + // [NOCPP[ + @Inline protected void silentCarriageReturn() { ++line; lastCR = true; @@ -6337,6 +6342,8 @@ public class Tokenizer implements Locator, Locator2 { ++line; } + // ]NOCPP] + private void emitCarriageReturn(@NoLength char[] buf, int pos) throws SAXException { silentCarriageReturn(); @@ -7154,11 +7161,15 @@ public class Tokenizer implements Locator, Locator2 { return suspendAfterCurrentNonTextToken; } + // [NOCPP[ + @Inline protected char checkChar(@NoLength char[] buf, int pos) throws SAXException { return buf[pos]; } + // ]NOCPP] + public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { if (encodingDeclarationHandler 
!= null) { diff --git a/parser/html/nsHtml5Speculation.cpp b/parser/html/nsHtml5Speculation.cpp index 07796e77b80d..0cf7816ff6d4 100644 --- a/parser/html/nsHtml5Speculation.cpp +++ b/parser/html/nsHtml5Speculation.cpp @@ -8,10 +8,12 @@ using namespace mozilla; nsHtml5Speculation::nsHtml5Speculation(nsHtml5OwningUTF16Buffer* aBuffer, int32_t aStart, int32_t aStartLineNumber, + int32_t aStartColumnNumber, nsAHtml5TreeBuilderState* aSnapshot) : mBuffer(aBuffer), mStart(aStart), mStartLineNumber(aStartLineNumber), + mStartColumnNumber(aStartColumnNumber), mSnapshot(aSnapshot) { MOZ_COUNT_CTOR(nsHtml5Speculation); } diff --git a/parser/html/nsHtml5Speculation.h b/parser/html/nsHtml5Speculation.h index 9e1b2c31f586..9bd04a7c1923 100644 --- a/parser/html/nsHtml5Speculation.h +++ b/parser/html/nsHtml5Speculation.h @@ -16,7 +16,7 @@ class nsHtml5Speculation final : public nsAHtml5TreeOpSink { public: nsHtml5Speculation(nsHtml5OwningUTF16Buffer* aBuffer, int32_t aStart, - int32_t aStartLineNumber, + int32_t aStartLineNumber, int32_t aStartColumnNumber, nsAHtml5TreeBuilderState* aSnapshot); ~nsHtml5Speculation(); @@ -27,6 +27,8 @@ class nsHtml5Speculation final : public nsAHtml5TreeOpSink { int32_t GetStartLineNumber() { return mStartLineNumber; } + int32_t GetStartColumnNumber() { return mStartColumnNumber; } + nsAHtml5TreeBuilderState* GetSnapshot() { return mSnapshot.get(); } /** @@ -54,6 +56,11 @@ class nsHtml5Speculation final : public nsAHtml5TreeOpSink { */ int32_t mStartLineNumber; + /** + * The current column number at the start of the speculation. 
+ */ + int32_t mStartColumnNumber; + mozilla::UniquePtr mSnapshot; nsTArray mOpQueue; diff --git a/parser/html/nsHtml5StreamParser.cpp b/parser/html/nsHtml5StreamParser.cpp index 94ef10b35b99..6af0b8d4d732 100644 --- a/parser/html/nsHtml5StreamParser.cpp +++ b/parser/html/nsHtml5StreamParser.cpp @@ -791,7 +791,7 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(Span aFromSegment, mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex); nsHtml5Speculation* speculation = new nsHtml5Speculation( mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(), - mTreeBuilder->newSnapshot()); + mTokenizer->getColumnNumber(), mTreeBuilder->newSnapshot()); MOZ_ASSERT(!mFlushTimerArmed, "How did we end up arming the timer?"); if (mMode == VIEW_SOURCE_HTML) { mTokenizer->SetViewSourceOpSink(speculation); @@ -1999,7 +1999,7 @@ void nsHtml5StreamParser::DiscardMetaSpeculation() { nsHtml5Speculation* speculation = new nsHtml5Speculation( mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(), - mTreeBuilder->newSnapshot()); + mTokenizer->getColumnNumber(), mTreeBuilder->newSnapshot()); MOZ_ASSERT(!mFlushTimerArmed, "How did we end up arming the timer?"); if (mMode == VIEW_SOURCE_HTML) { mTokenizer->SetViewSourceOpSink(speculation); @@ -2486,7 +2486,7 @@ void nsHtml5StreamParser::ParseAvailableData() { mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex); nsHtml5Speculation* speculation = new nsHtml5Speculation( mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(), - mTreeBuilder->newSnapshot()); + mTokenizer->getColumnNumber(), mTreeBuilder->newSnapshot()); mTreeBuilder->AddSnapshotToScript(speculation->GetSnapshot(), speculation->GetStartLineNumber()); if (mLookingForMetaCharset) { @@ -2649,12 +2649,15 @@ void nsHtml5StreamParser::ContinueAfterScriptsOrEncodingCommitment( mFirstBuffer = speculation->GetBuffer(); mFirstBuffer->setStart(speculation->GetStart()); mTokenizer->setLineNumber(speculation->GetStartLineNumber()); + 
mTokenizer->setColumnNumberAndResetNextLine( + speculation->GetStartColumnNumber()); nsContentUtils::ReportToConsole( nsIScriptError::warningFlag, "DOM Events"_ns, mExecutor->GetDocument(), nsContentUtils::eDOM_PROPERTIES, "SpeculationFailed2", nsTArray(), nullptr, u""_ns, - speculation->GetStartLineNumber()); + speculation->GetStartLineNumber(), + speculation->GetStartColumnNumber()); nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer->next; while (buffer) { diff --git a/parser/html/nsHtml5Tokenizer.cpp b/parser/html/nsHtml5Tokenizer.cpp index 050a8164638d..b5de5a926663 100644 --- a/parser/html/nsHtml5Tokenizer.cpp +++ b/parser/html/nsHtml5Tokenizer.cpp @@ -400,6 +400,9 @@ void nsHtml5Tokenizer::addAttributeWithValue() { void nsHtml5Tokenizer::start() { initializeWithoutStarting(); tokenHandler->startTokenization(this); + line = 0; + col = 1; + nextCharOnNewLine = true; } bool nsHtml5Tokenizer::tokenizeBuffer(nsHtml5UTF16Buffer* buffer) { diff --git a/parser/html/nsHtml5Tokenizer.h b/parser/html/nsHtml5Tokenizer.h index d639f780c50f..7397bb8c0392 100644 --- a/parser/html/nsHtml5Tokenizer.h +++ b/parser/html/nsHtml5Tokenizer.h @@ -429,15 +429,6 @@ class nsHtml5Tokenizer { appendStrBuf('\n'); } - protected: - inline void silentCarriageReturn() { - ++line; - lastCR = true; - } - - inline void silentLineFeed() { ++line; } - - private: void emitCarriageReturn(char16_t* buf, int32_t pos); void emitReplacementCharacter(char16_t* buf, int32_t pos); void maybeEmitReplacementCharacter(char16_t* buf, int32_t pos); @@ -456,9 +447,6 @@ class nsHtml5Tokenizer { void suspendAfterCurrentTokenIfNotInText(); bool suspensionAfterCurrentNonTextTokenPending(); - protected: - inline char16_t checkChar(char16_t* buf, int32_t pos) { return buf[pos]; } - public: bool internalEncodingDeclaration(nsHtml5String internalCharset); diff --git a/parser/html/nsHtml5TokenizerHSupplement.h b/parser/html/nsHtml5TokenizerHSupplement.h index 5fd2451de8f0..0e1930b93e6d 100644 --- 
a/parser/html/nsHtml5TokenizerHSupplement.h +++ b/parser/html/nsHtml5TokenizerHSupplement.h @@ -2,6 +2,63 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +private: +inline void silentCarriageReturn() { + nextCharOnNewLine = true; + lastCR = true; +} + +inline void silentLineFeed() { nextCharOnNewLine = true; } + +inline char16_t checkChar(char16_t* buf, int32_t pos) { + // The name of this method comes from the validator. + // We aren't checking a char here. We read the next + // UTF-16 code unit and, before returning it, adjust + // the line and column numbers. + char16_t c = buf[pos]; + if (MOZ_UNLIKELY(nextCharOnNewLine)) { + // Changing the line and column here instead + // of doing so eagerly when seeing the line break + // causes the line break itself to be considered + // column-wise at the end of a line. + line++; + col = 1; + nextCharOnNewLine = false; + } else if (MOZ_LIKELY(!NS_IS_LOW_SURROGATE(c))) { + // SpiderMonkey wants to count scalar values + // instead of UTF-16 code units. We omit low + // surrogates from the count so that only the + // high surrogate increments the count for + // two-code-unit scalar values. + // + // It's somewhat questionable from the performance + // perspective to make the human-perceivable column + // count correct for non-BMP characters in the case + // where there is a single scalar value per extended + // grapheme cluster when even on the BMP there are + // various cases where the scalar count doesn't make + // much sense as a human-perceived "column count" due + // to extended grapheme clusters consisting of more + // than one scalar value. 
+ col++; + } + return c; +} + +int32_t col; +bool nextCharOnNewLine; + +public: +inline int32_t getColumnNumber() { return col; } + +inline void setColumnNumberAndResetNextLine(int32_t aCol) { + col = aCol; + // The restored position only ever points to the position of + // a script tag's > character, so we can unconditionally use + // `false` below. + nextCharOnNewLine = false; +} + inline nsHtml5HtmlAttributes* GetAttributes() { return attributes; } /** diff --git a/parser/htmlparser/tests/mochitest/test_bug672453.html b/parser/htmlparser/tests/mochitest/test_bug672453.html index 1641d3660126..312757cb58a8 100644 --- a/parser/htmlparser/tests/mochitest/test_bug672453.html +++ b/parser/htmlparser/tests/mochitest/test_bug672453.html @@ -104,7 +104,7 @@ var expectedErrors = [ isWarning: true }, { errorMessage: "The start of the document was reparsed, because there were non-ASCII characters in the part of the document that was unsuccessfully searched for a meta tag before falling back to the XML declaration syntax. A meta tag at the start of the head part should be used instead of the XML declaration syntax.", sourceName: "http://mochi.test:8888/tests/parser/htmlparser/tests/mochitest/file_bug672453_xml_speculation_fail.html", - lineNumber: 11, + lineNumber: 10, isWarning: true }, ];