Commit b45fdb34 authored by Florian Sattler's avatar Florian Sattler Committed by Commit Bot

[scanner] Adding AdvanceUntil to Utf16CharacterStream

AdvanceUntil allows the Utf16CharacterStream to advance until a charater is found
that passes the check.

Bug: v8:7926
Change-Id: Iae39fb24194aa0ee2f544a55a7847956aa324b64
Reviewed-on: https://chromium-review.googlesource.com/1151303
Commit-Queue: Florian Sattler <sattlerf@google.com>
Reviewed-by: 's avatarMarja Hölttä <marja@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54783}
parent b91a3d9e
......@@ -468,21 +468,16 @@ Token::Value Scanner::SkipSingleHTMLComment() {
}
Token::Value Scanner::SkipSingleLineComment() {
Advance();
// The line terminator at the end of the line is not considered
// to be part of the single-line comment; it is recognized
// separately by the lexical grammar and becomes part of the
// stream of input elements for the syntactic grammar (see
// ECMA-262, section 7.4).
while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
Advance();
}
AdvanceUntil([](uc32 c0_) { return unibrow::IsLineTerminator(c0_); });
return Token::WHITESPACE;
}
Token::Value Scanner::SkipSourceURLComment() {
TryToParseSourceURLComment();
while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
......
......@@ -7,6 +7,8 @@
#ifndef V8_PARSING_SCANNER_H_
#define V8_PARSING_SCANNER_H_
#include <algorithm>
#include "src/allocation.h"
#include "src/base/logging.h"
#include "src/char-predicates.h"
......@@ -55,6 +57,31 @@ class Utf16CharacterStream {
}
}
// Returns and advances past the next UTF-16 code unit in the input stream
// that meets the checks requirement. If there are no more code units it
// returns kEndOfInput.
template <typename FunctionType>
V8_INLINE uc32 AdvanceUntil(FunctionType check) {
while (true) {
auto next_cursor_pos =
std::find_if(buffer_cursor_, buffer_end_, [&check](uint16_t raw_c0_) {
uc32 c0_ = static_cast<uc32>(raw_c0_);
return check(c0_);
});
if (next_cursor_pos == buffer_end_) {
buffer_cursor_ = buffer_end_;
if (!ReadBlockChecked()) {
buffer_cursor_++;
return kEndOfInput;
}
} else {
buffer_cursor_ = next_cursor_pos + 1;
return static_cast<uc32>(*next_cursor_pos);
}
}
}
// Go back one by one character in the input stream.
// This undoes the most recent Advance().
inline void Back() {
......@@ -616,6 +643,11 @@ class Scanner {
c0_ = source_->Advance();
}
template <typename FunctionType>
V8_INLINE void AdvanceUntil(FunctionType check) {
c0_ = source_->AdvanceUntil(check);
}
bool CombineSurrogatePair() {
DCHECK(!unibrow::Utf16::IsLeadSurrogate(kEndOfInput));
if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
......
......@@ -185,6 +185,97 @@ TEST(Utf8SplitBOM) {
}
}
TEST(Utf8AdvanceUntil) {
// Test utf-8 advancing until a certain char.
const char line_term = '\n';
const size_t kLen = arraysize(unicode_utf8);
char data[kLen + 1];
strncpy(data, unicode_utf8, kLen);
data[kLen - 1] = line_term;
data[kLen] = '\0';
{
const char* chunks[] = {data, "\0"};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
int32_t res = stream->AdvanceUntil(
[](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
CHECK_EQ(line_term, res);
}
}
TEST(AdvanceMatchAdvanceUntil) {
// Test if single advance and advanceUntil behave the same
char data[] = {'a', 'b', '\n', 'c', '\0'};
{
const char* chunks[] = {data, "\0"};
ChunkSource chunk_source_a(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream_advance(
v8::internal::ScannerStream::For(
&chunk_source_a, v8::ScriptCompiler::StreamedSource::UTF8,
nullptr));
ChunkSource chunk_source_au(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream_advance_until(
v8::internal::ScannerStream::For(
&chunk_source_au, v8::ScriptCompiler::StreamedSource::UTF8,
nullptr));
int32_t au_c0_ = stream_advance_until->AdvanceUntil(
[](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
int32_t a_c0_ = '0';
while (!unibrow::IsLineTerminator(a_c0_)) {
a_c0_ = stream_advance->Advance();
}
// Check both advances methods have the same output
CHECK_EQ(a_c0_, au_c0_);
// Check if both set the cursor to the correct position by advancing both
// streams by one character.
a_c0_ = stream_advance->Advance();
au_c0_ = stream_advance_until->Advance();
CHECK_EQ(a_c0_, au_c0_);
}
}
TEST(Utf8AdvanceUntilOverChunkBoundaries) {
// Test utf-8 advancing until a certain char, crossing chunk boundaries.
// Split the test string at each byte and pass it to the stream. This way,
// we'll have a split at each possible boundary.
size_t len = strlen(unicode_utf8);
char buffer[arraysize(unicode_utf8) + 4];
for (size_t i = 1; i < len; i++) {
// Copy source string into buffer, splitting it at i.
// Then add three chunks, 0..i-1, i..strlen-1, empty.
strncpy(buffer, unicode_utf8, i);
strncpy(buffer + i + 1, unicode_utf8 + i, len - i);
buffer[i] = '\0';
buffer[len + 1] = '\n';
buffer[len + 2] = '\0';
buffer[len + 3] = '\0';
const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
ChunkSource chunk_source(chunks);
std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
v8::internal::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
int32_t res = stream->AdvanceUntil(
[](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
CHECK_EQ(buffer[len + 1], res);
}
}
TEST(Utf8ChunkBoundaries) {
// Test utf-8 parsing at chunk boundaries.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment